## Using spaCy (object-oriented API)

In [43]:
import spacy

In [44]:
# Load spaCy's 'en_core_web_sm' English pipeline and run it on a sample text.
# The sample contains a stray "Mr." abbreviation directly after the first sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp("The sun slowly descended behind the mountains. Mr. casting a warm golden glow across the tranquil valley.")

In [45]:
# doc.sents to get list of sentences
# Walk the sentence spans exposed by doc.sents and print each on its own line.
for sent_span in doc.sents:
    print(sent_span)

The sun slowly descended behind the mountains.
Mr. casting a warm golden glow across the tranquil valley.


In [46]:
# Print every token, visiting the sentences in order and the tokens
# within each sentence in order (flattened into a single loop).
for token in (tok for sent_span in doc.sents for tok in sent_span):
    print(token)

The
sun
slowly
descended
behind
the
mountains
.
Mr.
casting
a
warm
golden
glow
across
the
tranquil
valley
.


## Using NLTK String Processing Library

In [47]:
import nltk
nltk.download('punkt') # download the Punkt tokenizer data package

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [48]:
# Import NLTK's sentence-level and word-level tokenizer functions
from nltk.tokenize import sent_tokenize,word_tokenize

In [49]:
# Split a sample into sentences. As the output below shows, no split is made
# after the abbreviation "Mr.", while "casting." does end a sentence.
sent_tokenize("The sun slowly descended behind the mountains. Mr. casting. a warm golden glow across the tranquil valley.")

['The sun slowly descended behind the mountains.',
 'Mr. casting.',
 'a warm golden glow across the tranquil valley.']

In [50]:
# Split a sentence into word tokens; the punctuation marks ',' and '.'
# come back as separate tokens (see the output below).
word_tokenize("The sun slowly descended behind the mountains, casting a warm golden glow across the tranquil valley.")

['The',
 'sun',
 'slowly',
 'descended',
 'behind',
 'the',
 'mountains',
 ',',
 'casting',
 'a',
 'warm',
 'golden',
 'glow',
 'across',
 'the',
 'tranquil',
 'valley',
 '.']

In [51]:
# Start from a blank English pipeline and tokenize a sample text that
# ends with a number word, a digit and a currency symbol.
nlp = spacy.blank("en")
doc = nlp("The sun slowly descended behind the mountains, casting a warm golden glow across the tranquil valley. two 2$")

# Emit one token per line.
print(*doc, sep="\n")

The
sun
slowly
descended
behind
the
mountains
,
casting
a
warm
golden
glow
across
the
tranquil
valley
.
two
2
$


In [52]:
# Slice the doc by token position: tokens at indices 1 through 4 (end index exclusive).
span = doc[1:5]
span

sun slowly descended behind

In [53]:
# Take the third token from the end of the doc and check its `like_num`
# attribute, i.e. whether the token looks like a number.
token1 = doc[-3]
token1.like_num

True

In [54]:
# Read the whole file in a single call. The explicit encoding keeps the result
# independent of the platform's default locale encoding, and reading the text
# as-is avoids inserting an extra space after every line break.
with open("./Txts/email.txt", "r", encoding="utf-8") as file:
    data = file.read()

# Tokenize the text and keep every token whose `like_email` attribute is set.
# The list holds Token objects (not strings), so printing it shows their text.
doc = nlp(data)
email = [token for token in doc if token.like_email]
print(email)

[abcxyz@gmail.com]


In [55]:
# Tokenize a sample sentence that starts with the informal word "gimme".
doc = nlp("gimme double cheese extra large healthy pizza")

In [57]:
# Custom tokenization rule: split the single token "gimme" into "gim" + "me".
from spacy.symbols import ORTH

nlp.tokenizer.add_special_case("gimme", [
    {ORTH: "gim"},
    {ORTH: "me"},
])

# A Doc that was tokenized before the rule was registered is not changed by it,
# so re-tokenize the same text first and only then collect the token texts.
doc = nlp(doc.text)
tokens = [token.text for token in doc]
tokens

In [60]:
# Add a 'sentencizer' component to the blank pipeline created with spacy.blank("en").
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x28113d98190>

In [61]:
# List the names of the components currently registered in the pipeline.
nlp.pipe_names

['sentencizer']