# Text processing with NLTK and spaCy

    pip install nltk
    pip install spacy
    python -m spacy download en_core_web_sm

In [1]:
import nltk

In [3]:
# Fetch the NLTK resources the cells below rely on. Packages that are already
# installed are only reported as up-to-date (see the log output).
nltk.download("punkt")  # tokenizer models for sent_tokenize / word_tokenize
nltk.download("wordnet")  # lexical database behind WordNetLemmatizer
nltk.download("tagsets")  # tag documentation shown by nltk.help.upenn_tagset
nltk.download("averaged_perceptron_tagger")  # model used by nltk.pos_tag

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

    - Tokenization
    - Lemmatization
    - PoS Tagging
    - Spelling correction
    - NER

### Tokenization

In [4]:
# Sample paragraph (four sentences) reused by the NLTK tokenization cells below.
data="Hyderabad is the capital of southern India's Telangana state. A major center for the technology industry, it's home to many upscale restaurants and shops. Its historic sites include Golconda Fort, a former diamond-trading center that was once the Qutb Shahi dynastic capital. The Charminar, a 16th-century mosque whose 4 arches support towering minarets, is an old city landmark near the long-standing Laad Bazaar."
print(data)

Hyderabad is the capital of southern India's Telangana state. A major center for the technology industry, it's home to many upscale restaurants and shops. Its historic sites include Golconda Fort, a former diamond-trading center that was once the Qutb Shahi dynastic capital. The Charminar, a 16th-century mosque whose 4 arches support towering minarets, is an old city landmark near the long-standing Laad Bazaar.


In [5]:
# Sentence tokenization: split the paragraph into a list of sentence strings.
nltk.sent_tokenize(data)

["Hyderabad is the capital of southern India's Telangana state.",
 "A major center for the technology industry, it's home to many upscale restaurants and shops.",
 'Its historic sites include Golconda Fort, a former diamond-trading center that was once the Qutb Shahi dynastic capital.',
 'The Charminar, a 16th-century mosque whose 4 arches support towering minarets, is an old city landmark near the long-standing Laad Bazaar.']

In [6]:
# Word tokenization: punctuation and the clitic 's come out as separate tokens,
# while hyphenated words such as 'diamond-trading' stay whole (see output).
nltk.word_tokenize(data)

['Hyderabad',
 'is',
 'the',
 'capital',
 'of',
 'southern',
 'India',
 "'s",
 'Telangana',
 'state',
 '.',
 'A',
 'major',
 'center',
 'for',
 'the',
 'technology',
 'industry',
 ',',
 'it',
 "'s",
 'home',
 'to',
 'many',
 'upscale',
 'restaurants',
 'and',
 'shops',
 '.',
 'Its',
 'historic',
 'sites',
 'include',
 'Golconda',
 'Fort',
 ',',
 'a',
 'former',
 'diamond-trading',
 'center',
 'that',
 'was',
 'once',
 'the',
 'Qutb',
 'Shahi',
 'dynastic',
 'capital',
 '.',
 'The',
 'Charminar',
 ',',
 'a',
 '16th-century',
 'mosque',
 'whose',
 '4',
 'arches',
 'support',
 'towering',
 'minarets',
 ',',
 'is',
 'an',
 'old',
 'city',
 'landmark',
 'near',
 'the',
 'long-standing',
 'Laad',
 'Bazaar',
 '.']

### Lemmatization

In [7]:
from nltk.stem import WordNetLemmatizer
# One lemmatizer instance, reused by the lemmatization cells that follow.
wd = WordNetLemmatizer()
# No part of speech is passed here, so the lemmatizer's default is used
# (noun, per the NLTK documentation).
wd.lemmatize("boxes")

'box'

In [8]:
# Irregular plural: resolved through WordNet rather than by stripping a suffix.
wd.lemmatize("children")

'child'

In [9]:
# Another irregular plural ('wives' -> 'wife').
wd.lemmatize("wives")

'wife'

In [10]:
# Pass the part of speech explicitly: 'a' tells the lemmatizer to treat the
# word as an adjective, so the comparative reduces to its base form.
wd.lemmatize("happier",'a')# a = adjective

'happy'

In [11]:
# Same idea for verbs: with pos 'v' the irregular past tense maps to 'go'.
wd.lemmatize('went','v') # v = verb

'go'

### PoS Tagging

In [12]:
# NOTE: `doc` is a plain str here; the spaCy cells further down rebind the same
# name to a spaCy Doc object.
doc = "John lost his watch in Mumbai while travelling to Delhi via Indigo flight."
# Tokenize first, then tag: the result is a list of (token, tag) pairs.
nltk.pos_tag(nltk.word_tokenize(doc))

[('John', 'NNP'),
 ('lost', 'VBD'),
 ('his', 'PRP$'),
 ('watch', 'NN'),
 ('in', 'IN'),
 ('Mumbai', 'NNP'),
 ('while', 'IN'),
 ('travelling', 'VBG'),
 ('to', 'TO'),
 ('Delhi', 'NNP'),
 ('via', 'IN'),
 ('Indigo', 'NNP'),
 ('flight', 'NN'),
 ('.', '.')]

In [13]:
# Look up the meaning of a tag from the output above (VBG was assigned to
# 'travelling'); the description and examples are printed.
nltk.help.upenn_tagset("VBG")

VBG: verb, present participle or gerund
    telegraphing stirring focusing angering judging stalling lactating
    hankerin' alleging veering capping approaching traveling besieging
    encrypting interrupting erasing wincing ...


### Spelling Correction

In [16]:
import numpy as np

In [14]:
# Jaccard distance between character sets: 1 - |A ∩ B| / |A ∪ B|.
# {k,i,n,g} and {k,o,n,g} share 3 of 5 distinct letters -> 1 - 3/5 = 0.4.
nltk.jaccard_distance(set("king"),set("kong"))

0.4

In [15]:
# Only 'n' is shared among 8 distinct letters -> 1 - 1/8 = 0.875 (very dissimilar).
nltk.jaccard_distance(set("king"),set("anshu"))

0.875

In [17]:
# Toy vocabulary that misspelled words are matched against by default.
dic = ["orange",'apple','mango','grapes','banana']


def recommend(w, candidates=None):
    """Return the vocabulary word closest to ``w``.

    Closeness is the Jaccard distance between the *sets of characters* of the
    two words, so letter order and repeated letters are ignored. This keeps
    the demo simple but makes it a coarse spelling corrector.

    Parameters
    ----------
    w : str
        The (possibly misspelled) word to correct.
    candidates : iterable of str, optional
        Words to choose from. Defaults to the module-level ``dic``, so the
        original one-argument call keeps working unchanged.

    Returns
    -------
    str
        The candidate with the smallest distance to ``w``; on ties the
        earliest candidate wins.

    Raises
    ------
    ValueError
        If there are no candidates to choose from.
    """
    vocabulary = list(dic if candidates is None else candidates)
    if not vocabulary:
        raise ValueError("recommend() needs at least one candidate word")

    letters = set(w)  # loop-invariant, so build it once

    # min() returns the first minimal element, i.e. the same tie-breaking as
    # taking the index of the first smallest score.
    return min(
        vocabulary,
        key=lambda word: nltk.jaccard_distance(letters, set(word)),
    )

In [18]:
# Misspelling whose letters overlap most with 'banana' in the vocabulary.
recommend("bonano")

'banana'

In [19]:
# One wrong letter: 'apple' has the largest character overlap with the input.
recommend("applo")

'apple'

### NER and working with spaCy
### Named Entity Recognition

In [21]:
import spacy

In [22]:
# Load the English pipeline installed with
# `python -m spacy download en_core_web_sm`; `nlp` is reused by every spaCy cell.
nlp = spacy.load("en_core_web_sm")

In [23]:
# Rebinds `doc` (a plain str in the PoS tagging section) to a spaCy Doc.
# The double space before 'Mumbai' is part of the input and shows up as a
# separate whitespace token in the lemma output below.
doc = nlp("John lost his watch while travelling from  Mumbai to Delhi last friday with Jessy from TATA Steel")

In [24]:
# Lemmatization: each token exposes its lemma as `lemma_`, so no separate
# lemmatizer object or part-of-speech hint is passed (compare the NLTK cells).
for w in doc:
    print(w.lemma_)

John
lose
his
watch
while
travel
from
 
Mumbai
to
Delhi
last
friday
with
Jessy
from
TATA
Steel


In [27]:
# Sentence tokenization: `doc.sents` yields one span per sentence.
# The lowercase 'we' still starts its own sentence in the output.
doc = nlp("John lost his watch. Are you happy with it? we are travelling from  Mumbai to Delhi.")
for w in doc.sents:
    print(w)

John lost his watch.
Are you happy with it?
we are travelling from  Mumbai to Delhi.


In [28]:
# Word tokenization: iterating the Doc itself yields individual tokens,
# including punctuation and the whitespace token caused by the double space.
doc = nlp("John lost his watch. Are you happy with it? we are travelling from  Mumbai to Delhi.")
for w in doc:
    print(w)

John
lost
his
watch
.
Are
you
happy
with
it
?
we
are
travelling
from
 
Mumbai
to
Delhi
.


In [25]:
from spacy import displacy

In [26]:
# Highlight the named entities of `doc` inline in the notebook.
# NOTE(review): this cell's execution count (26) is lower than that of the two
# cells above it (27, 28), so it was originally run against the `doc` from the
# lemmatization cell; on a top-to-bottom run it renders whichever `doc` was
# assigned last instead.
displacy.render(doc,style='ent',jupyter=True)

In [43]:
# Render entities one sentence at a time: each sentence from `doc.sents` is
# printed, then converted with as_doc() and handed to displacy.
doc = nlp("John lost his watch. Are you happy with it? we are travelling from  Mumbai to Delhi.")
for w in doc.sents:
    print(w)
    displacy.render(w.as_doc(),style='ent',jupyter=True)

John lost his watch.


Are you happy with it?


we are travelling from  Mumbai to Delhi.


In [35]:
# The object returned by nlp(...) is a spaCy Doc.
type(doc)

spacy.tokens.doc.Doc

In [36]:
# `w` is the loop variable left over from the last `for w in doc.sents` loop,
# i.e. the final sentence, which is a Span rather than a Doc.
type(w)

spacy.tokens.span.Span

In [37]:
# A Doc displays as its original text.
doc

John lost his watch. Are you happy with it? we are travelling from  Mumbai to Delhi.

In [38]:
# The leftover Span displays as the text of the last sentence only.
w

we are travelling from  Mumbai to Delhi.

In [41]:
# Harder sentence-splitting input: the period in 'Mr.' and the dot inside the
# e-mail address do not produce sentence breaks (see output).
doc = nlp("Mr. John lost his watch. Are you happy with it? we are travelling from  Mumbai to Delhi. please email me at anshu@gmail.com.")
for w in doc.sents:
    print(w)

Mr. John lost his watch.
Are you happy with it?
we are travelling from  Mumbai to Delhi.
please email me at anshu@gmail.com.
