# Text processing with NLTK
- Tokenization
- Lemmatization
- POS tagging
- Spelling correction

In [1]:
import nltk
# Fetch the NLTK resources the cells below rely on; packages that are already
# present are reported as up-to-date rather than downloaded again.
nltk.download("punkt")  # tokenizer models used by the tokenization cells
nltk.download("wordnet")  # lexical database behind WordNetLemmatizer
nltk.download("averaged_perceptron_tagger")  # model used by nltk.pos_tag
nltk.download("tagsets")  # tag descriptions shown by nltk.help.upenn_tagset
# NOTE(review): newer NLTK releases may look for "punkt_tab" and
# "averaged_perceptron_tagger_eng" instead — confirm against the installed version.

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

## Tokenization

In [2]:
# Sample paragraph (four sentences about Pune) used by the tokenization cells.
# Adjacent string literals are concatenated by the parser, so the value is one
# single-line string with exactly one space between sentences.
data = (
    "Pune is a sprawling city in the western Indian state of Maharashtra. "
    "It was once the base of the Peshwas (prime ministers) of the Maratha Empire, which lasted from 1674 to 1818. "
    "It's known for the grand Aga Khan Palace, built in 1892 and now a memorial to Mahatma Gandhi, whose ashes are preserved in the garden. "
    "The 8th-century Pataleshwar Cave Temple is dedicated to the Hindu god Shiva."
)
print(data)

Pune is a sprawling city in the western Indian state of Maharashtra. It was once the base of the Peshwas (prime ministers) of the Maratha Empire, which lasted from 1674 to 1818. It's known for the grand Aga Khan Palace, built in 1892 and now a memorial to Mahatma Gandhi, whose ashes are preserved in the garden. The 8th-century Pataleshwar Cave Temple is dedicated to the Hindu god Shiva.


In [3]:
# Split the paragraph into sentences; the result is a list of four strings.
nltk.sent_tokenize(data)

['Pune is a sprawling city in the western Indian state of Maharashtra.',
 'It was once the base of the Peshwas (prime ministers) of the Maratha Empire, which lasted from 1674 to 1818.',
 "It's known for the grand Aga Khan Palace, built in 1892 and now a memorial to Mahatma Gandhi, whose ashes are preserved in the garden.",
 'The 8th-century Pataleshwar Cave Temple is dedicated to the Hindu god Shiva.']

In [4]:
# Split the paragraph into word-level tokens. Punctuation becomes its own
# token, "It's" is split into "It" + "'s", and "8th-century" stays whole.
nltk.word_tokenize(data)

['Pune',
 'is',
 'a',
 'sprawling',
 'city',
 'in',
 'the',
 'western',
 'Indian',
 'state',
 'of',
 'Maharashtra',
 '.',
 'It',
 'was',
 'once',
 'the',
 'base',
 'of',
 'the',
 'Peshwas',
 '(',
 'prime',
 'ministers',
 ')',
 'of',
 'the',
 'Maratha',
 'Empire',
 ',',
 'which',
 'lasted',
 'from',
 '1674',
 'to',
 '1818',
 '.',
 'It',
 "'s",
 'known',
 'for',
 'the',
 'grand',
 'Aga',
 'Khan',
 'Palace',
 ',',
 'built',
 'in',
 '1892',
 'and',
 'now',
 'a',
 'memorial',
 'to',
 'Mahatma',
 'Gandhi',
 ',',
 'whose',
 'ashes',
 'are',
 'preserved',
 'in',
 'the',
 'garden',
 '.',
 'The',
 '8th-century',
 'Pataleshwar',
 'Cave',
 'Temple',
 'is',
 'dedicated',
 'to',
 'the',
 'Hindu',
 'god',
 'Shiva',
 '.']

## Lemmatization

In [5]:
from nltk.stem import WordNetLemmatizer
# One lemmatizer instance, reused by every lemmatization cell below.
wd = WordNetLemmatizer()
# No part-of-speech argument is given here, so NLTK's default (noun) applies.
wd.lemmatize("cars")

'car'

In [6]:
# Plural ending in "-es".
wd.lemmatize("boxes")

'box'

In [7]:
# Plural whose stem changes ("-ves" -> "-fe").
wd.lemmatize("wives")

'wife'

In [8]:
# Irregular plural.
wd.lemmatize("children")

'child'

In [9]:
wd.lemmatize("went",'v')  # 'v' = treat the word as a verb; irregular past tense maps to its base form

'go'

In [10]:
wd.lemmatize("happier",'a')  # 'a' = treat the word as an adjective; comparative maps to its base form

'happy'

## POS tagging

In [11]:
# Tokenize the sentence, then tag each token with its part of speech.
# The result is a list of (token, tag) pairs.
nltk.pos_tag(nltk.word_tokenize("I lost my watch in Mumbai while roaming around beach."))

[('I', 'PRP'),
 ('lost', 'VBD'),
 ('my', 'PRP$'),
 ('watch', 'NN'),
 ('in', 'IN'),
 ('Mumbai', 'NNP'),
 ('while', 'IN'),
 ('roaming', 'VBG'),
 ('around', 'RB'),
 ('beach', 'NN'),
 ('.', '.')]

In [12]:
# Print the description and example words for the "RB" tag seen in the tagging output.
nltk.help.upenn_tagset("RB")

RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...


## Spelling correction

In [13]:
# Jaccard distance between the two words' sets of distinct letters.
# These two words share only the letter "a", so the distance is close to 1.
nltk.jaccard_distance(set("orange"),set("mumbai"))

0.9

In [14]:
# A one-letter misspelling shares almost all of its letters with the correct
# word, so the distance is small.
nltk.jaccard_distance(set("orange"),set("orenge"))

0.16666666666666666

In [15]:
import numpy as np

In [19]:
# Default vocabulary of correctly spelled words that recommend() chooses from.
dic = ['orange','apple','mango','banana','grapes']
def recommend(word, candidates=None):
    """Return the known word whose set of letters is closest to ``word``.

    Closeness is the Jaccard distance between the two words' character sets
    (``nltk.jaccard_distance``). Letter order and repeated letters are
    therefore ignored: this is a toy spelling corrector, not an
    edit-distance one.

    Parameters
    ----------
    word : str
        The (possibly misspelled) word to correct.
    candidates : iterable of str, optional
        Words to choose from. Defaults to the module-level ``dic``, which is
        looked up at call time so that rebinding ``dic`` in a later cell
        takes effect.

    Returns
    -------
    str
        The candidate with the smallest distance; on ties, the earliest one.

    Raises
    ------
    ValueError
        If there are no candidates to choose from.
    """
    vocabulary = dic if candidates is None else list(candidates)
    if not vocabulary:
        raise ValueError("recommend() needs at least one candidate word")
    letters = set(word)  # identical for every comparison, so build it once
    # min() returns the first minimal element, the same tie rule as np.argmin.
    return min(vocabulary, key=lambda w: nltk.jaccard_distance(set(w), letters))

In [20]:
# Misspelling of "apple" (last letter wrong).
recommend("applo")

'apple'

In [21]:
# Misspelling of "mango" (wrong vowel).
recommend("mongo")

'mango'