# Text processing with NLTK
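- Tokenization – splitting text into tokens (sentences or words)
- Lemmatization – reducing a word to its dictionary base form (lemma)
- POS tagging – labelling each token with its part of speech
- Spelling correction – suggesting the closest known word for a misspelled one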

In [1]:
import nltk
# Fetch the NLTK data packages used later in this notebook. Each call logs its
# progress and returns a bool; the last call's return value is the cell output.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. "punkt_tab", "averaged_perceptron_tagger_eng") - confirm against the
# installed NLTK version if a LookupError appears further down.
nltk.download("punkt")  # models behind sent_tokenize / word_tokenize
nltk.download("wordnet")  # lexical database behind WordNetLemmatizer
nltk.download("tagsets")  # tag descriptions shown by nltk.help.upenn_tagset
nltk.download("averaged_perceptron_tagger")  # model behind nltk.pos_tag

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Sample passage (four sentences about Pune) used as input for the
# tokenization examples below.
data = "Pune is a sprawling city in the western Indian state of Maharashtra. It was once the base of the Peshwas (prime ministers) of the Maratha Empire, which lasted from 1674 to 1818. It's known for the grand Aga Khan Palace, built in 1892 and now a memorial to Mahatma Gandhi, whose ashes are preserved in the garden. The 8th-century Pataleshwar Cave Temple is dedicated to the Hindu god Shiva."
print(data)

Pune is a sprawling city in the western Indian state of Maharashtra. It was once the base of the Peshwas (prime ministers) of the Maratha Empire, which lasted from 1674 to 1818. It's known for the grand Aga Khan Palace, built in 1892 and now a memorial to Mahatma Gandhi, whose ashes are preserved in the garden. The 8th-century Pataleshwar Cave Temple is dedicated to the Hindu god Shiva.


In [3]:
# Sentence tokenization: split the passage into a list with one string per sentence.
nltk.sent_tokenize(data)

['Pune is a sprawling city in the western Indian state of Maharashtra.',
 'It was once the base of the Peshwas (prime ministers) of the Maratha Empire, which lasted from 1674 to 1818.',
 "It's known for the grand Aga Khan Palace, built in 1892 and now a memorial to Mahatma Gandhi, whose ashes are preserved in the garden.",
 'The 8th-century Pataleshwar Cave Temple is dedicated to the Hindu god Shiva.']

In [4]:
# Word tokenization: split the passage into word tokens. Punctuation marks come
# out as separate tokens and the contraction "It's" is split into "It" + "'s".
nltk.word_tokenize(data)

['Pune',
 'is',
 'a',
 'sprawling',
 'city',
 'in',
 'the',
 'western',
 'Indian',
 'state',
 'of',
 'Maharashtra',
 '.',
 'It',
 'was',
 'once',
 'the',
 'base',
 'of',
 'the',
 'Peshwas',
 '(',
 'prime',
 'ministers',
 ')',
 'of',
 'the',
 'Maratha',
 'Empire',
 ',',
 'which',
 'lasted',
 'from',
 '1674',
 'to',
 '1818',
 '.',
 'It',
 "'s",
 'known',
 'for',
 'the',
 'grand',
 'Aga',
 'Khan',
 'Palace',
 ',',
 'built',
 'in',
 '1892',
 'and',
 'now',
 'a',
 'memorial',
 'to',
 'Mahatma',
 'Gandhi',
 ',',
 'whose',
 'ashes',
 'are',
 'preserved',
 'in',
 'the',
 'garden',
 '.',
 'The',
 '8th-century',
 'Pataleshwar',
 'Cave',
 'Temple',
 'is',
 'dedicated',
 'to',
 'the',
 'Hindu',
 'god',
 'Shiva',
 '.']

## Lemmatization

In [6]:
from nltk.stem import WordNetLemmatizer
# One lemmatizer instance is created here and reused by the cells below.
wd = WordNetLemmatizer()
# No part-of-speech argument is given, so the lemmatizer's default is used;
# the irregular plural is mapped to its singular form.
wd.lemmatize('children')

'child'

In [7]:
# Irregular plural ("-ves" ending) reduced to its singular lemma.
wd.lemmatize('wives')

'wife'

In [8]:
# Regular "-es" plural reduced to its singular lemma.
wd.lemmatize('boxes')

'box'

In [9]:
wd.lemmatize('went','v') # second argument 'v' = treat the word as a verb; the irregular past tense maps to its base form

'go'

In [11]:
wd.lemmatize('happier','a')# second argument 'a' = treat the word as an adjective; the comparative maps to its base form

'happy'

## POS Tagging

In [12]:
# Tokenize the sentence first, then tag it: the result is a list of
# (token, tag) pairs, one per token, punctuation included.
nltk.pos_tag(nltk.word_tokenize("I lost my watch while roaming in Mumbai."))

[('I', 'PRP'),
 ('lost', 'VBD'),
 ('my', 'PRP$'),
 ('watch', 'NN'),
 ('while', 'IN'),
 ('roaming', 'VBG'),
 ('in', 'IN'),
 ('Mumbai', 'NNP'),
 ('.', '.')]

In [21]:
# Print the description and example words for one of the tags seen in the
# pos_tag output (here the tag assigned to "my").
nltk.help.upenn_tagset("PRP$")

PRP$: pronoun, possessive
    her his mine my our ours their thy your


## Spelling Correction

In [14]:
# Jaccard distance between the *sets of characters* of two words:
# (|union| - |intersection|) / |union|. These two words share only the letter
# "a", so the distance is close to 1 (very dissimilar).
nltk.jaccard_distance(set("orange"),set("mumbai"))

0.9

In [16]:
# A one-letter misspelling shares almost all of its characters with the
# correct word, so the distance is close to 0 (very similar).
nltk.jaccard_distance(set("orange"),set("orenge"))

0.16666666666666666

In [17]:
import numpy as np

In [18]:
# Default vocabulary of correctly spelled words that suggestions are drawn from.
dic = ["orange",'apple','grapes','mango','banana']

def recommend(word, candidates=None):
    """Return the vocabulary entry whose character set is closest to ``word``.

    Closeness is the Jaccard distance between the *sets of characters* of the
    two strings, so letter order and repeated letters are ignored.

    Parameters
    ----------
    word : str
        The (possibly misspelled) word to correct.
    candidates : iterable of str, optional
        Correctly spelled words to choose from. Defaults to the module-level
        ``dic`` list, which keeps existing ``recommend(word)`` calls working.

    Returns
    -------
    str
        The candidate with the smallest distance to ``word``. When several
        candidates tie, the one that appears first wins.

    Raises
    ------
    ValueError
        If ``candidates`` is empty.
    """
    if candidates is None:
        candidates = dic
    letters = set(word)  # computed once instead of once per candidate
    # min() with a key returns the first minimal element, which is the same
    # entry that indexing by the first arg-min of a score list would select.
    return min(candidates, key=lambda w: nltk.jaccard_distance(letters, set(w)))

In [19]:
# "applo" is not in the vocabulary; the entry with the closest character set is suggested.
recommend("applo")

'apple'

In [20]:
# Another misspelling that is not in the vocabulary, resolved the same way.
recommend("mongo")

'mango'