# Text Processing using NLTK

    - Tokenization
    - Stemming and Lemmatization
    - POS tagging
    - Spelling Correction
    - Named Entity Recognition (NER)
    

In [2]:
import nltk

In [3]:
data = "Jakarta, Indonesia's massive capital, sits on the northwest coast of the island of Java. A historic mix of cultures – Javanese, Malay, Chinese, Arab, Indian and European – has influenced its architecture, language and cuisine. The old town, Kota Tua, is home to Dutch colonial buildings, Glodok (Jakarta’s Chinatown) and the old port of Sunda Kelapa, where traditional wooden schooners dock."

In [4]:
print(data)

Jakarta, Indonesia's massive capital, sits on the northwest coast of the island of Java. A historic mix of cultures – Javanese, Malay, Chinese, Arab, Indian and European – has influenced its architecture, language and cuisine. The old town, Kota Tua, is home to Dutch colonial buildings, Glodok (Jakarta’s Chinatown) and the old port of Sunda Kelapa, where traditional wooden schooners dock.


In [5]:
# Tokenization: split raw text into a list of tokens (sentences here, words in a later cell).
# NOTE(review): sent_tokenize presumably needs NLTK's sentence-tokenizer data
# downloaded beforehand (nltk.download); no download step is shown in this notebook.
nltk.sent_tokenize(data)

["Jakarta, Indonesia's massive capital, sits on the northwest coast of the island of Java.",
 'A historic mix of cultures – Javanese, Malay, Chinese, Arab, Indian and European – has influenced its architecture, language and cuisine.',
 'The old town, Kota Tua, is home to Dutch colonial buildings, Glodok (Jakarta’s Chinatown) and the old port of Sunda Kelapa, where traditional wooden schooners dock.']

In [6]:
data = """
Jakarta, Indonesia's massive capital, sits on the northwest coast of the island of Java. A historic mix of cultures – Javanese, Malay, Chinese, Arab, Indian and European – has influenced its architecture, language and cuisine. The old town, Kota Tua, is home to Dutch colonial buildings, Glodok (Jakarta’s Chinatown) and the old port of Sunda Kelapa, where traditional wooden schooners dock.
Have you been to Jakarta ever? Hey thanks! Can you drop an email to Mr. James Carter on 
james@hello.com? please also keep manager@hello.com in cc along with dd@hello.com.
"""

In [7]:
print(data)


Jakarta, Indonesia's massive capital, sits on the northwest coast of the island of Java. A historic mix of cultures – Javanese, Malay, Chinese, Arab, Indian and European – has influenced its architecture, language and cuisine. The old town, Kota Tua, is home to Dutch colonial buildings, Glodok (Jakarta’s Chinatown) and the old port of Sunda Kelapa, where traditional wooden schooners dock.
Have you been to Jakarta ever? Hey thanks! Can you drop an email to Mr. James Carter on 
james@hello.com? please also keep manager@hello.com in cc along with dd@hello.com.



In [8]:
# Same call on the extended text. In the output below, the abbreviation "Mr."
# does not end a sentence, and the leading/embedded newlines are kept verbatim.
nltk.sent_tokenize(data)

["\nJakarta, Indonesia's massive capital, sits on the northwest coast of the island of Java.",
 'A historic mix of cultures – Javanese, Malay, Chinese, Arab, Indian and European – has influenced its architecture, language and cuisine.',
 'The old town, Kota Tua, is home to Dutch colonial buildings, Glodok (Jakarta’s Chinatown) and the old port of Sunda Kelapa, where traditional wooden schooners dock.',
 'Have you been to Jakarta ever?',
 'Hey thanks!',
 'Can you drop an email to Mr. James Carter on \njames@hello.com?',
 'please also keep manager@hello.com in cc along with dd@hello.com.']

In [9]:
# Word-level tokens. Things to notice in the output below: punctuation becomes
# its own token, the straight-apostrophe possessive stays together as "'s" while
# the curly one is split into '’' and 's', and e-mail addresses are broken at '@'.
nltk.word_tokenize(data)

['Jakarta',
 ',',
 'Indonesia',
 "'s",
 'massive',
 'capital',
 ',',
 'sits',
 'on',
 'the',
 'northwest',
 'coast',
 'of',
 'the',
 'island',
 'of',
 'Java',
 '.',
 'A',
 'historic',
 'mix',
 'of',
 'cultures',
 '–',
 'Javanese',
 ',',
 'Malay',
 ',',
 'Chinese',
 ',',
 'Arab',
 ',',
 'Indian',
 'and',
 'European',
 '–',
 'has',
 'influenced',
 'its',
 'architecture',
 ',',
 'language',
 'and',
 'cuisine',
 '.',
 'The',
 'old',
 'town',
 ',',
 'Kota',
 'Tua',
 ',',
 'is',
 'home',
 'to',
 'Dutch',
 'colonial',
 'buildings',
 ',',
 'Glodok',
 '(',
 'Jakarta',
 '’',
 's',
 'Chinatown',
 ')',
 'and',
 'the',
 'old',
 'port',
 'of',
 'Sunda',
 'Kelapa',
 ',',
 'where',
 'traditional',
 'wooden',
 'schooners',
 'dock',
 '.',
 'Have',
 'you',
 'been',
 'to',
 'Jakarta',
 'ever',
 '?',
 'Hey',
 'thanks',
 '!',
 'Can',
 'you',
 'drop',
 'an',
 'email',
 'to',
 'Mr.',
 'James',
 'Carter',
 'on',
 'james',
 '@',
 'hello.com',
 '?',
 'please',
 'also',
 'keep',
 'manager',
 '@',
 'hello.com',
 '

## Stemming & lemmatization

    - morphological techniques to convert a word to its root form
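    - stemming is faster but less accurate (the stem may not be a real word, e.g. "flying" -> "fli")
    - lemmatization is comparatively slower but more accurate (it returns a dictionary word, e.g. "flying" -> "fly")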

In [11]:
from nltk.stem import PorterStemmer
# One stemmer instance is created here and reused by the following cells.
ps = PorterStemmer()
# Plural noun -> stem.
ps.stem("cars")

'car'

In [12]:
ps.stem("going")

'go'

In [13]:
# A stem need not be a real word: the output below is 'fli', not 'fly'.
ps.stem("flying")

'fli'

In [14]:
ps.stem("boxes")

'box'

In [16]:
from nltk.stem import WordNetLemmatizer
# NOTE(review): presumably requires the WordNet corpus to be downloaded
# beforehand (nltk.download); no download step is shown in this notebook.
wd = WordNetLemmatizer()
# No part-of-speech argument is passed; the irregular plural maps to 'child'.
wd.lemmatize("children")

'child'

In [17]:
wd.lemmatize("wives")

'wife'

In [18]:
# The second argument is the part of speech; with 'v' (verb) the result is 'fly'.
wd.lemmatize("flying",'v')

'fly'

## POS Tagging

In [19]:
# Sample text for part-of-speech tagging.
doc = "John lost his watch while walking along the streets of West Jakarta and found at 100 steps ahead again. He met his friend Jasmine yesterday and thanked her for the watch."
# Tokenize first, then tag: the result is a list of (token, tag) pairs.
nltk.pos_tag(nltk.word_tokenize(doc))

[('John', 'NNP'),
 ('lost', 'VBD'),
 ('his', 'PRP$'),
 ('watch', 'NN'),
 ('while', 'IN'),
 ('walking', 'VBG'),
 ('along', 'IN'),
 ('the', 'DT'),
 ('streets', 'NNS'),
 ('of', 'IN'),
 ('West', 'NNP'),
 ('Jakarta', 'NNP'),
 ('and', 'CC'),
 ('found', 'VBN'),
 ('at', 'IN'),
 ('100', 'CD'),
 ('steps', 'NNS'),
 ('ahead', 'RB'),
 ('again', 'RB'),
 ('.', '.'),
 ('He', 'PRP'),
 ('met', 'VBD'),
 ('his', 'PRP$'),
 ('friend', 'NN'),
 ('Jasmine', 'NNP'),
 ('yesterday', 'NN'),
 ('and', 'CC'),
 ('thanked', 'VBD'),
 ('her', 'PRP'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('watch', 'NN'),
 ('.', '.')]

In [20]:
# Print the description and example words for a single tag from the output above.
nltk.help.upenn_tagset("VBG")

VBG: verb, present participle or gerund
    telegraphing stirring focusing angering judging stalling lactating
    hankerin' alleging veering capping approaching traveling besieging
    encrypting interrupting erasing wincing ...


## Spelling Correction

In [25]:
import numpy as np

In [22]:
# Jaccard distance between the two words' character sets. The sets share no
# character here (the comparison is case-sensitive: 'H' != 'h'), hence 1.0.
nltk.jaccard_distance(set("Hello"),set("Anshu"))

1.0

In [23]:
# {o,r,a,n,g,e} vs {o,r,e,n,g}: 5 shared characters out of 6 distinct ones,
# so the distance is 1 - 5/6 ≈ 0.167.
nltk.jaccard_distance(set("orange"),set("orenge"))

0.16666666666666666

In [24]:
# Small vocabulary of correctly spelled words; recommend() picks its answer from this list.
mydic = ['orange','apple','mango','banana','grapes']

In [26]:
def recommend(word, candidates=None):
    """Return the candidate word closest to ``word`` by Jaccard distance.

    Each string is reduced to its set of characters before comparison, so
    letter order and repeated letters are ignored and the match is
    case-sensitive.

    Parameters
    ----------
    word : str
        The (possibly misspelled) word to correct.
    candidates : iterable of str, optional
        Words to choose from. Defaults to the notebook-level ``mydic`` list,
        which keeps existing ``recommend(word)`` calls working unchanged.

    Returns
    -------
    str
        The candidate with the smallest distance; on a tie the earliest
        candidate wins.

    Raises
    ------
    ValueError
        If there are no candidates to choose from.
    """
    if candidates is None:
        candidates = mydic
    # Build the query's character set once instead of once per candidate.
    letters = set(word)
    # min() returns the first minimal element, matching argmin's tie-breaking.
    return min(candidates, key=lambda w: nltk.jaccard_distance(set(w), letters))

In [27]:
recommend("mongo")

'mango'

In [30]:
recommend('banano')

'banana'

In [31]:
recommend("opplo")

'apple'

## Named Entity Recognition

    pip install spacy
    python -m spacy download en_core_web_sm

In [32]:
import spacy

In [42]:
# Load the small English pipeline (the model installed by the
# `python -m spacy download en_core_web_sm` step listed above).
nlp = spacy.load("en_core_web_sm")
# NOTE: this rebinds `doc`, which held the plain POS-tagging string earlier in
# the notebook; from here on it is the object returned by the spaCy pipeline.
doc  = nlp("Jhon Clary bought watch this sunday from Glax Watch & Co. shop from 45 green cross street, in Mumbai. John met Jasmin last and went back to Dubai on 4 JAN 2022 to meet somone from Microsoft Inc. and handed over $ 5000 to his friend.")

In [43]:
from spacy import displacy

In [44]:
# style='ent' selects the named-entity visualisation; jupyter=True requests
# rendering directly in the notebook output.
displacy.render(doc,style='ent',jupyter=True)

In [48]:
# No style is given — presumably this falls back to displacy's default
# (dependency-parse) view; confirm against the spaCy documentation.
displacy.render(doc)