# Natural Language Processing Using NLTK

In [4]:
import numpy as np
import pandas as pd

import nltk

# Tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize, PunktSentenceTokenizer, RegexpTokenizer

# Stemming and Lemmatizing
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Stopwords
from nltk.corpus import stopwords, state_union

### POS TAG LIST

1.	CC	Coordinating conjunction
2.	CD	Cardinal number
3.	DT	Determiner
4.	EX	Existential there
5.	FW	Foreign word
6.	IN	Preposition or subordinating conjunction
7.	JJ	Adjective
8.	JJR	Adjective, comparative
9.	JJS	Adjective, superlative
10.	LS	List item marker
11.	MD	Modal
12.	NN	Noun, singular or mass
13.	NNS	Noun, plural
14.	NNP	Proper noun, singular
15.	NNPS	Proper noun, plural
16.	PDT	Predeterminer
17.	POS	Possessive ending
18.	PRP	Personal pronoun
20.	RB  Adverb
21.	RBR	Adverb, comparative
22.	RBS	Adverb, superlative
23.	RP	Particle
24.	SYM	Symbol
25.	TO	to
26.	UH	Interjection
27.	VB	Verb, base form
28.	VBD	Verb, past tense
29.	VBG	Verb, gerund or present participle
30.	VBN	Verb, past participle
31.	VBP	Verb, non-3rd person singular present
32.	VBZ	Verb, 3rd person singular present
33.	WDT	Wh-determiner
34.	WP	Wh-pronoun
36.	WRB	Wh-adverb

### Data

In [5]:
sample_data = "Silently run Pranjal man Pathak. I work for Wipro. I like playing with pets. I am a good human!"

# Reading a txt file
text = open('Cat_100.txt').read()i

### Tokenize


In [6]:
words = word_tokenize(sample_data)
sentence = sent_tokenize(sample_data)

# Using Regex in Tokenizer:
pattern = RegexpTokenizer(r'\w+')
filtered_words = pattern.tokenize(sample_data)

### Stopwords


In [7]:
#Setting lang = 'english'
stop_words = set(stopwords.words("english"))

filtered_words = []

for i in words:
    if i not in stop_words:
        filtered_words.append(i)    

### Stemming

In [8]:
ps = PorterStemmer()

for i in words:
    print(ps.stem(i))

silent
run
pranjal
man
pathak
.
I
work
for
wipro
.
I
like
play
with
pet
.
I
am
a
good
human
!


### Part of Speech Tagging

In [9]:
sample_data

'Silently run Pranjal man Pathak. I work for Wipro. I like playing with pets. I am a good human!'

In [10]:
# Tokenize the sentences -
tokens = sent_tokenize(sample_data)

for i in tokens:
    words = nltk.word_tokenize(i)
    tag = nltk.pos_tag(words)
    
    print(tag,"\n")

[('Silently', 'RB'), ('run', 'VBN'), ('Pranjal', 'NNP'), ('man', 'NN'), ('Pathak', 'NNP'), ('.', '.')] 

[('I', 'PRP'), ('work', 'VBP'), ('for', 'IN'), ('Wipro', 'NNP'), ('.', '.')] 

[('I', 'PRP'), ('like', 'VBP'), ('playing', 'VBG'), ('with', 'IN'), ('pets', 'NNS'), ('.', '.')] 

[('I', 'PRP'), ('am', 'VBP'), ('a', 'DT'), ('good', 'JJ'), ('human', 'NN'), ('!', '.')] 



### Chunking

In [11]:
tokens = sent_tokenize(sample_data)

for i in tokens:
    words = nltk.word_tokenize(i)
    tag = nltk.pos_tag(words)

    chunkGram = r''' Chunk: { <RB.?>*<VB.?>*<NNP>+<NN>?}'''
    
    chunkParser = nltk.RegexpParser(chunkGram)
    chunked = chunkParser.parse(tag)
    
    #chunked.draw()

### Chinking

In [12]:
tokens = sent_tokenize(sample_data)

for i in tokens:
    words = nltk.word_tokenize(i)
    tag = nltk.pos_tag(words)

    chunkGram = r''' Chunk: {<.*>+}
                            }<VB.?|IN|DT>+{'''
    
    chunkParser = nltk.RegexpParser(chunkGram)
    chunked = chunkParser.parse(tag)
    
    #chunked.draw()

### Named Entity Recognition

In [13]:
tokens = sent_tokenize(sample_data)

for i in tokens:
    words = nltk.word_tokenize(i)
    tag = nltk.pos_tag(words)
    
    namedEnt = nltk.ne_chunk(tag)
    namedEnt.draw()

### Lemmatizing

In [14]:
lm = WordNetLemmatizer()

print(lm.lemmatize('cats'))
print(lm.lemmatize('better'))
print(lm.lemmatize('better', pos='a'))
print(lm.lemmatize('running', pos='a'))
print(lm.lemmatize('worst', pos='a'))

cat
better
good
running
bad


### NLTK Corpora

In [15]:
from nltk.corpus import gutenberg
sample_corpus = gutenberg.raw('bible-kjv.txt')

### WORDNET

In [16]:
from nltk.corpus import wordnet

# Set of words for the word
syn_sets = wordnet.synsets('Program')

print("Set of Words related to \"Program\"- \n")
print(syn_sets)

# The first word of set
print("\n\nFirst Word = ", syn_sets[0].name())
print("\nLemma : ", syn_sets[0].lemmas())
print("Def : ", syn_sets[0].definition())

Set of Words related to "Program"- 

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]


First Word =  plan.n.01

Lemma :  [Lemma('plan.n.01.plan'), Lemma('plan.n.01.program'), Lemma('plan.n.01.programme')]
Def :  a series of steps to be carried out or goals to be accomplished


In [18]:
# Synonyms and Antynonyms
good_set = wordnet.synsets('good')

In [19]:
synonyms = []
antonyms = []

for s in good_set:
    for lem in s.lemmas():
        synonyms.append(lem.name())
        if lem.antonyms():
            antonyms.append(lem.antonyms()[0].name())

print(synonyms, "\n")
print(antonyms)

['good', 'good', 'goodness', 'good', 'goodness', 'commodity', 'trade_good', 'good', 'good', 'full', 'good', 'good', 'estimable', 'good', 'honorable', 'respectable', 'beneficial', 'good', 'good', 'good', 'just', 'upright', 'adept', 'expert', 'good', 'practiced', 'proficient', 'skillful', 'skilful', 'good', 'dear', 'good', 'near', 'dependable', 'good', 'safe', 'secure', 'good', 'right', 'ripe', 'good', 'well', 'effective', 'good', 'in_effect', 'in_force', 'good', 'good', 'serious', 'good', 'sound', 'good', 'salutary', 'good', 'honest', 'good', 'undecomposed', 'unspoiled', 'unspoilt', 'good', 'well', 'good', 'thoroughly', 'soundly', 'good'] 

['evil', 'evilness', 'bad', 'badness', 'bad', 'evil', 'ill']


### Semantic Similarity - Method: WUP

In [20]:
# Semantic Similarity

w1 = wordnet.synset('Animal.n.01')
w2 = wordnet.synset('Dog.n.01')
print(w1.wup_similarity(w2))

w1 = wordnet.synset('Animal.n.01')
w2 = wordnet.synset('Human.n.01')
print(w1.wup_similarity(w2))

w1 = wordnet.synset('Animal.n.01')
w2 = wordnet.synset('Tea.n.01')
print(w1.wup_similarity(w2))

0.875
0.6666666666666666
0.2857142857142857


### Frequency Distribution

In [30]:
# First remove the stop words
stop_words = set(stopwords.words('english'))

filtered_words = []

# List of filtered words without Stopwords
for w in words:
    if w not in stop_words:
        filtered_words.append(w)

In [31]:
# Using Frequency Distribution feature of NLTK
freqDist_words = nltk.FreqDist(filtered_words)

# TOP 5 MOST COMMON words
top_5 = freqDist_words.most_common(5)

print(top_5)

[('I', 1), ('good', 1), ('human', 1), ('!', 1)]
