# NLTK (Natural Language ToolKit)

In [36]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, state_union
from nltk.stem import PorterStemmer, SnowballStemmer
from pprint import pprint
from IPython.display import HTML, display
# nltk.download()  # uncomment on first use to fetch the NLTK data (corpora, tokenizer and tagger models) used below

In [49]:
# Show which file the imported nltk package was loaded from.
nltk_location = nltk.__file__
print(nltk_location)

C:\Users\Yasin\Anaconda3\lib\site-packages\nltk\__init__.py


## Tokenizing

In [37]:
from nltk.tokenize import word_tokenize, sent_tokenize

# Split the same sample string two ways: first into word-level tokens,
# then into sentences. Each result is printed on its own line.
text = 'Hello Mr. Smith. How are you today?'
print(word_tokenize(text), sent_tokenize(text), sep='\n')

['Hello', 'Mr.', 'Smith', '.', 'How', 'are', 'you', 'today', '?']
['Hello Mr. Smith.', 'How are you today?']


## Stopwords

In [39]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

sw = stopwords.words('english')
# `sw` is probed with `in` once per token below; build a frozenset once so each
# check is a hash lookup rather than a scan of the whole stop-word sequence.
# `sw` itself is left untouched for any later cell that uses it.
sw_lookup = frozenset(sw)
text = 'Hello Mr. Smith. How are you today? This is a great day! My name is Yasin Zamani.'

# Keep only the tokens whose lower-cased form is not an English stop word.
filtered_text = [x for x in word_tokenize(text) if x.lower() not in sw_lookup]

print(filtered_text)

# Render the sentence with stop words highlighted in yellow. The text is split on
# whitespace here (not with word_tokenize), so punctuation stays attached to its
# word and such a chunk is highlighted only if it matches a stop word exactly.
marked_text = ' '.join(
    '<span style="background-color:yellow">{}</span>'.format(x) if x.lower() in sw_lookup else x
    for x in text.split()
)
HTML(marked_text)

['Hello', 'Mr.', 'Smith', '.', 'today', '?', 'great', 'day', '!', 'name', 'Yasin', 'Zamani', '.']


## Stemming

In [41]:
# Compare the Porter stemmer with the English Snowball stemmer on the same words.
ps = PorterStemmer()
ss = SnowballStemmer('english')
words = ['write', 'wrote', 'written', 'writing', 'hopefully', 'generously']
for w in words:
    # One output line per word: Porter stem first, Snowball stem second.
    stems = [stemmer.stem(w) for stemmer in (ps, ss)]
    print(*stems)

write write
wrote wrote
written written
write write
hope hope
gener generous


## Part of speech tagging

In [42]:
# Part-of-speech tagging pipeline: raw text -> word tokens -> tagged tokens.
# Nothing is displayed by this cell; the result is kept in `tagged`.

# Raw contents of the '2006-GWBush.txt' document from the state_union corpus.
text = state_union.raw('2006-GWBush.txt')
# Word-level tokens of the whole document.
words = word_tokenize(text)
# Output of NLTK's default tagger for every token in `words`.
tagged = nltk.pos_tag(words)

## Chunking

## Chinking

## Named Entity Recognition

## Lemmatizing

In [48]:
from nltk.stem import WordNetLemmatizer
# Lemmatize a plural noun, then the same word twice: once with the default
# part of speech and once explicitly as an adjective (pos='a').
wnl = WordNetLemmatizer()
print(
    wnl.lemmatize('cats'),
    wnl.lemmatize('better'),
    wnl.lemmatize('better', pos='a'),
    sep='\n',
)

cat
better
good


## Corpora

Downloaded corpora are stored in the NLTK data directory; on this machine that is `C:\Users\Yasin\AppData\Roaming\nltk_data\corpora`.

In [51]:
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

# Read 'bible-kjv.txt' from the Gutenberg corpus and split it into sentences.
text = gutenberg.raw('bible-kjv.txt')
sentences = sent_tokenize(text)

# Show a ten-sentence window (indices 5 through 14) rather than the whole text.
preview = sentences[5:15]
print(preview)

['1:5 And God called the light Day, and the darkness he called Night.', 'And the evening and the morning were the first day.', '1:6 And God said, Let there be a firmament in the midst of the waters,\nand let it divide the waters from the waters.', '1:7 And God made the firmament, and divided the waters which were\nunder the firmament from the waters which were above the firmament:\nand it was so.', '1:8 And God called the firmament Heaven.', 'And the evening and the\nmorning were the second day.', '1:9 And God said, Let the waters under the heaven be gathered together\nunto one place, and let the dry land appear: and it was so.', '1:10 And God called the dry land Earth; and the gathering together of\nthe waters called he Seas: and God saw that it was good.', '1:11 And God said, Let the earth bring forth grass, the herb yielding\nseed, and the fruit tree yielding fruit after his kind, whose seed is\nin itself, upon the earth: and it was so.', '1:12 And the earth brought forth grass, and

## Wordnet

In [57]:
from nltk.corpus import wordnet
# Look up every WordNet synset for 'program', then inspect the first one:
# its name, the name of its first lemma, its definition and its examples.
syns = wordnet.synsets('program')
first_sense = syns[0]
print(
    syns,
    first_sense.name(),
    first_sense.lemmas()[0].name(),
    first_sense.definition(),
    first_sense.examples(),
    sep='\n',
)

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]
plan.n.01
plan
a series of steps to be carried out or goals to be accomplished
['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [63]:
# Flatten the lemmas of every WordNet synset of 'good' into one list,
# preserving synset order and lemma order within each synset.
good_lemmas = [lemma for synset in wordnet.synsets('good') for lemma in synset.lemmas()]

# Every lemma name counts as a synonym.
synonyms = [lemma.name() for lemma in good_lemmas]

# For lemmas that have antonyms, record only the first one listed.
antonyms = []
for lemma in good_lemmas:
    opposites = lemma.antonyms()
    if opposites:
        antonyms.append(opposites[0].name())

# Print as sets so that repeated names are collapsed.
print(set(synonyms))
print(set(antonyms))

{'safe', 'secure', 'estimable', 'well', 'near', 'unspoiled', 'salutary', 'unspoilt', 'expert', 'sound', 'respectable', 'beneficial', 'commodity', 'proficient', 'in_force', 'full', 'adept', 'in_effect', 'serious', 'ripe', 'honest', 'skilful', 'dependable', 'trade_good', 'soundly', 'just', 'skillful', 'upright', 'thoroughly', 'good', 'effective', 'undecomposed', 'dear', 'right', 'practiced', 'honorable', 'goodness'}
{'badness', 'evil', 'evilness', 'bad', 'ill'}
