# Tokenizing Words and Sentences

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize

example_text = "Hello Mr. Smith, how are you doing today? The weather is great and Python is awesome. The sky is pink and you should eat cardboard"

print(sent_tokenize(example_text)) # Sentences have been separeted, output is a list

['Hello Mr. Smith, how are you doing today?', 'The weather is great and Python is awesome.', 'The sky is pink and you should eat cardboard']


In [4]:
print(word_tokenize(example_text))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pink', 'and', 'you', 'should', 'eat', 'cardboard']


In [5]:
for i in word_tokenize(example_text):
    print(i)

Hello
Mr.
Smith
,
how
are
you
doing
today
?
The
weather
is
great
and
Python
is
awesome
.
The
sky
is
pink
and
you
should
eat
cardboard


# Stop Words

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sentence = "This is an example showing of stop word filtration."
stop_words = set(stopwords.words("english"))

print(stop_words)

{'mustn', 'nor', 'once', "she's", 'they', 'about', 'down', 'ain', 'then', 'wasn', 'hasn', 'after', 'needn', 'do', 'being', 'having', 'myself', 'under', "you're", 'both', 've', 'you', 'here', 'further', 'she', 'he', "haven't", 'has', 'yours', 'other', 'a', 'itself', 'should', 'doing', 'or', 'before', 'how', 'don', "aren't", 'by', "don't", 'an', 'than', "couldn't", 'shan', 'same', 'hadn', 'haven', 'himself', 'his', "hadn't", 'off', 'her', 'some', 'your', 'their', 'were', 'into', 'as', 'me', 'themselves', "mustn't", 'weren', 'yourselves', 's', 't', 'we', "wouldn't", 'not', 'm', 'why', 'any', "didn't", 'aren', 'each', 'more', 'are', 'this', 'on', 'd', 'hers', 'no', "you've", 'herself', 'until', 'where', 'such', 'won', 'so', 'y', 'over', 'but', "shan't", "shouldn't", 'couldn', 'if', 'because', 'these', "it's", 'him', 'for', 'at', 'own', 'yourself', 'few', 'that', 'will', 'out', 'which', "you'll", 'i', 'during', 'from', 'there', 'whom', 'isn', "that'll", "doesn't", 'very', 'am', 'been', 'aga

In [8]:
words = word_tokenize(example_sentence)

print(words)

['This', 'is', 'an', 'example', 'showing', 'of', 'stop', 'word', 'filtration', '.']


In [9]:
filtered_sentence = []

for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)

print(filtered_sentence) # stopwords have been removed

['This', 'example', 'showing', 'stop', 'word', 'filtration', '.']


# Stemming

In [11]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

for w in example_words:
    print(ps.stem(w))

python
python
python
python
pythonli


In [12]:
new_text = "It is very important to be loved while loving the act of love which in itself is very lovely"

words = word_tokenize(new_text)

for w in words:
    print(ps.stem(w))

It
is
veri
import
to
be
love
while
love
the
act
of
love
which
in
itself
is
veri
love


# Part of Speech Tagging

In [None]:
# POS tag list:

# CC	coordinating conjunction
# CD	cardinal digit
# DT	determiner
# EX	existential there (like: "there is" ... think of it like "there exists")
# FW	foreign word
# IN	preposition/subordinating conjunction
# JJ	adjective	'big'
# JJR	adjective, comparative	'bigger'
# JJS	adjective, superlative	'biggest'
# LS	list marker	1)
# MD	modal	could, will
# NN	noun, singular 'desk'
# NNS	noun plural	'desks'
# NNP	proper noun, singular	'Harrison'
# NNPS	proper noun, plural	'Americans'
# PDT	predeterminer	'all the kids'
# POS	possessive ending	parent\'s
# PRP	personal pronoun	I, he, she
# PRP$	possessive pronoun	my, his, hers
# RB	adverb	very, silently,
# RBR	adverb, comparative	better
# RBS	adverb, superlative	best
# RP	particle	give up
# TO	to	go 'to' the store.
# UH	interjection	errrrrrrrm
# VB	verb, base form	take
# VBD	verb, past tense	took
# VBG	verb, gerund/present participle	taking
# VBN	verb, past participle	taken
# VBP	verb, sing. present, non-3d	take
# VBZ	verb, 3rd person sing. present	takes
# WDT	wh-determiner	which
# WP	wh-pronoun	who, what
# WP$	possessive wh-pronoun	whose
# WRB	wh-abverb	where, when

In [4]:
import nltk
from nltk.tokenize import word_tokenize

text = word_tokenize("Hello welcome to the world of to learn Categorizing and POS Tagging with NLTK and Python")

nltk.pos_tag(text)

[('Hello', 'NNP'),
 ('welcome', 'NN'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('world', 'NN'),
 ('of', 'IN'),
 ('to', 'TO'),
 ('learn', 'VB'),
 ('Categorizing', 'NNP'),
 ('and', 'CC'),
 ('POS', 'NNP'),
 ('Tagging', 'NNP'),
 ('with', 'IN'),
 ('NLTK', 'NNP'),
 ('and', 'CC'),
 ('Python', 'NNP')]

In [13]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer # This is an unsupervised ML Tokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

print(train_text[0:1000])

PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
February 2, 2005


9:10 P.M. EST 

THE PRESIDENT: Mr. Speaker, Vice President Cheney, members of Congress, fellow citizens: 

As a new Congress gathers, all of us in the elected branches of government share a great privilege: We've been placed in office by the votes of the people we serve. And tonight that is a privilege we share with newly-elected leaders of Afghanistan, the Palestinian Territories, Ukraine, and a free and sovereign Iraq. (Applause.) 

Two weeks ago, I stood on the steps of this Capitol and renewed the commitment of our nation to the guiding ideal of liberty for all. This evening I will set forth policies to advance that ideal at home and around the world. 

Tonight, with a healthy, growing economy, with more Americans going back to work, with our nation an active force for good in the world -- the state of our union is confident and strong. (Applause.) 

Our generati

In [11]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text) # Traning the Punkt tokenizer

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))

process_content()

Z'), ('been', 'VBN'), ('falling', 'VBG'), ('for', 'IN'), ('a', 'DT'), ('dozen', 'NN'), ('years', 'NNS'), ('in', 'IN'), ('a', 'DT'), ('row', 'NN'), ('.', '.')]
[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]
[('These', 'DT'), ('gains', 'NNS'), ('are', 'VBP'), ('evidence', 'NN'), ('of', 'IN'), ('a', 'DT'), ('quiet', 'JJ'), ('transformation', 'NN'), ('--', ':'), ('a', 'DT'), ('revolution', 'NN'), ('of', 'IN'), ('conscience', 'NN'), (',', ','), ('in', 'IN'), ('which', 'WDT'), ('a', 'DT'), ('rising', 'VBG'), ('generation', 'NN'), ('is', 'VBZ'), ('finding', 'VBG'), ('that', 'IN'), ('a', 'DT'), ('life', 'NN'), ('of', 'IN'), ('personal', 'JJ'), ('responsibility', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('life', 'NN'), ('of', 'IN'), ('fulfillment', 'NN'), ('.', '.')]
[('Government', 'NNP'), ('has', 'VBZ'), ('played', 'VBN'), ('a', 'DT'), ('role', 'NN'), ('.', '.')]
[('Wise', 'NNP'), ('policies', 'NNS'), (',', ','), ('such', 'JJ'), ('as', 'IN'), ('welfare', 'NN'), ('reform', 'NN'), ('and', 

# Chunking

In [7]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer # This is an unsupervised ML Tokenizer

text = word_tokenize("Hello welcome to the world of to learn Categorizing and POS Tagging with NLTK and Python.")

pos = nltk.pos_tag(text)

chunkGram = r""" Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""

chunkParser = nltk.RegexpParser(chunkGram)
chunked = chunkParser.parse(pos)

print(chunked)

(S
  (Chunk Hello/NNP welcome/NN)
  to/TO
  the/DT
  world/NN
  of/IN
  to/TO
  (Chunk learn/VB Categorizing/NNP)
  and/CC
  (Chunk POS/NNP Tagging/NNP)
  with/IN
  (Chunk NLTK/NNP)
  and/CC
  (Chunk Python/NNP)
  ./.)


# Chinking

In [28]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize # This is an unsupervised ML Tokenizer

text = word_tokenize("Hello welcome to the world of to learn Categorizing and POS Tagging with NLTK and Python.")

pos = nltk.pos_tag(text)

chunkGram = r""" Chunk: {<.*>+} 
                        }<VB.?|IN|DT>+{"""

chunkParser = nltk.RegexpParser(chunkGram)
chunked = chunkParser.parse(pos)

print(chunked)

(S
  (Chunk Hello/NNP welcome/NN to/TO)
  the/DT
  (Chunk world/NN)
  of/IN
  (Chunk to/TO)
  learn/VB
  (Chunk Categorizing/NNP and/CC POS/NNP Tagging/NNP)
  with/IN
  (Chunk NLTK/NNP and/CC Python/NNP ./.))


# Named Entity Recognition

In [None]:
# NE Type and Examples
# ORGANIZATION - Georgia-Pacific Corp., WHO
# PERSON - Eddy Bonte, President Obama
# LOCATION - Murray River, Mount Everest
# DATE - June, 2008-06-29
# TIME - two fifty a m, 1:30 p.m.
# MONEY - 175 million Canadian Dollars, GBP 10.40
# PERCENT - twenty pct, 18.75 %
# FACILITY - Washington Monument, Stonehenge
# GPE - South East Asia, Midlothian

In [35]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize # This is an unsupervised ML Tokenizer

text = word_tokenize("Hello, my name is Alex. WHO has changed the iPhone and you have a better camera and a better screen on July by 20%. This is very big for Apple which is starting it's production in May in India.")

pos = nltk.pos_tag(text)

namedEnt = nltk.ne_chunk(pos)

print(namedEnt)

(S
  (GPE Hello/NNP)
  ,/,
  my/PRP$
  name/NN
  is/VBZ
  (PERSON Alex/NNP)
  ./.
  (ORGANIZATION WHO/NNP)
  has/VBZ
  changed/VBN
  the/DT
  (ORGANIZATION iPhone/NN)
  and/CC
  you/PRP
  have/VBP
  a/DT
  better/JJR
  camera/NN
  and/CC
  a/DT
  better/JJR
  screen/NN
  on/IN
  July/NNP
  by/IN
  20/CD
  %/NN
  ./.
  This/DT
  is/VBZ
  very/RB
  big/JJ
  for/IN
  (PERSON Apple/NNP)
  which/WDT
  is/VBZ
  starting/VBG
  it/PRP
  's/VBZ
  production/NN
  in/IN
  May/NNP
  in/IN
  (GPE India/NNP)
  ./.)


# Lemmatizing

In [37]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))

cat
cactus
goose
rock
python


In [45]:
print(lemmatizer.lemmatize("better", pos = "a")) # a -> adjective, default is noun
print(lemmatizer.lemmatize("best", pos = "a"))
print(lemmatizer.lemmatize("ran", pos = "v"))

good
best
ran
run


# NLTK Corpus

In [1]:
import nltk
print(nltk.__file__)

/usr/local/lib/python3.8/site-packages/nltk/__init__.py


In [3]:
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

sample_text = gutenberg.raw("bible-kjv.txt")

tok = sent_tokenize(sample_text)

print(tok[5:15])

['1:5 And God called the light Day, and the darkness he called Night.', 'And the evening and the morning were the first day.', '1:6 And God said, Let there be a firmament in the midst of the waters,\nand let it divide the waters from the waters.', '1:7 And God made the firmament, and divided the waters which were\nunder the firmament from the waters which were above the firmament:\nand it was so.', '1:8 And God called the firmament Heaven.', 'And the evening and the\nmorning were the second day.', '1:9 And God said, Let the waters under the heaven be gathered together\nunto one place, and let the dry land appear: and it was so.', '1:10 And God called the dry land Earth; and the gathering together of\nthe waters called he Seas: and God saw that it was good.', '1:11 And God said, Let the earth bring forth grass, the herb yielding\nseed, and the fruit tree yielding fruit after his kind, whose seed is\nin itself, upon the earth: and it was so.', '1:12 And the earth brought forth grass, and

# Word Net

In [2]:
from nltk.corpus import wordnet

syns = wordnet.synsets("program")

print(syns)

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]


In [3]:
print(syns[0])

Synset('plan.n.01')


In [4]:
print(syns[0].lemmas()) # These are synonyms for a word

[Lemma('plan.n.01.plan'), Lemma('plan.n.01.program'), Lemma('plan.n.01.programme')]


In [5]:
print(syns[0].lemmas()[0].name())

plan


In [6]:
print(syns[0].definition())

a series of steps to be carried out or goals to be accomplished


In [7]:
print(syns[0].examples())

['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [9]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print("\n")
print(set(antonyms))

{'well', 'respectable', 'dependable', 'right', 'soundly', 'adept', 'goodness', 'unspoiled', 'proficient', 'trade_good', 'honorable', 'expert', 'effective', 'estimable', 'ripe', 'beneficial', 'in_force', 'serious', 'skilful', 'near', 'practiced', 'salutary', 'in_effect', 'honest', 'secure', 'skillful', 'upright', 'commodity', 'unspoilt', 'just', 'undecomposed', 'sound', 'thoroughly', 'dear', 'safe', 'full', 'good'}


{'evilness', 'evil', 'badness', 'ill', 'bad'}


In [10]:
w1 = wordnet.synset("ship.n.01") # n -> noun and 01 -> first one
w2 = wordnet.synset("boat.n.01")

print(w1.wup_similarity(w2)) # Compare similarity of word 1 and word 2, answer is in *100 %

0.9090909090909091


In [11]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("car.n.01")

print(w1.wup_similarity(w2))

0.6956521739130435


In [12]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("cat.n.01")

print(w1.wup_similarity(w2))

0.32
