## Tokenization
Splitting text into smaller units: sentences (`sent_tokenize`) or words and punctuation marks (`word_tokenize`).

In [1]:
# sent_tokenize splits text into sentences; word_tokenize splits it into
# word and punctuation tokens.
from nltk.tokenize import sent_tokenize, word_tokenize
# Sample input containing three sentences; used by the tokenization cells below.
text = "hi john, how are you? I will be travelling to your city. Let's catch up"

In [2]:
# Split `text` into a list of sentence strings (one entry per sentence).
sent_tokenize(text)

['hi john, how are you?',
 'I will be travelling to your city.',
 "Let's catch up"]

In [4]:
# Split `text` into word-level tokens. Punctuation marks and the clitic "'s"
# (from "Let's") come back as separate tokens.
word_tokenize(text)

['hi',
 'john',
 ',',
 'how',
 'are',
 'you',
 '?',
 'I',
 'will',
 'be',
 'travelling',
 'to',
 'your',
 'city',
 '.',
 'Let',
 "'s",
 'catch',
 'up']

## Stemming
Reducing a word to its root form (stem). Its major drawback is that the resulting stem is sometimes not a meaningful dictionary word; for example, see the stemming of "increases" below, which yields "increas".

In [22]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Three inflections of the same verb; each stem is printed on its own line.
print(*map(stemmer.stem, ("playing", "plays", "played")), sep="\n")

# Drawback: a stem is not guaranteed to be a dictionary word
# ("increases" -> "increas"). Use lemmatization when real words are needed.
print(stemmer.stem("increases"))

# Two more stems, printed side by side on a single line.
print(*map(stemmer.stem, ("raining", "decreases")))

play
play
play
increas
rain decreas


## Lemmatization

In [44]:
# Lemmatization maps a word to a dictionary form (lemma) using WordNet,
# so "increases" becomes "increase" rather than a truncated stem.
from nltk.stem import WordNetLemmatizer
lemm = WordNetLemmatizer()

# No pos argument: the word is looked up as a noun (NLTK's default).
print(lemm.lemmatize("increases"))
# The part of speech matters: with pos="v" the verb form "running" reduces to "run".
print(lemm.lemmatize("running", pos="v"))  # pass the POS tag to lemmatize verbs correctly
# Without pos, "running" is treated as a noun and comes back unchanged.
print(lemm.lemmatize("running"))

increase
run
running


## pos_tag

In [14]:
# pos_tag assigns a part-of-speech tag to every token in a list.
from nltk import pos_tag
# Same sample text as in the tokenization section, redefined so this section
# does not depend on the earlier value of `text`.
text = "hi john, how are you? I will be travelling to your city. Let's catch up"
# pos_tag expects tokens, not a raw string; word_tokenize is the function
# imported in the tokenization section.
tokens = word_tokenize(text)

In [16]:
# Tag every token; the result is a list of (token, tag) tuples.
# NOTE(review): in the recorded output the "'s" of "Let's" is tagged 'POS'
# (possessive ending) although it is a contraction of "us" here -- tagger
# output is not always correct.
pos_tag(tokens)

[('hi', 'NN'),
 ('john', 'NN'),
 (',', ','),
 ('how', 'WRB'),
 ('are', 'VBP'),
 ('you', 'PRP'),
 ('?', '.'),
 ('I', 'PRP'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('travelling', 'VBG'),
 ('to', 'TO'),
 ('your', 'PRP$'),
 ('city', 'NN'),
 ('.', '.'),
 ('Let', 'VB'),
 ("'s", 'POS'),
 ('catch', 'VB'),
 ('up', 'RP')]

## Find synonyms

In [17]:
# WordNet groups words into synsets (sets of synonyms sharing one meaning).
from nltk.corpus import wordnet
# List every synset that contains 'good'. Each entry is a Synset object
# (one word sense), not a synonym string; the synonym words of a sense can be
# read from the synset itself, e.g. via its lemma_names() method.
wordnet.synsets('good')

[Synset('good.n.01'),
 Synset('good.n.02'),
 Synset('good.n.03'),
 Synset('commodity.n.01'),
 Synset('good.a.01'),
 Synset('full.s.06'),
 Synset('good.a.03'),
 Synset('estimable.s.02'),
 Synset('beneficial.s.01'),
 Synset('good.s.06'),
 Synset('good.s.07'),
 Synset('adept.s.01'),
 Synset('good.s.09'),
 Synset('dear.s.02'),
 Synset('dependable.s.04'),
 Synset('good.s.12'),
 Synset('good.s.13'),
 Synset('effective.s.04'),
 Synset('good.s.15'),
 Synset('good.s.16'),
 Synset('good.s.17'),
 Synset('good.s.18'),
 Synset('good.s.19'),
 Synset('good.s.20'),
 Synset('good.s.21'),
 Synset('well.r.01'),
 Synset('thoroughly.r.02')]

## N-grams

In [19]:
# ngrams yields every run of n consecutive items from a sequence as a tuple.
from nltk import ngrams
# Re-imported so this cell can run on its own; only word_tokenize is used here.
from nltk.tokenize import sent_tokenize, word_tokenize

sentence = "I love to play football"
n = 2  # n-gram size: 2 produces bigrams
# Tokenize first so the n-grams are built over words, then print one tuple per line.
for gram in ngrams(word_tokenize(sentence), n):
    print(gram)

('I', 'love')
('love', 'to')
('to', 'play')
('play', 'football')


In [40]:
# Build the bigrams of `sentence` again, then print all of them on one line:
# each tuple is converted with str() and the pieces are joined by single spaces.
grams = ngrams(word_tokenize(sentence), n=2)
print(" ".join(map(str, grams)))

('I', 'love') ('love', 'to') ('to', 'play') ('play', 'football')
