In [2]:
import nltk 

In [3]:
# Tokenizing text

In [4]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [5]:
# Sample two-sentence text used by the tokenizing, stop-word and bigram cells below.
# NOTE(review): "sentance" is misspelled in the first sentence; the recorded outputs
# in this notebook contain that exact spelling, so re-run the cells if it is corrected.
text = "This is an amazing sentance. This is another amazing sentence"

In [10]:
# Split the sample text into a list of sentences.
# English is sent_tokenize's default language; it is spelled out here for clarity.
sents = sent_tokenize(text, language='english')
sents

['This is an amazing sentance.', 'This is another amazing sentence']

In [14]:
# Split the sample text into word-level tokens.
# Punctuation such as '.' comes back as a token of its own.
words = word_tokenize(text, language='english')
words

['This',
 'is',
 'an',
 'amazing',
 'sentance',
 '.',
 'This',
 'is',
 'another',
 'amazing',
 'sentence']

In [15]:
# Stop words
# Stop words are words that don't add much meaning to a sentence or text, e.g. 'are', 'is', 'as'.

from nltk.corpus import stopwords
from string import punctuation

In [18]:
# Everything to filter out: NLTK's English stop words plus every
# character from string.punctuation, merged into one set for fast lookup.
customStopWords = set(stopwords.words('english')) | set(punctuation)

In [20]:
# Keep only the tokens that are not in the custom stop-word set.
# NOTE(review): the membership test is case-sensitive; that is presumably why the
# capitalised 'This' still appears in the recorded output. Compare on token.lower()
# if capitalised stop words should be dropped too (and re-run the cells that follow).
words_with_not_stop_words = list(
    filter(lambda token: token not in customStopWords, word_tokenize(text))
)
words_with_not_stop_words

['This', 'amazing', 'sentance', 'This', 'another', 'amazing', 'sentence']

In [25]:
# Bigrams and N-grams
# Count how often each pair of adjacent words (a bigram) occurs in the filtered tokens.
# The collocation classes live in `nltk.collocations` (not `nltk.collections`);
# import them by name instead of with `*` so it is clear where each name comes from.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# Association measures for scoring bigrams; created here but not used by this cell.
bigrams_measure = BigramAssocMeasures()
# Build the bigram frequency table from the stop-word-filtered token list.
finder = BigramCollocationFinder.from_words(words_with_not_stop_words)
sorted(finder.ngram_fd.items())  # (bigram, count) pairs in sorted order

[(('This', 'amazing'), 1),
 (('This', 'another'), 1),
 (('amazing', 'sentance'), 1),
 (('amazing', 'sentence'), 1),
 (('another', 'amazing'), 1),
 (('sentance', 'This'), 1)]

In [26]:
# Stemming and part-of-speech tagging
# using the Lancaster stemming algorithm
from nltk.stem.lancaster import LancasterStemmer


In [29]:
# Sample sentence shared by the stemming and part-of-speech tagging cells below.
cool_text = "you are cool, but are you cooler than the kid"

In [30]:
# Reduce every token of the sample sentence to its Lancaster stem
# (e.g. the recorded output shows 'cooler' -> 'cool' and 'are' -> 'ar').
ls = LancasterStemmer()
stemmed_words = list(map(ls.stem, word_tokenize(cool_text)))
print(stemmed_words)

['you', 'ar', 'cool', ',', 'but', 'ar', 'you', 'cool', 'than', 'the', 'kid']


In [33]:
# Tag each token of the sample sentence with its part of speech.
# The result is a list of (token, tag) pairs, as shown in the recorded output.
nltk.pos_tag(word_tokenize(cool_text))

[('you', 'PRP'),
 ('are', 'VBP'),
 ('cool', 'JJ'),
 (',', ','),
 ('but', 'CC'),
 ('are', 'VBP'),
 ('you', 'PRP'),
 ('cooler', 'JJR'),
 ('than', 'IN'),
 ('the', 'DT'),
 ('kid', 'NN')]

In [41]:
# Disambiguating
# This is the process of working out the meaning of a word from the context in which it is used.
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk

In [40]:
# List every WordNet sense (synset) of 'cold' next to its definition.
for ss in wn.synsets('cold'):
    print(f"{ss} {ss.definition()}")

Synset('cold.n.01') a mild viral infection involving the nose and respiratory passages (but not the lungs)
Synset('coldness.n.03') the absence of heat
Synset('cold.n.03') the sensation produced by low temperatures
Synset('cold.a.01') having a low or inadequate temperature or feeling a sensation of coldness or having been made cold by e.g. ice or refrigeration
Synset('cold.a.02') extended meanings; especially of psychological coldness; without human warmth or emotion
Synset('cold.s.03') having lost freshness through passage of time
Synset('cold.s.04') (color) giving no sensation of warmth
Synset('cold.s.05') marked by errorless familiarity
Synset('cold.s.06') lacking originality or spontaneity; no longer new
Synset('cold.s.07') so intense as to be almost uncontrollable
Synset('cold.s.08') sexually unresponsive
Synset('cold.s.09') without compunction or human feeling
Synset('cold.s.10') feeling or showing no enthusiasm
Synset('cold.s.11') unconscious from a blow or shock or intoxication


In [46]:
# Ask the Lesk algorithm which sense of 'cold' best fits this short context.
s1 = lesk(word_tokenize("I am cold"), 'cold')
# lesk() returns None when it has no candidate synset for the word,
# so guard before calling .definition() to avoid an AttributeError.
if s1 is None:
    print("No sense found for 'cold'")
else:
    print(s1, s1.definition())

Synset('cold.a.01') having a low or inadequate temperature or feeling a sensation of coldness or having been made cold by e.g. ice or refrigeration


In [47]:
# Ask the Lesk algorithm which sense of 'cold' best fits a context about a person's attitude.
# NOTE(review): the recorded output picks cold.s.07 ("so intense as to be almost
# uncontrollable") rather than an emotional-coldness sense. Presumably this is because
# Lesk scores senses by word overlap with each definition and the context shares "so"
# with that one - confirm, and consider passing pos= to narrow the candidates.
s2 = lesk(word_tokenize("she is so cold towards me"), 'cold')
# lesk() returns None when it has no candidate synset for the word,
# so guard before calling .definition() to avoid an AttributeError.
if s2 is None:
    print("No sense found for 'cold'")
else:
    print(s2, s2.definition())

Synset('cold.s.07') so intense as to be almost uncontrollable
