# TASKS IN NLP

In [67]:
# One-time setup: download the NLTK data packages the later cells rely on
# (tokenizer models, POS tagger model, stop-word list, WordNet).
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Needed by stopwords.words('english') in Task 2; without it that call raises LookupError.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LOCALGHOST\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\LOCALGHOST\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LOCALGHOST\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

NLTK consists of different sub-modules, and we will be using functions within these sub-modules to perform these tasks.
We will import and use these sub-modules and functions as and when required.

## Task - 1

Take a piece of text and break it into sentences or words.

In [10]:
# Sample text used by the tokenizing and stop-word cells: two short sentences
# (the second one has no closing period).
text = "Mary had a little lamb. Her fleece was white as snow"

In [11]:
# For this, import the word_tokenize and sent_tokenize functions from the nltk.tokenize sub-module

from nltk.tokenize import word_tokenize,sent_tokenize

In [12]:
# Apply sent_tokenize to the text to split it into a list of sentence strings.
# Ending the cell with the bare name displays the result.

sents = sent_tokenize(text)
sents

['Mary had a little lamb.', 'Her fleece was white as snow']

In [15]:
# Apply word_tokenize to the text to split it into one flat list of tokens;
# punctuation such as '.' comes back as a token of its own.
words = word_tokenize(text)
words

['Mary',
 'had',
 'a',
 'little',
 'lamb',
 '.',
 'Her',
 'fleece',
 'was',
 'white',
 'as',
 'snow']

In [17]:
# In the above example, the whole text was broken into a single list of words.
# To keep sentence boundaries instead, tokenize each sentence in `sents`
# separately, which gives one list of tokens per sentence.

words_sn = list(map(word_tokenize, sents))
words_sn

[['Mary', 'had', 'a', 'little', 'lamb', '.'],
 ['Her', 'fleece', 'was', 'white', 'as', 'snow']]

In [18]:
# In the above cell we got two lists of tokens, and the punctuation mark ('.') is also treated as a separate token.

## Task -2 

: Remove Stop Words

In [19]:
# for this , nltk has some built-in linguistic resources

In [25]:
# Build a single set of tokens to discard:
#   - the English stop-word list that ships with nltk
#   - every punctuation character, since the tokenizer emits these as tokens
# Both collections are combined into one set.

from nltk.corpus import stopwords
from string import punctuation
customStopWords = set(stopwords.words('english')) | set(punctuation)

In [27]:
type(customStopWords)  # confirm the combined collection is a set

set

In [30]:
# Keep only the tokens of the text that are NOT in customStopWords.
# Note: the membership test is case-sensitive, so a capitalised token such as
# 'Her' is kept (see the output) because the stop-word list is lower-case.

wordsWOstopwords = [
    token
    for token in word_tokenize(text)
    if token not in customStopWords
]
print(wordsWOstopwords)

['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']


## Task -3

Identify N-Grams : words that occur consecutively 
*** 
Bigrams : pair of words that occur consecutively   
For this example, we will construct bigrams from a list of words and count the frequency of occurrence of each bigram within that list of words.

In [33]:
# For this, we use the nltk module `collocations`.
# Collocations are words that are located together (co-located).
# BigramCollocationFinder.from_words() builds the bigrams from a list of words
# and returns a finder object; inside the finder object we have all the bigrams
# and their frequencies (finder.ngram_fd).
# Names are imported explicitly so it is clear where each one comes from; the
# Trigram/Quadgram finders are imported here for the cells that follow.

from nltk.collocations import (
    BigramAssocMeasures,
    BigramCollocationFinder,
    QuadgramCollocationFinder,
    TrigramCollocationFinder,
)
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(wordsWOstopwords)

In [34]:
# Show the items held by the finder: (bigram, count) pairs, sorted for display
sorted(finder.ngram_fd.items())

[(('Her', 'fleece'), 1),
 (('Mary', 'little'), 1),
 (('fleece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('white', 'snow'), 1)]

In [35]:
#collocations also has a Trigram finder, which we can use in a similar fashion

In [36]:
# Same approach for three-word sequences. The trigram finder gets its own name
# so the bigram `finder` built earlier is not overwritten; otherwise re-running
# the bigram display cell after this one would show trigrams instead.
trigram_measures = nltk.collocations.TrigramAssocMeasures()
trigram_finder = TrigramCollocationFinder.from_words(wordsWOstopwords)
sorted(trigram_finder.ngram_fd.items())

[(('Her', 'fleece', 'white'), 1),
 (('Mary', 'little', 'lamb'), 1),
 (('fleece', 'white', 'snow'), 1),
 (('lamb', 'Her', 'fleece'), 1),
 (('little', 'lamb', 'Her'), 1)]

In [37]:
# it also has Quadgram finder - check it out

## Task - 4

- Stemming   
- Parts of speech Tagging

In [49]:
# New sample sentence containing 'close' in several forms (closed, closing, close).
# Note: this rebinds `text`, replacing the sentence used by the earlier cells.
text = "Mary closed on closing night when she was in the mood to close"

In [50]:
# Above, the word 'close' appears in different morphological forms.
# Stemming: identify the root of each word.
# Why? Otherwise the forms would be treated as different words while counting frequency.
# nltk has several algorithms for stemming.

In [51]:
# From the stem module of the nltk library, import the LancasterStemmer class
# (located inside the lancaster sub-module), create one stemmer instance and
# stem every token of the text with it.
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
stemmedwords = list(map(st.stem, word_tokenize(text)))

In [52]:
type(st)  # LancasterStemmer is a class, st is an instance of it, and stem is a method of that instance

nltk.stem.lancaster.LancasterStemmer

In [56]:
# Every form of 'close' is reduced to the same stem (see 'clos' in the output)
print(stemmedwords)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos']


In [57]:
# Now let's take each of these words and tag it with its relevant part-of-speech tag
# nltk has a built-in function for this called pos_tag

In [62]:
# Tokenize the text and tag every token; the result is a list of (token, tag) pairs
nltk.pos_tag(word_tokenize(text))

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB')]

In [65]:
# check out the list of tag acronyms at the end of this notebook

## Task - 5

Word Disambiguation

- Identify the meaning of a word w.r.t. its context

In [None]:
# For this, we can make use of a resource called WordNet in the corpus module of the nltk library
# WordNet is a lexicon/dictionary
# the basic entity within WordNet is the synset
# A synset represents one single definition of a word

In [69]:
from nltk.corpus import wordnet as wn
# Print every synset WordNet has for the word 'bass', followed by its definition
for ss in wn.synsets('bass'):
    print(ss ,ss.definition())

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


Now, with two different connotations (meanings in context), let's check whether nltk is able to disambiguate between them.

In [70]:
# For this, let's import the 'Lesk' algorithm from the wsd module in nltk

In [71]:
from nltk.wsd import lesk

`lesk` is a function that takes a list of words representing the context, together with the word whose meaning we want, and returns the synset it selects for that word.

In [75]:
# Musical context: tokenize the sentence and ask Lesk for the sense of 'bass'.
# NOTE(review): the context sentence spells the word 'base', not 'bass' — confirm this is intended.
sense1 = lesk(word_tokenize("Sing in a lower tone,along with the base"),'bass')
print(sense1, sense1.definition())

Synset('bass.n.07') the member with the lowest range of a family of musical instruments


In [76]:
# Fishing context: the same word 'bass' now resolves to a fish-related sense (see output)
sense2 = lesk(word_tokenize("This sea bass was really hard to catch"),'bass')
print(sense2, sense2.definition())

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae


*** 

END NOTES : meaning of the part-of-speech tag acronyms returned by `nltk.pos_tag`

In [77]:
# CC coordinating conjunction
# CD cardinal digit
# DT determiner
# EX existential there (like: “there is” … think of it like “there exists”)
# FW foreign word
# IN preposition/subordinating conjunction
# JJ adjective ‘big’
# JJR adjective, comparative ‘bigger’
# JJS adjective, superlative ‘biggest’
# LS list marker 1)
# MD modal could, will
# NN noun, singular ‘desk’
# NNS noun plural ‘desks’
# NNP proper noun, singular ‘Harrison’
# NNPS proper noun, plural ‘Americans’
# PDT predeterminer ‘all the kids’
# POS possessive ending parent’s
# PRP personal pronoun I, he, she
# PRP$ possessive pronoun my, his, hers
# RB adverb very, silently,
# RBR adverb, comparative better
# RBS adverb, superlative best
# RP particle give up
# TO to, as in go ‘to’ the store
# UH interjection, errrrrrrrm
# VB verb, base form take
# VBD verb, past tense took
# VBG verb, gerund/present participle taking
# VBN verb, past participle taken
# VBP verb, sing. present, non-3rd person take
# VBZ verb, 3rd person sing. present takes
# WDT wh-determiner which
# WP wh-pronoun who, what
# WP$ possessive wh-pronoun whose
# WRB wh-adverb where, when