# **EXP 02: Introduction to WordNet**

## **WordNet**

In [1]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /root/nltk_data...


## **Synsets and Lemmas**

### **Getting Synsets:**

In [2]:
syn = wn.synsets('room')
print(syn)

[Synset('room.n.01'), Synset('room.n.02'), Synset('room.n.03'), Synset('room.n.04'), Synset('board.v.02')]


### **Getting definition of a Synset**

In [5]:
syn_arr = wn.synsets('room')
syn_arr[0].definition()

'an area within a building enclosed by walls and floor and ceiling'

In [6]:
# Another method
wn.synset('room.n.01').definition()

'an area within a building enclosed by walls and floor and ceiling'

### **Getting all Lemmas of a Synset**

In [7]:
print(syn_arr[1].lemma_names())

['room', 'way', 'elbow_room']


In [8]:
# Another method
print(wn.synset('room.n.02').lemma_names())

['room', 'way', 'elbow_room']


## **Hyponyms**

In [9]:
print(wn.synset('calendar.n.01').hyponyms())

[Synset('lunisolar_calendar.n.01'), Synset('lunar_calendar.n.01'), Synset('solar_calendar.n.01')]


## **Hypernyms**

In [10]:
print(wn.synset('solar_calendar.n.01').hypernyms())

[Synset('calendar.n.01')]


# **EXP 03: Parts-of-Speech of Tagging**

## **Using a Tagger**

In [21]:
import nltk
nltk.download('pnkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [23]:
text = nltk.word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

## **Tagged Corpora**

### **Representing Tagged Tokens:**

In [24]:
tagged_token = nltk.tag.str2tuple('fly/NN')
tagged_token

('fly', 'NN')

In [25]:
tagged_token[0]

'fly'

In [26]:
tagged_token[1]

'NN'

### **Reading Tagged Corpora**

In [27]:
nltk.download('brown')
nltk.corpus.brown.tagged_words()

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [28]:
nltk.download('universal_tagset')
nltk.corpus.brown.tagged_words(tagset = 'universal')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


[('The', 'DET'), ('Fulton', 'NOUN'), ...]

### **For another treebank corpus**

In [29]:
nltk.download('treebank')
nltk.corpus.treebank.tagged_words(tagset = 'universal')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ...]

###**For Indian Languages:**

In [30]:
nltk.download('indian')
nltk.corpus.indian.tagged_words()

[nltk_data] Downloading package indian to /root/nltk_data...
[nltk_data]   Unzipping corpora/indian.zip.


[('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM'), ...]

In [31]:
# pos tagger for Hindi corpus
nltk.corpus.indian.tagged_words('hindi.pos')

[('पूर्ण', 'JJ'), ('प्रतिबंध', 'NN'), ('हटाओ', 'VFM'), ...]

In [32]:
# pos tagger for Marathi corpus
nltk.corpus.indian.tagged_words('marathi.pos')

[("''", 'SYM'), ('सनातनवाद्यांनी', 'NN'), ('व', 'CC'), ...]

###**To find tags that are most common in the news category of the Brown Corpus:**

In [33]:
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories = 'news', tagset = 'universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd

FreqDist({'NOUN': 30654, 'VERB': 14399, 'ADP': 12355, '.': 11928, 'DET': 11389, 'ADJ': 6706, 'ADV': 3349, 'CONJ': 2717, 'PRON': 2535, 'PRT': 2264, ...})

## **Automatic Tagging**

In [43]:
nltk.download('brown')
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories = 'news')
print(brown_tagged_sents)

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [44]:
# For simple sentences of brown corpus
brown_sents = brown.sents(categories = 'news')
print(brown_sents)

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]


## **The Default Tagger**

In [41]:
tags = [tag for (word, tag) in brown.tagged_words(categories = 'news')]
nltk.FreqDist(tags).max()

'NN'

## **Tagger that tags everything as NN**

In [45]:
nltk.download('punktr')
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(tokens)

[nltk_data] Error loading punktr: Package 'punktr' not found in index


[('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('green', 'NN'),
 ('eggs', 'NN'),
 ('and', 'NN'),
 ('ham', 'NN'),
 (',', 'NN'),
 ('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('them', 'NN'),
 ('Sam', 'NN'),
 ('I', 'NN'),
 ('am', 'NN'),
 ('!', 'NN')]

In [46]:
# This method performs rather poorly
default_tagger.accuracy(brown_tagged_sents)

0.13089484257215028

## **The Regular Expression Tagger**

In [47]:
pattern = [
    (r'.*ing$', 'VBG'),
    (r'.*ed$', 'VBD'),
    (r'.*es$', 'VBZ'),
    (r'.*ould$', 'MD'),
    (r'.*\'s$', 'NN$'),
    (r'.*s$', 'NNS'),
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
    (r'.*', 'NN')
]

regexp_tagger = nltk.RegexpTagger(pattern)
regexp_tagger.tag(brown_sents[3])

[('``', 'NN'),
 ('Only', 'NN'),
 ('a', 'NN'),
 ('relative', 'NN'),
 ('handful', 'NN'),
 ('of', 'NN'),
 ('such', 'NN'),
 ('reports', 'NNS'),
 ('was', 'NNS'),
 ('received', 'VBD'),
 ("''", 'NN'),
 (',', 'NN'),
 ('the', 'NN'),
 ('jury', 'NN'),
 ('said', 'NN'),
 (',', 'NN'),
 ('``', 'NN'),
 ('considering', 'VBG'),
 ('the', 'NN'),
 ('widespread', 'NN'),
 ('interest', 'NN'),
 ('in', 'NN'),
 ('the', 'NN'),
 ('election', 'NN'),
 (',', 'NN'),
 ('the', 'NN'),
 ('number', 'NN'),
 ('of', 'NN'),
 ('voters', 'NNS'),
 ('and', 'NN'),
 ('the', 'NN'),
 ('size', 'NN'),
 ('of', 'NN'),
 ('this', 'NNS'),
 ('city', 'NN'),
 ("''", 'NN'),
 ('.', 'NN')]

In [48]:
regexp_tagger.accuracy(brown_tagged_sents)

0.20326391789486245

##**N-Gram Tagging**

In [50]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'QL'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

In [51]:
unigram_tagger.accuracy(brown_tagged_sents)

0.9349006503968017

## **Separating the Training and Testing data**

In [52]:
size = int(len(brown_tagged_sents) * 0.9)
size

4160

In [53]:
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.accuracy(test_sents)

0.8121200039868434