# Simple POS Tagger

In [2]:
import nltk
from nltk import pos_tag
from nltk import word_tokenize
# Tokenize a sentence containing awkward tokens (an ordinal, "March'2021",
# the abbreviation "St." and a possessive) so their handling can be inspected.
# Note that the result is a list of tokens, despite the name `sample_text`.
sample_text = word_tokenize("The classes are reopening from 15th March'2021 in St. Joseph's College")
sample_text

['The',
 'classes',
 'are',
 'reopening',
 'from',
 '15th',
 "March'2021",
 'in',
 'St.',
 'Joseph',
 "'s",
 'College']

In [3]:
# Tag the token list; the result is a list of (token, tag) pairs.
pos_tag(sample_text)

[('The', 'DT'),
 ('classes', 'NNS'),
 ('are', 'VBP'),
 ('reopening', 'VBG'),
 ('from', 'IN'),
 ('15th', 'CD'),
 ("March'2021", 'NNP'),
 ('in', 'IN'),
 ('St.', 'NNP'),
 ('Joseph', 'NNP'),
 ("'s", 'POS'),
 ('College', 'NNP')]

In [6]:
import nltk
# Fetch the 'tagsets' help package; the nltk.help tag lookups further down rely on it.
nltk.download('tagsets')


[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping help\tagsets.zip.


True

In [7]:
# Print the Penn Treebank definition and example words for VBG,
# the tag the tagger assigned to 'reopening'.
nltk.help.upenn_tagset('VBG')

VBG: verb, present participle or gerund
    telegraphing stirring focusing angering judging stalling lactating
    hankerin' alleging veering capping approaching traveling besieging
    encrypting interrupting erasing wincing ...


In [9]:
import nltk
# Download the Brown corpus (nltk.corpus.brown) used by the following cells.
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


True

In [10]:
# Wrap the whole Brown corpus in an nltk.Text, normalising every token to lower case first.
text = nltk.Text(token.lower() for token in nltk.corpus.brown.words())
text

<Text: the fulton county grand jury said friday an...>

In [11]:
# Print words that occur in contexts similar to 'boy' in the Brown text.
text.similar('boy')

man time day way girl year house people world city family state room
country car woman program church government job


In [12]:
# Same lookup for 'test'; the printed neighbours are looser than for 'boy'
# (function words such as 'is', 'in' and 'and' appear in the list).
text.similar('test')

time man year state work place one day case way world is question
group country war start moment in and


# Representing Tagged Words/Tokens

In [13]:
# Parse a 'word/TAG' string into a (word, tag) tuple.
var1 = nltk.tag.str2tuple('SJCC/NNP')

In [14]:
# Display the parsed (word, tag) tuple.
var1

('SJCC', 'NNP')

In [15]:
# Index 1 holds the tag.
var1[1]

'NNP'

In [17]:
# Index 0 holds the word.
var1[0]

'SJCC'

In [18]:
# Hand-tagged version of the sample sentence in 'word/TAG' form. The newlines and
# indentation are part of the string; the next cell relies on whitespace splitting
# to separate the tokens.
sentence = '''
            The/DT classes/NNS are/VBP reopening/VBG from/IN 15th/CD March'2021/NNP 
            in/IN St./NNP Joseph/NNP 's/POS College/NNP
            '''
sentence

"\n            The/DT classes/NNS are/VBP reopening/VBG from/IN 15th/CD March'2021/NNP \n            in/IN St./NNP Joseph/NNP 's/POS College/NNP\n            "

In [19]:
# str.split() with no arguments breaks the string on runs of whitespace, giving
# one 'word/TAG' token per item (it does not split into sentences); each token is
# then parsed into a (word, tag) tuple.
[nltk.tag.str2tuple(tagged_token) for tagged_token in sentence.split()]

[('The', 'DT'),
 ('classes', 'NNS'),
 ('are', 'VBP'),
 ('reopening', 'VBG'),
 ('from', 'IN'),
 ('15th', 'CD'),
 ("March'2021", 'NNP'),
 ('in', 'IN'),
 ('St.', 'NNP'),
 ('Joseph', 'NNP'),
 ("'s", 'POS'),
 ('College', 'NNP')]

In [20]:
# Brown's own annotations: the tags shown (e.g. AT, NP-TL) differ from the
# Penn Treebank tags produced by pos_tag earlier.
nltk.corpus.brown.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [21]:
# 'AT' comes from the Brown corpus output, so this Penn Treebank lookup finds no match.
# NOTE(review): if the goal is to read the definition of 'AT', nltk.help.brown_tagset('AT')
# is presumably the intended call — confirm whether the failed lookup is deliberate.
nltk.help.upenn_tagset('AT')

No matching tags found.


In [23]:
import nltk
# Download the 'indian' corpus package (exposed as nltk.corpus.indian).
nltk.download('indian')


[nltk_data] Downloading package indian to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\indian.zip.


True

In [24]:
# Tagged words from the 'indian' corpus; the same (word, tag) pair structure
# applies to non-English text.
nltk.corpus.indian.tagged_words()

[('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM'), ...]

In [25]:
# Tagged words from the Treebank sample, shown with its native tags (e.g. NNP).
# NOTE(review): no nltk.download('treebank') appears in the visible cells —
# presumably the corpus was already installed; confirm for a fresh environment.
nltk.corpus.treebank.tagged_words()

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...]

In [27]:
import nltk
# Download the 'universal_tagset' package used by the tagset='universal' calls below.
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


True

In [28]:
# Same Treebank words with tags mapped to the coarser universal tagset
# (e.g. NNP becomes NOUN and ',' becomes '.').
nltk.corpus.treebank.tagged_words(tagset = 'universal')

[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ...]

# Nouns, Verbs, Adverbs and Adjectives Categorization

In [29]:
from nltk.corpus import brown
# Tagged words from the Brown 'news' category, with tags mapped to the universal tagset.
brown_news_tagged = brown.tagged_words(categories = 'news', tagset = 'universal')
brown_news_tagged

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

In [30]:
# Pair every tagged word with its successor: ((word1, tag1), (word2, tag2)).
# nltk.bigrams returns a generator, so it can be iterated only once;
# re-run this cell to get a fresh one before iterating again.
word_tag_pairs = nltk.bigrams(brown_news_tagged)
word_tag_pairs

<generator object bigrams at 0x000001B49079EF90>

# Grammar-based expectation: nouns usually follow a determiner or an adjective, or act as the subject or object of a verb

In [31]:
# Collect the tag of the word immediately preceding every NOUN.
# The bigrams are rebuilt here rather than read from the `word_tag_pairs` generator:
# a generator is exhausted after one pass, so re-running this cell would otherwise
# silently produce an empty list.
n = [prev[1] for (prev, cur) in nltk.bigrams(brown_news_tagged) if cur[1] == 'NOUN']

In [32]:
# Tally how often each tag occurs directly before a noun.
fdist = nltk.FreqDist(n)

In [33]:
# Tags that precede nouns, ordered from most to least frequent (counts dropped).
[preceding_tag for preceding_tag, _count in fdist.most_common()]

['NOUN',
 'DET',
 'ADJ',
 'ADP',
 '.',
 'VERB',
 'CONJ',
 'NUM',
 'ADV',
 'PRT',
 'PRON',
 'X']

# Most common verbs in the Treebank corpus

In [34]:
# Treebank words with universal tags; input for the frequency count below.
w = nltk.corpus.treebank.tagged_words(tagset = 'universal')
w

[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ...]

In [35]:
# Frequency of each distinct (word, tag) pair — the keys are pairs, not bare tags.
freq_tagsets = nltk.FreqDist(w)
freq_tagsets

FreqDist({(',', '.'): 4885, ('the', 'DET'): 4038, ('.', '.'): 3828, ('of', 'ADP'): 2319, ('to', 'PRT'): 2161, ('a', 'DET'): 1874, ('in', 'ADP'): 1554, ('and', 'CONJ'): 1505, ('*-1', 'X'): 1123, ('0', 'X'): 1099, ...})

In [36]:
# Walk the (word, tag) pairs from most to least frequent and keep the words tagged VERB.
[pair[0] for pair, _count in freq_tagsets.most_common() if pair[1] == 'VERB']

['is',
 'said',
 'was',
 'are',
 'be',
 'has',
 'have',
 'will',
 'says',
 'would',
 'were',
 'had',
 'been',
 'could',
 "'s",
 'can',
 'do',
 'say',
 'make',
 'may',
 'did',
 'rose',
 'made',
 'does',
 'expected',
 'buy',
 'take',
 'get',
 'might',
 'sell',
 'added',
 'sold',
 'help',
 'including',
 'should',
 'reported',
 'according',
 'pay',
 'compared',
 'being',
 'fell',
 'began',
 'based',
 'used',
 'closed',
 "'re",
 'want',
 'see',
 'took',
 'yield',
 'offered',
 'set',
 'priced',
 'approved',
 'come',
 'noted',
 'cut',
 'ended',
 'found',
 'increased',
 'become',
 'think',
 'named',
 'go',
 'trying',
 'proposed',
 'received',
 'growing',
 'declined',
 'held',
 'give',
 'came',
 'use',
 'put',
 'making',
 'continue',
 'raise',
 'estimated',
 'called',
 'paid',
 'designed',
 'going',
 'expects',
 'seeking',
 'must',
 'plans',
 'wo',
 'increasing',
 'saying',
 'got',
 'owns',
 'trading',
 'acquired',
 'gained',
 'fined',
 'reached',
 'holding',
 'announced',
 'filed',
 'became',
