In [None]:
#The process of classifying words into their parts of speech and labeling them accordingly is known as part-of-speech tagging, or simply POS-tagging.

#steps
#Tokenize text (word_tokenize)
#apply pos_tag to above step that is nltk.pos_tag(tokenize_text)
#NLTK POS tagger is used to assign grammatical information of each word of the sentence. 

# Applications: information retrieval, parsing, text-to-speech (TTS) applications, information extraction, and linguistic research on corpora.

# Techniques
# 1. Rule-based POS tagging: rule-based models apply a set of handwritten rules and use contextual information to assign POS tags to words.
# 2. Transformation-based tagging: transformation-based approaches use a pre-defined set of handcrafted rules as well as automatically induced rules that are generated during training.
# 3. Deep learning models: various deep learning models, such as Meta-BiLSTM, have been used for POS tagging.
# 4. Stochastic (probabilistic) tagging: a stochastic approach relies on frequency, probability, or statistics.

# Chunking (shallow parsing): the process of taking small pieces of information and grouping them into larger units. The primary use of chunking is to form groups of "noun phrases".
# It adds structure to the sentence by combining POS tagging with regular expressions.
# The resulting groups of words are called "chunks".
# Chunking is used for entity detection.

In [None]:
import nltk
# Download the 'averaged_perceptron_tagger' resource into the local NLTK data
# directory (presumably the model behind the pos_tag calls used later in this
# notebook -- confirm). When the package is already present NLTK only reports
# that it is up to date.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
from nltk import RegexpParser, pos_tag

# Chunk grammar with a single rule labelled "mychunk": a run of tags matching
# NN.?, then a run matching VBD.?, then a run matching JJ.?, optionally closed
# by one CC tag.
patterns = "mychunk:{<NN.?>*<VBD.?>*<JJ.?>*<CC>?}"
chunker = RegexpParser(patterns)

# Plain whitespace split: punctuation stays glued to the preceding word
# (e.g. 'and.' and 'company.'), which affects the tags assigned below.
text = "Boult Audio headphone had many defects and. Highly dissatisfied with the company.".split()
print("After Split:", text)

# Attach a part-of-speech tag to every token.
tokens_tag = pos_tag(text)
print("After Token:", tokens_tag)

# Show the parser that was compiled from the grammar.
print("After Regex:", chunker)

# Group the tagged tokens into chunks according to the grammar.
output = chunker.parse(tokens_tag)
print("After Chunking", output)

After Split: ['Boult', 'Audio', 'headphone', 'had', 'many', 'defects', 'and.', 'Highly', 'dissatisfied', 'with', 'the', 'company.']
After Token: [('Boult', 'NNP'), ('Audio', 'NNP'), ('headphone', 'NN'), ('had', 'VBD'), ('many', 'JJ'), ('defects', 'NNS'), ('and.', 'VBP'), ('Highly', 'NNP'), ('dissatisfied', 'VBD'), ('with', 'IN'), ('the', 'DT'), ('company.', 'NN')]
After Regex: chunk.RegexpParser with 1 stages:
RegexpChunkParser with 1 rules:
       <ChunkRule: '<NN.?>*<VBD.?>*<JJ.?>*<CC>?'>
After Chunking (S
  (mychunk Boult/NNP Audio/NNP headphone/NN had/VBD many/JJ)
  (mychunk defects/NNS)
  and./VBP
  (mychunk Highly/NNP dissatisfied/VBD)
  with/IN
  the/DT
  (mychunk company./NN))


NLTK POS Tags Examples are as below:

Abbreviation	Meaning
CC	coordinating conjunction
CD	cardinal digit
DT	determiner
EX	existential there
FW	foreign word
IN	preposition/subordinating conjunction
JJ	adjective (large)
JJR	adjective, comparative (larger)
JJS	adjective, superlative (largest)
LS	list marker
MD	modal (could, will)
NN	noun, singular (cat, tree)
NNS	noun plural (desks)
NNP	proper noun, singular (Sarah)
NNPS	proper noun, plural (Indians or Americans)
PDT	predeterminer (all, both, half)
POS	possessive ending (parent's)
PRP	personal pronoun (hers, herself, him, himself)
PRP$	possessive pronoun (her, his, mine, my, our )
RB	adverb (occasionally, swiftly)
RBR	adverb, comparative (better)
RBS	adverb, superlative (best)
RP	particle (about)
TO	infinitival marker (to)
UH	interjection (goodbye)
VB	verb (ask)
VBG	verb gerund (judging)
VBD	verb past tense (pleaded)
VBN	verb past participle (reunified)
VBP	verb, present tense not 3rd person singular(wrap)
VBZ	verb, present tense with 3rd person singular (bases)
WDT	wh-determiner (that, what)
WP	wh- pronoun (who)
WRB	wh- adverb (how)




In [None]:
# Tag frequencies -- a useful feature for text classification.
from collections import Counter
import nltk

# Tokenizer data needed by nltk.word_tokenize.
nltk.download('punkt')

# NOTE(review): "company.Other" has no space after the period, so the
# tokenizer keeps it as a single token.
text = "Boult Audio headphone had many defects and. Highly dissatisfied with the company.Other Boult products are quite fine. "

# Lower-case the text, tokenize it, then tag every token.
lower_case = text.lower()
tokens = nltk.word_tokenize(lower_case)
tags = nltk.pos_tag(tokens)

# Tally how many times each tag occurs, ignoring the words themselves.
counts = Counter([pair[1] for pair in tags])
print(counts)

Counter({'NN': 5, 'JJ': 3, 'NNS': 2, '.': 2, 'RB': 2, 'VBD': 1, 'CC': 1, 'IN': 1, 'DT': 1, 'VBP': 1})


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Collocations are pairs (or longer groups) of words that occur together many times in a document.
A collocation is scored as the ratio of the number of times the words occur together to the overall word count of the document.
Types :
Bigrams
Trigrams 
They are useful in text-based sentiment analysis.

In [None]:
# Bigrams: every pair of adjacent tokens in `text` (defined in an earlier cell).
Tokens = nltk.word_tokenize(text)
output = [pair for pair in nltk.bigrams(Tokens)]
print(output)

[('Boult', 'Audio'), ('Audio', 'headphone'), ('headphone', 'had'), ('had', 'many'), ('many', 'defects'), ('defects', 'and'), ('and', '.'), ('.', 'Highly'), ('Highly', 'dissatisfied'), ('dissatisfied', 'with'), ('with', 'the'), ('the', 'company.Other'), ('company.Other', 'Boult'), ('Boult', 'products'), ('products', 'are'), ('are', 'quite'), ('quite', 'fine'), ('fine', '.')]


In [None]:
# Trigrams: every run of three adjacent tokens in `text` (defined in an earlier cell).
Tokens = nltk.word_tokenize(text)
output = [triple for triple in nltk.trigrams(Tokens)]
print(output)

[('Boult', 'Audio', 'headphone'), ('Audio', 'headphone', 'had'), ('headphone', 'had', 'many'), ('had', 'many', 'defects'), ('many', 'defects', 'and'), ('defects', 'and', '.'), ('and', '.', 'Highly'), ('.', 'Highly', 'dissatisfied'), ('Highly', 'dissatisfied', 'with'), ('dissatisfied', 'with', 'the'), ('with', 'the', 'company.Other'), ('the', 'company.Other', 'Boult'), ('company.Other', 'Boult', 'products'), ('Boult', 'products', 'are'), ('products', 'are', 'quite'), ('are', 'quite', 'fine'), ('quite', 'fine', '.')]


Optimizing HMM with Viterbi Algorithm 
The Viterbi algorithm is a dynamic programming algorithm for finding the most likely sequence of hidden states—called the Viterbi path—that results in a sequence of observed events, especially in the context of Markov information sources and hidden Markov models (HMM).

In [None]:
# Importing libraries
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time
 
# Fetch the NLTK resources used below: the treebank corpus first, then the
# universal tagset.
for resource_name in ('treebank', 'universal_tagset'):
    nltk.download(resource_name)

# Read the tagged Treebank sentences with tags mapped to the universal tagset,
# and materialise them as a plain list.
treebank_sents = nltk.corpus.treebank.tagged_sents(tagset='universal')
nltk_data = list(treebank_sents)

# Show the first two sentences together with their tags.
print(nltk_data[0:2])

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


In [None]:
# Print each (word, tag) pair of the first two sentences, one pair per line.
# The loop variable is deliberately not named `tuple`: loop variables in a
# notebook cell live in the global namespace, so that name would shadow the
# builtin `tuple` type for every cell executed afterwards.
for sent in nltk_data[:2]:
  for tagged_word in sent:
    print(tagged_word)

('Pierre', 'NOUN')
('Vinken', 'NOUN')
(',', '.')
('61', 'NUM')
('years', 'NOUN')
('old', 'ADJ')
(',', '.')
('will', 'VERB')
('join', 'VERB')
('the', 'DET')
('board', 'NOUN')
('as', 'ADP')
('a', 'DET')
('nonexecutive', 'ADJ')
('director', 'NOUN')
('Nov.', 'NOUN')
('29', 'NUM')
('.', '.')
('Mr.', 'NOUN')
('Vinken', 'NOUN')
('is', 'VERB')
('chairman', 'NOUN')
('of', 'ADP')
('Elsevier', 'NOUN')
('N.V.', 'NOUN')
(',', '.')
('the', 'DET')
('Dutch', 'NOUN')
('publishing', 'VERB')
('group', 'NOUN')
('.', '.')


In [None]:

# Split the tagged sentences 80:20 into a training part and a held-out part;
# the fixed random_state makes the split repeatable.
train_set, test_set = train_test_split(
    nltk_data,
    train_size=0.80,
    test_size=0.20,
    random_state=101,
)

In [None]:
# Flatten the sentence lists into flat lists of (word, tag) pairs.
train_tagged_words = [
    tagged_word
    for sentence in train_set
    for tagged_word in sentence
]
test_tagged_words = [
    tagged_word
    for sentence in test_set
    for tagged_word in sentence
]

# Report how many tagged words ended up in each part.
print(len(train_tagged_words))
print(len(test_tagged_words))

80310
20366


In [None]:

# Preview the first five (word, tag) pairs of the flattened training data.
# As the last expression in the cell, the slice is displayed as the cell output.
train_tagged_words[:5]

[('Drink', 'NOUN'),
 ('Carrier', 'NOUN'),
 ('Competes', 'VERB'),
 ('With', 'ADP'),
 ('Cartons', 'NOUN')]

In [None]:
# Collect the distinct tags that occur in the training data; a set drops
# the duplicates, so its length is the number of unique tags.
tags = {pos for _, pos in train_tagged_words}
print(len(tags))
print(tags)

# Distinct words of the training data, i.e. the vocabulary.
vocab = {token for token, _ in train_tagged_words}

12
{'CONJ', 'ADV', 'PRON', 'ADP', 'NOUN', '.', 'X', 'NUM', 'ADJ', 'VERB', 'DET', 'PRT'}
