# POS Tagger 

In [18]:
import nltk   #import required libraries
from nltk.tag import hmm
from nltk.stem import PorterStemmer
from nltk.corpus.reader import TaggedCorpusReader 
from nltk.corpus.reader import PlaintextCorpusReader
from sklearn import metrics  #for evaluation purposes


In [19]:
# Used nltk tagged corpus reader to read training data (http://www.nltk.org/_modules/nltk/corpus/reader/tagged.html)

tagged_corpus = TaggedCorpusReader(".","train.txt")

In [20]:
# Extracting  tagged sentences and words from our corpus
tagged_sentences = tagged_corpus.tagged_sents()
tagged_words = tagged_corpus.tagged_words()

In [21]:
# Peek at the tagged corpus
print (tagged_sentences[0])
print (tagged_words[0])
print("No. of tagged sentences for training: " , len(tagged_sentences))
print("No. of tagged words for training: " , len(tagged_words))

[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]
('The', 'AT')
No. of tagged sentences for training:  10388
No. of tagged words for training:  228333


In [22]:
#frequency distribution of tags in our data
tag_fd = nltk.FreqDist(tag for (word, tag) in tagged_words)
tag_fd.most_common(4)

[('NN', 29090), ('IN', 23657), ('AT', 19840), ('JJ', 12254)]

In [23]:
# divide the data into train and dev set with ratio of split = 10%
split = int(len(tagged_sentences) * 0.9)
split

9349

In [24]:
train_sents = tagged_sentences[:split]
dev_sents = tagged_sentences[split:]

In [25]:
#Training and testing unigram tagger
unigram_tagger = nltk.UnigramTagger(train_sents)  
print("Unigram accuracy:", unigram_tagger.evaluate(dev_sents)*100)

Unigram accuracy: 85.17705927636644


In [26]:
#Using backoff tagger for bi-tri tagger
bigram_tagger = nltk.BigramTagger(train_sents, backoff=unigram_tagger)  
print("Bigram accuracy:",bigram_tagger.evaluate(dev_sents)*100)
trigram_tagger = nltk.TrigramTagger(train_sents, backoff=bigram_tagger)
print("Trigram accuracy:",trigram_tagger.evaluate(dev_sents)*100)


Bigram accuracy: 86.70900692840647
Trigram accuracy: 86.59353348729792


In [16]:
#Training and tagging HMM using nltk trainer
hmm_trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = hmm_trainer.train_supervised(train_sents)
print("HMM train accuracy:", hmm_tagger.evaluate(train_sents)*100)
print("HMM dev accuracy:", hmm_tagger.evaluate(dev_sents)*100)


HMM train accuracy: 97.76034948827049
HMM dev accuracy: 43.645111624326404


## Accuracy, Precision, Recall, F1-score. 
* Precision = True Positive / (True Positive + False Positive) 
* Recall    = True Positive / (True Positive + False Negative)          
* F1-Score = 2 * (Precision * Recall) / (Precision + Recall)

In [65]:
#Using tags for given corpus to evaluate different metrics for different tagging model
uni_dev_tagged_sents = unigram_tagger.tag_sents([[word for word,tag in sentence] for sentence in dev_sents])
standard = [str(tag) for sentence in dev_sents for token,tag in sentence]
uni_predicted = [str(tag) for sentence in uni_dev_tagged_sents for token,tag in sentence]
print(" Unigram Accuracy :", metrics.accuracy_score(standard,uni_predicted))
print(" Unigram Precision:", metrics.precision_score(standard,uni_predicted,average='weighted'))
print(" Unigram Recall   :", metrics.recall_score(standard,uni_predicted,average='weighted'))
print(" Unigram F1-Score :", metrics.f1_score(standard,uni_predicted,average='weighted'))

tri_dev_tagged_sents = trigram_tagger.tag_sents([[word for word,tag in sentence] for sentence in dev_sents])
tri_predicted = [str(tag) for sentence in tri_dev_tagged_sents for token,tag in sentence]
print(" Trigram Accuracy :", metrics.accuracy_score(standard,tri_predicted))
print(" Trigram Precision:", metrics.precision_score(standard,tri_predicted,average='weighted'))
print(" Trigram Recall   :", metrics.recall_score(standard,tri_predicted,average='weighted'))
print(" Trigram F1-Score :", metrics.f1_score(standard,tri_predicted,average='weighted'))

hmm_dev_tagged_sents = hmm_tagger.tag_sents([[word for word,tag in sentence] for sentence in dev_sents])
hmm_predicted = [str(tag) for sentence in hmm_dev_tagged_sents for token,tag in sentence]
print(" Hmm Accuracy :", metrics.accuracy_score(standard,hmm_predicted))
print(" Hmm Precision:", metrics.precision_score(standard,hmm_predicted,average='weighted'))
print(" Hmm Recall   :", metrics.recall_score(standard,hmm_predicted,average='weighted'))
print(" Hmm F1-Score :", metrics.f1_score(standard,hmm_predicted,average='weighted'))



 Unigram Accuracy : 0.8513471901462664
 Unigram Precision: 0.9124731025455544
 Unigram Recall   : 0.8513471901462664
 Unigram F1-Score : 0.8748561520066922


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


 Trigram Accuracy : 0.8654734411085451
 Trigram Precision: 0.9266259342940507
 Trigram Recall   : 0.8654734411085451
 Trigram F1-Score : 0.8915913508772109
 Hmm Accuracy : 0.43645111624326405
 Hmm Precision: 0.8727021005868528
 Hmm Recall   : 0.43645111624326405
 Hmm F1-Score : 0.5141230809558233


In [78]:
#Reading test file using nltk plain text reader and storing sentences
test_eva_corpus = TaggedCorpusReader(".","test.tag")
test_corpus  = PlaintextCorpusReader(".","test.txt")
test_tagged_eva_sents = test_eva_corpus.tagged_sents()
test_sents = test_corpus.sents()
# Using trigram (best one here) to generate tagged test file.  

text_tagged_sents = trigram_tagger.tag_sents(test_sents)
print(text_tagged_sents[0])


[('Much', 'AP'), ('more', 'AP'), ('than', 'IN'), ('shelter', 'NN'), (',', ','), ('housing', 'VBG'), ('symbolizes', None), ('social', 'JJ'), ('status', 'NN'), (',', ','), ('a', 'AT'), ('sense', 'NN'), ('of', 'IN'), ('``', '``'), ('belonging', 'VBG'), ("''", "''"), (',', ','), ('acceptance', 'NN'), ('within', 'IN'), ('a', 'AT'), ('given', 'VBN'), ('group', 'NN'), ('or', 'CC'), ('neighborhood', 'NN'), (',', ','), ('identification', 'NN'), ('with', 'IN'), ('particular', 'JJ'), ('cultural', 'JJ'), ('values', 'NNS'), ('and', 'CC'), ('social', 'JJ'), ('institutions', 'NNS'), (',', ','), ('feelings', 'NNS'), ('of', 'IN'), ('pride', 'NN'), ('and', 'CC'), ('worth', 'JJ'), (',', ','), ('aspirations', None), ('and', 'CC'), ('hopes', 'NNS'), ('basic', 'JJ'), ('to', 'IN'), ('human', 'JJ'), ('well', 'RB'), ('-', None), ('being', 'BEG'), ('.', '.')]


In [82]:
#Generate output file with trigram tagged test sentences with word followed by / and its tag
with open('test.out', 'w') as fo:
    for i, sentence in enumerate(text_tagged_sents):
        print(' '.join([word + '/' + str(tag).lower() for word, tag in sentence]), end='\n\n', file=fo)