# CRFs for POS Tagging
***

__[Ankit Bhatia](https://www.linkedin.com/in/ab9bhatia/)__

__<u>Summary</u>__
Build a POS tagger using Conditional Random Fields

HMM-based POS taggers do not use features, however, in a CRF based POS tagger, you can define rich feature functions which can deal with problems such as unknown words, complex word patterns etc. Also, in this part of the assignment, you will use three different tagged datasets for training. 

## Prerequisite

1. Install __sklearn_crfsuite__ using __"pip install sklearn_crfsuite" or "conda install -c conda-forge python-crfsuite"__
2. Download all __NLTK__ modules using __nltk.download()__

## Importing libraries

In [1]:
import nltk, re, pprint
import numpy as np
import pandas as pd
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import sklearn
import matplotlib.pyplot as plt
import random
from sklearn_crfsuite import metrics,scorers,CRF
from nltk.tag.util import untag
from sklearn.model_selection import GridSearchCV

## Data Preprocessing

#### 1. Treebank dataset

In [2]:
treebank = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [3]:
print('Number of Sentences in the corpus::',len(treebank),'\n')
# print first two tagged sentences
print(treebank[0:2])

Number of Sentences in the corpus:: 3914 

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


#### 2. Brown dataset

In [4]:
brown = list(nltk.corpus.brown.tagged_sents(tagset='universal'))

In [5]:
print('Number of Sentences in the corpus::',len(brown),'\n')
# print first two tagged sentences
print(brown[0:2])

Number of Sentences in the corpus:: 57340 

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'N

#### 3. CoNLL2000 dataset

In [6]:
conll2000= list(nltk.corpus.conll2000.tagged_sents(tagset='universal'))

In [7]:
print('Number of Sentences in the corpus::',len(conll2000),'\n')
# print first two tagged sentences
print(conll2000[0:2])

Number of Sentences in the corpus:: 10948 

[[('Confidence', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('pound', 'NOUN'), ('is', 'VERB'), ('widely', 'ADV'), ('expected', 'VERB'), ('to', 'PRT'), ('take', 'VERB'), ('another', 'DET'), ('sharp', 'ADJ'), ('dive', 'NOUN'), ('if', 'ADP'), ('trade', 'NOUN'), ('figures', 'NOUN'), ('for', 'ADP'), ('September', 'NOUN'), (',', '.'), ('due', 'ADJ'), ('for', 'ADP'), ('release', 'NOUN'), ('tomorrow', 'NOUN'), (',', '.'), ('fail', 'VERB'), ('to', 'PRT'), ('show', 'VERB'), ('a', 'DET'), ('substantial', 'ADJ'), ('improvement', 'NOUN'), ('from', 'ADP'), ('July', 'NOUN'), ('and', 'CONJ'), ('August', 'NOUN'), ("'s", 'PRT'), ('near-record', 'ADJ'), ('deficits', 'NOUN'), ('.', '.')], [('Chancellor', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('Exchequer', 'NOUN'), ('Nigel', 'NOUN'), ('Lawson', 'NOUN'), ("'s", 'PRT'), ('restated', 'VERB'), ('commitment', 'NOUN'), ('to', 'PRT'), ('a', 'DET'), ('firm', 'NOUN'), ('monetary', 'ADJ'), ('policy', 'NOUN'), ('has', 'VERB'

### Combine Datasets

In [8]:
master = treebank + brown + conll2000
#master = master[0:100] # Just for testing purpose

In [9]:
print('Number of Sentences in the master corpus::',len(master),'\n')
# print first two tagged sentences
print(master[0:2])

Number of Sentences in the master corpus:: 72202 

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


## I. Build your CRF

### Define Features

In [10]:
def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }

### Prepare data for CRF

In [11]:
# Split the dataset for training and testing
cutoff = int(.80 * len(master))
training_sentences = master[:cutoff]
validation_sentences = master[cutoff:]

In [12]:
def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        X.append([features(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
 
    return X, y

In [13]:
start = time.time()
X_train, y_train = transform_to_dataset(training_sentences)
X_valid, y_valid = transform_to_dataset(validation_sentences)
print(len(X_train))     
print(len(X_valid))
end = time.time()
print('Time Taken(Sec) in this cell',end-start)

57761
14441
Time Taken(Sec) in this cell 5.840530157089233


### Build CRF Model

In [14]:
start = time.time()
model = CRF()
model.fit(X_train, y_train)
end = time.time()
print('Time Taken(Sec) in this cell',end-start)

Time Taken(Sec) in this cell 739.0159306526184


In [15]:
def pos_tag(sentence):
    sentence_features = [features(sentence, index) for index in range(len(sentence))]
    return list(zip(sentence, model.predict([sentence_features])[0]))

In [16]:
sample = ['I','am','Learning','NLP']
pos_tag(sample)

[('I', 'PRON'), ('am', 'VERB'), ('Learning', 'NOUN'), ('NLP', 'NOUN')]

## II. Evaluate the Model Performance

In [17]:
start = time.time()
y_pred = model.predict(X_valid)
end = time.time()
print('Time Taken(in Sec) in this cell',end-start)
print('Accuracy::',metrics.flat_accuracy_score(y_valid, y_pred))
print('F1 Score',metrics.flat_f1_score(y_valid, y_pred,average='weighted'))

Time Taken(in Sec) in this cell 2.6693429946899414
Accuracy:: 0.9438060158058392
F1 Score 0.9435019419876269


In [18]:
print(metrics.flat_classification_report(y_valid, y_pred, digits=2))

             precision    recall  f1-score   support

          .       1.00      1.00      1.00     41431
        ADJ       0.85      0.87      0.86     20932
        ADP       0.92      0.95      0.94     33697
        ADV       0.91      0.92      0.91     12425
       CONJ       0.99      1.00      0.99      8523
        DET       0.92      0.96      0.94     29936
       NOUN       0.96      0.95      0.96     89666
        NUM       0.98      0.99      0.99     10567
       PRON       0.94      0.79      0.86     11820
        PRT       0.81      0.78      0.79     10669
       VERB       0.96      0.96      0.96     47262
          X       0.36      0.22      0.27       170

avg / total       0.94      0.94      0.94    317098



#### Accuracy from the base model is coming as ~94%.

Let's tune our model using gridsearch.

## III. Hyperparameters Tuning

In [19]:
# define fixed parameters and parameters to search
crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

# parameters to tune
params_space = {
    'c1': [0.01, 0.1, 1],
    'c2': [0.01, 0.1, 1]
}

# use the same metric for evaluation
f1_scorer = scorers.make_scorer(metrics.flat_f1_score,
                        average='weighted')

rs = GridSearchCV(crf, 
                  params_space,
                  cv=3,
                  verbose=1,
                  n_jobs=2,
                  scoring=f1_scorer, 
                  return_train_score=True)
# fit
#rs.fit(X_train, y_train) # Commented Explicitly

Fitting 3 folds for each of 9 candidates, totalling 27 fits

__[Parallel(n_jobs=2)]: Done  27 out of  27 | elapsed: 351.4min finished__

GridSearchCV(cv=3, error_score='raise',
       estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error...e,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False),
  
    fit_params=None, iid=True, n_jobs=2,
    
    param_grid={'c1': [0.01, 0.1, 1], 'c2': [0.01, 0.1, 1]},
    
    pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
    scoring=make_scorer(flat_f1_score, average=weighted), verbose=1)

#### Important:: It took approx 350 min to get the optimal hyperparameters, so I noted the best values of C1 & C2 and commented the above code, so that it will not take the time during code evaluation.

__Note__ If you want to run the above code just uncomment __rs.fit(X_train, y_train)__ line


### Let's thus choose c1 = 0.1 and c2 = 0.1.

In [20]:
# store CV results in a DF
#cv_results = pd.DataFrame(rs.cv_results_)
#cv_results

## IV. Rebuilding Model with Hyperparameters

In [21]:
# building a model with optimal hyperparams
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

### Save the Model

Using pickle, we can save the model as a file so that it can be resued just by importing(without training the model again)

In [22]:
import _pickle as cPickle

with open('Ankit_Bhatia_part_2.pkl', 'wb') as clf:
    try:
        cPickle.dump(crf, clf)
    except Exception as e:
        print(e)
    finally:
        clf.close()   

## V. Evaluate the Model Performance(after Hyperpatameters Tuning)

In [23]:
# load the trained model
import _pickle as cPickle

with open('Ankit_Bhatia_part_2.pkl', 'rb') as fid:
    crf = cPickle.load(fid)

In [24]:
# make predictions on validation data
y_pred = crf.predict(X_valid)
print('Accuracy::',metrics.flat_accuracy_score(y_valid, y_pred))
print('F1 Score',metrics.flat_f1_score(y_valid, y_pred,average='weighted'))

Accuracy:: 0.946552800711452
F1 Score 0.9463142916336001


### There is just a slight difference in accuracy from 94.35% to 94.65%(using hyperparameters)

## VI. Testing Tuned Model on Sample Sentences

In [25]:
end = time.time()
test_sentences = ["Show me flights from Denver to Nebraska departing after 8 p.m.",
                  "Android is a mobile operating system developed by Google based on a modified version of the Linux kernel.",
                  "Android is a mobile operating system developed by Google.",
                  "Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.",
                  "Google and Twitter made a deal in 2015 that gave Google access to Twitter's firehose.",
                  "Twitter is an online news and social networking service on which users post and interact with messages known as tweets.",
                  "Before entering politics, Donald Trump was a domineering businessman and a television personality.",
                  "The 2018 FIFA World Cup is the 21st FIFA World Cup, an international football tournament contested once every four years.",
                  "This is the first World Cup to be held in Eastern Europe and the 11th time that it has been held in Europe.",
                  "Show me the cheapest round trips from Dallas to Atlanta",
                  "I would like to see flights from Denver to Philadelphia.",
                  "Show me the price of the flights leaving Atlanta at about 3 in the afternoon and arriving in San Francisco.",
                  "NASA invited social media users to experience the launch of ICESAT-2 Satellite."]


#print(test_sentences)

def pos_tag(sentence):
    sentence_features = [features(sentence, index) for index in range(len(sentence))]
    return list(zip(sentence, crf.predict([sentence_features])[0]))

for sentence  in test_sentences:
    word_list = sentence.split(' ')
    print(pos_tag(word_list)) 
    print("\n")
end = time.time()
print('Time Taken(Sec) in this cell',end-start)    

[('Show', 'VERB'), ('me', 'PRON'), ('flights', 'VERB'), ('from', 'ADP'), ('Denver', 'NOUN'), ('to', 'ADP'), ('Nebraska', 'NOUN'), ('departing', 'VERB'), ('after', 'ADP'), ('8', 'NUM'), ('p.m.', 'ADV')]


[('Android', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'VERB'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google', 'NOUN'), ('based', 'VERB'), ('on', 'ADP'), ('a', 'DET'), ('modified', 'VERB'), ('version', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('Linux', 'NOUN'), ('kernel.', 'NOUN')]


[('Android', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'VERB'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google.', 'NOUN')]


[('Android', 'NOUN'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'ADJ'), ('OS', 'NOUN'), ('worldwide', 'NOUN'), ('on', 'ADP'), ('smartphones', 'NOUN'), ('since', 'ADP'), ('2011', 'NUM'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('

## VIII. Enlist Important State and Transition Features learnt by the model

### Important State

In [26]:
print(metrics.flat_classification_report(y_valid, y_pred,  digits=3))

             precision    recall  f1-score   support

          .      0.997     0.999     0.998     41431
        ADJ      0.861     0.883     0.872     20932
        ADP      0.919     0.954     0.936     33697
        ADV      0.902     0.922     0.912     12425
       CONJ      0.992     0.997     0.994      8523
        DET      0.922     0.957     0.939     29936
       NOUN      0.969     0.953     0.961     89666
        NUM      0.983     0.991     0.987     10567
       PRON      0.941     0.800     0.865     11820
        PRT      0.817     0.787     0.801     10669
       VERB      0.962     0.967     0.965     47262
          X      0.364     0.276     0.314       170

avg / total      0.947     0.947     0.946    317098



### Transition Features

In [27]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-5s -> %-6s %0.3f" % (label_from, label_to, weight))

print("Top 10 Most Common transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop 10 Most Uncommon transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top 10 Most Common transitions:
X     -> X      6.713
ADJ   -> NOUN   4.159
PRT   -> VERB   2.596
PRON  -> VERB   2.256
DET   -> NOUN   2.228
NOUN  -> NOUN   2.118
ADP   -> NOUN   2.075
NOUN  -> PRON   2.066
VERB  -> NOUN   2.012
ADV   -> ADJ    1.957
NOUN  -> VERB   1.744
DET   -> ADJ    1.619
ADJ   -> ADJ    1.545
ADJ   -> NUM    1.455
ADV   -> ADV    1.349
VERB  -> ADV    1.321
VERB  -> PRT    1.312
X     -> VERB   1.304
ADP   -> DET    1.292
DET   -> VERB   1.185

Top 10 Most Uncommon transitions:
ADV   -> X      -1.280
CONJ  -> NUM    -1.304
ADP   -> .      -1.309
X     -> ADP    -1.315
X     -> NUM    -1.334
DET   -> PRON   -1.406
DET   -> DET    -1.445
NUM   -> ADV    -1.466
NUM   -> PRON   -1.508
DET   -> .      -1.578
.     -> NUM    -1.586
NUM   -> DET    -1.799
ADP   -> X      -1.838
NUM   -> VERB   -2.075
DET   -> CONJ   -2.205
DET   -> PRT    -2.262
DET   -> ADP    -2.269
CONJ  -> CONJ   -2.685
CONJ  -> X      -2.819
CONJ  -> .      -3.574
