# Part 2: CRFs for POS Tagging
***

__[Ankit Bhatia](https://www.linkedin.com/in/ab9bhatia/)__

## Prerequisite

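1. Install __sklearn_crfsuite__ using __"pip install sklearn_crfsuite"__. With conda, __"conda install -c conda-forge python-crfsuite"__ installs only the underlying python-crfsuite binding; __sklearn_crfsuite__ itself must still be installed with pip.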
2. Download all __NLTK__ modules using __nltk.download()__

## Importing libraries

In [3]:
import nltk, re, pprint
import numpy as np
import pandas as pd
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import sklearn
import matplotlib.pyplot as plt
import random
from sklearn_crfsuite import metrics,scorers,CRF
from nltk.tag.util import untag

ModuleNotFoundError: No module named 'sklearn_crfsuite'

## Data Preprocessing

#### 1. Treebank dataset

In [6]:
# Materialise the NLTK treebank corpus as a list of tagged sentences,
# with tags mapped to the universal tagset.
treebank = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [7]:
# Corpus size first, then a peek at the first two tagged sentences.
print('Number of Sentences in the corpus::', len(treebank), '\n')
print(treebank[:2])

Number of Sentences in the corpus:: 3914 

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


#### 2. Brown dataset

In [8]:
# Materialise the NLTK brown corpus as a list of tagged sentences,
# with tags mapped to the universal tagset.
brown = list(nltk.corpus.brown.tagged_sents(tagset='universal'))

In [9]:
# Corpus size first, then a peek at the first two tagged sentences.
print('Number of Sentences in the corpus::', len(brown), '\n')
print(brown[:2])

Number of Sentences in the corpus:: 57340 

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'N

#### 3. CoNLL2000 dataset

In [10]:
# Materialise the NLTK conll2000 corpus as a list of tagged sentences,
# with tags mapped to the universal tagset.
conll2000= list(nltk.corpus.conll2000.tagged_sents(tagset='universal'))

In [11]:
# Corpus size first, then a peek at the first two tagged sentences.
print('Number of Sentences in the corpus::', len(conll2000), '\n')
print(conll2000[:2])

Number of Sentences in the corpus:: 10948 

[[('Confidence', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('pound', 'NOUN'), ('is', 'VERB'), ('widely', 'ADV'), ('expected', 'VERB'), ('to', 'PRT'), ('take', 'VERB'), ('another', 'DET'), ('sharp', 'ADJ'), ('dive', 'NOUN'), ('if', 'ADP'), ('trade', 'NOUN'), ('figures', 'NOUN'), ('for', 'ADP'), ('September', 'NOUN'), (',', '.'), ('due', 'ADJ'), ('for', 'ADP'), ('release', 'NOUN'), ('tomorrow', 'NOUN'), (',', '.'), ('fail', 'VERB'), ('to', 'PRT'), ('show', 'VERB'), ('a', 'DET'), ('substantial', 'ADJ'), ('improvement', 'NOUN'), ('from', 'ADP'), ('July', 'NOUN'), ('and', 'CONJ'), ('August', 'NOUN'), ("'s", 'PRT'), ('near-record', 'ADJ'), ('deficits', 'NOUN'), ('.', '.')], [('Chancellor', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('Exchequer', 'NOUN'), ('Nigel', 'NOUN'), ('Lawson', 'NOUN'), ("'s", 'PRT'), ('restated', 'VERB'), ('commitment', 'NOUN'), ('to', 'PRT'), ('a', 'DET'), ('firm', 'NOUN'), ('monetary', 'ADJ'), ('policy', 'NOUN'), ('has', 'VERB'

### Combine Datasets

In [12]:
# One combined corpus: treebank sentences first, then brown, then conll2000.
master = [*treebank, *brown, *conll2000]
# For a quick smoke test, truncate the corpus, e.g. master = master[0:100]

In [13]:
# Combined corpus size first, then a peek at the first two tagged sentences.
print('Number of Sentences in the master corpus::', len(master), '\n')
print(master[:2])

Number of Sentences in the master corpus:: 72202 

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


## I. Build your CRF

### Define Features

In [14]:
def features(sentence, index):
    """Build the CRF feature dictionary for one word of a sentence.

    Args:
        sentence: Sequence of word strings, ``[w1, w2, ...]``.
        index: Position in ``sentence`` of the word to describe.

    Returns:
        A dict mapping feature names to values: the word itself, position
        flags, casing flags, 1-3 character prefixes and suffixes, the
        neighbouring words ('' at a sentence boundary) and word-shape flags.

    Raises:
        IndexError: If ``index`` is out of range for ``sentence``.
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    # Slicing (word[:1], word[-1:]) equals word[0] / word[-1] for non-empty
    # words but yields '' for an empty token instead of raising IndexError
    # (empty tokens appear when text with doubled spaces is split on ' ').
    first_char = word[:1]
    tail = word[1:]
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        # True whenever upper() leaves the first character unchanged, so
        # digits and punctuation also count as "capitalized".
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        # Any character after the first that lower() would change.
        'capitals_inside': tail.lower() != tail
    }

### Prepare data for CRF

In [15]:
# Hold out the last 20% of the combined corpus for validation.
# NOTE(review): the split is positional, not shuffled, so the validation
# slice is simply the tail of treebank + brown + conll2000 -- confirm this
# is intended.
cutoff = int(0.80 * len(master))
training_sentences, validation_sentences = master[:cutoff], master[cutoff:]

In [16]:
def transform_to_dataset(tagged_sentences):
    """Turn tagged sentences into parallel feature and label sequences.

    Args:
        tagged_sentences: Iterable of sentences, each a sequence of
            ``(word, tag)`` pairs.

    Returns:
        A tuple ``(X, y)``. ``X`` holds, per sentence, the list of feature
        dicts produced by ``features`` for each word; ``y`` holds, per
        sentence, the list of tags in the same order.
    """
    X, y = [], []
    for tagged in tagged_sentences:
        # Strip the tags once per sentence rather than once per token.
        words = untag(tagged)
        X.append([features(words, index) for index in range(len(words))])
        y.append([tag for _, tag in tagged])

    return X, y

In [17]:
# Featurise both splits and time the conversion.
start = time.time()
X_train, y_train = transform_to_dataset(training_sentences)
X_valid, y_valid = transform_to_dataset(validation_sentences)
end = time.time()
print('Time Taken(Sec) in this cell', end - start)
# Number of sentences in each split.
print(len(X_train))
print(len(X_valid))

Time Taken(in Sec) in this cell 6.396191120147705
57761
14441


### Build CRF Model

In [None]:
# Baseline: fit a CRF with the library's default hyperparameters and time it.
start = time.time()
model = CRF()
model.fit(X_train, y_train)
end = time.time()
print('Time Taken(Sec) in this cell', end - start)

In [None]:
def pos_tag(sentence):
    """Tag an already-tokenised sentence with the baseline CRF ``model``.

    Args:
        sentence: List of word strings.

    Returns:
        A list of ``(word, predicted_tag)`` pairs in sentence order.
    """
    token_features = [
        features(sentence, position) for position, _ in enumerate(sentence)
    ]
    # predict() works on a batch of sentences; wrap ours and take result 0.
    predicted_tags = model.predict([token_features])[0]
    return list(zip(sentence, predicted_tags))

In [None]:
# Quick check of the baseline tagger on a hand-tokenised sentence.
sample = ['I','am','Learning','NLP']
pos_tag(sample)

## II. Evaluate the Model Performance

In [None]:
# Time prediction over the validation features, then report flat accuracy
# and weighted F1 against the gold tags.
start = time.time()
y_pred = model.predict(X_valid)
end = time.time()
print('Time Taken(in Sec) in this cell', end - start)
print('Accuracy::', metrics.flat_accuracy_score(y_valid, y_pred))
print('F1 Score', metrics.flat_f1_score(y_valid, y_pred, average='weighted'))

In [None]:
# Classification report for the baseline predictions on the validation
# split, printed with two decimal places.
print(metrics.flat_classification_report(y_valid, y_pred, digits=2))

#### The base model reaches an accuracy of ~94%.

Let's tune our model using grid search.

## III. Hyperparameters Tuning

In [None]:
# Estimator with the settings that stay fixed during the search.
crf = CRF(algorithm='lbfgs', max_iterations=100, all_possible_transitions=True)

# Candidate values for the two parameters being tuned (c1 and c2).
params_space = dict(
    c1=[0.01, 0.1, 1],
    c2=[0.01, 0.1, 1],
)

# Score candidates with the same weighted flat F1 used for evaluation.
f1_scorer = scorers.make_scorer(metrics.flat_f1_score, average='weighted')

In [None]:
from sklearn.model_selection import GridSearchCV

# 3-fold grid search over params_space on two workers. Train scores are
# kept because the plotting cell reads mean_train_score.
rs = GridSearchCV(
    estimator=crf,
    param_grid=params_space,
    cv=3,
    verbose=1,
    n_jobs=2,
    scoring=f1_scorer,
    return_train_score=True,
)
rs.fit(X_train, y_train)

#### Save these Hyperparameters in a Dataframe

In [None]:
# Collect the grid-search results (rs.cv_results_) into a DataFrame and
# show its first three rows.
cv_results = pd.DataFrame(rs.cv_results_)
cv_results.head(3)

#### Plotting CV results

In [None]:
# plotting CV results
# for each value of c2, make a plot of c1 versus train and validation f1-score

plt.figure(figsize=(16,6))

# One panel per c2 value, so the layout follows the search grid instead of
# assuming exactly three values.
c2_values = params_space['c2']

for i, val in enumerate(c2_values):

    # panel i+1 of len(c2_values), laid out in a single row
    plt.subplot(1, len(c2_values), i+1)
    c2_subset = cv_results[cv_results['param_c2']==val]

    # Plot order (validation first, then train) must match the legend below.
    plt.plot(c2_subset["param_c1"], c2_subset["mean_test_score"])
    plt.plot(c2_subset["param_c1"], c2_subset["mean_train_score"])
    plt.xlabel('c1')
    plt.ylabel('Mean F-score')
    plt.title("c2={0}".format(val))
    plt.ylim([0.80, 1])
    plt.legend(['validation score', 'train score'], loc='upper left')
    # c1 candidates span orders of magnitude, so use a log x-axis.
    plt.xscale('log')

The plot above shows that at very low values of c1, the model overfits, as shown by the difference in training and test performance. Also, the test score seems to be slightly higher for c2 = 0.1.

Let's thus choose c1 = 0.1 and c2 = 0.1.

## IV. Rebuilding Model with Hyperparameters

In [None]:
# Refit on the training data with the c1/c2 values picked from the
# grid-search plots; the other settings match the search estimator.
crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True,
    c1=0.1,
    c2=0.1,
)
crf.fit(X_train, y_train)

### Save the Model

Using pickle, we can save the model to a file so that it can be reused just by loading it (without training the model again).

In [None]:
import pickle

# Persist the tuned CRF so it can be reloaded later without retraining.
with open('Ankit_Bhatia_part_2.pkl', 'wb') as model_file:
    try:
        pickle.dump(crf, model_file)
    except Exception as e:
        # Best-effort save: report the failure and carry on. The with-block
        # closes the file on every path, so no explicit close() is needed.
        print(e)

## V. Evaluate the Model Performance (after Hyperparameter Tuning)

In [None]:
# Load the trained model back from disk.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that you created yourself or otherwise trust.
import pickle

with open('Ankit_Bhatia_part_2.pkl', 'rb') as fid:
    crf = pickle.load(fid)

In [None]:
# Score the tuned model on the validation split: flat accuracy and
# weighted F1 against the gold tags.
y_pred = crf.predict(X_valid)
print('Accuracy::', metrics.flat_accuracy_score(y_valid, y_pred))
print('F1 Score', metrics.flat_f1_score(y_valid, y_pred, average='weighted'))

## VI. Testing Tuned Model on Sample Sentences

In [4]:
# Start the timer for this cell. The elapsed time printed at the end of the
# cell is end - start, so the start timestamp must be taken here.
start = time.time()

# Free-text sample sentences used to eyeball the tuned tagger's output.
test_sentences = ["Show me flights from Denver to Nebraska departing after 8 p.m.",
                  "Android is a mobile operating system developed by Google based on a modified version of the Linux kernel.",
                  "Android is a mobile operating system developed by Google.",
                  "Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.",
                  "Google and Twitter made a deal in 2015 that gave Google access to Twitter's firehose.",
                  "Twitter is an online news and social networking service on which users post and interact with messages known as tweets.",
                  "Before entering politics, Donald Trump was a domineering businessman and a television personality.",
                  "The 2018 FIFA World Cup is the 21st FIFA World Cup, an international football tournament contested once every four years.",
                  "This is the first World Cup to be held in Eastern Europe and the 11th time that it has been held in Europe.",
                  "Show me the cheapest round trips from Dallas to Atlanta",
                  "I would like to see flights from Denver to Philadelphia.",
                  "Show me the price of the flights leaving Atlanta at about 3 in the afternoon and arriving in San Francisco.",
                  "NASA invited social media users to experience the launch of ICESAT-2 Satellite."]


#print(test_sentences)

def pos_tag(sentence):
    """Tag an already-tokenised sentence with the tuned CRF ``crf``.

    Args:
        sentence: List of word strings.

    Returns:
        A list of ``(word, predicted_tag)`` pairs in sentence order.
    """
    per_word_features = []
    for position, _ in enumerate(sentence):
        per_word_features.append(features(sentence, position))
    # predict() works on a batch of sentences; wrap ours and take result 0.
    tags = crf.predict([per_word_features])[0]
    return list(zip(sentence, tags))

# Tag every sample sentence and print the (word, tag) pairs, followed by a
# blank separator.
# NOTE(review): sentences are split on single spaces only, so punctuation
# stays attached to the neighbouring word, whereas the corpus samples shown
# earlier keep punctuation as separate tokens -- confirm this is acceptable.
for sentence in test_sentences:
    word_list = sentence.split(' ')
    print(pos_tag(word_list))
    print("\n")
end = time.time()
print('Time Taken(Sec) in this cell', end - start)

NameError: name 'features' is not defined

## VII. List Important State and Transition Features learnt by the model

### Important State

In [None]:
# Classification report for the current predictions on the validation
# split, printed with three decimal places.
print(metrics.flat_classification_report(y_valid, y_pred,  digits=3))

### Transition Features

In [None]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned line per transition: ``from -> to weight``.

    Args:
        trans_features: Iterable of ``((label_from, label_to), weight)``
            items; weights are printed with three decimal places.
    """
    row_template = "%-5s -> %-6s %0.3f"
    for labels, weight in trans_features:
        source_label, target_label = labels
        print(row_template % (source_label, target_label, weight))

# Rank the learned label-to-label transition weights once, highest first,
# and reuse the ranking for both listings.
ranked_transitions = Counter(crf.transition_features_).most_common()

# Number of rows shown per listing; the headings are derived from it so the
# label always matches what is printed.
top_n = 20

print("Top {0} Most Common transitions:".format(top_n))
print_transitions(ranked_transitions[:top_n])

# The tail of the ranking holds the lowest-weighted transitions.
print("\nTop {0} Most Uncommon transitions:".format(top_n))
print_transitions(ranked_transitions[-top_n:])