# Named Entity Recognition on the CoNLL-2003 dataset using pycrfsuite.

### This notebook covers the following topics:
1. Created a function to define features for training the model. We have used identity, word-suffix, word-shape, pos-tag and two previous and next adjacent neighboring words, as features. 
2. Trained the model using pycrfsuite. 
3. Performed hyperparameter tuning over the l1 penalty, l2 penalty, and max_iterations settings. 
4. Created a function to compute the weighted F1 score for each parameter setting. 
5. Found the best parameter setting on a validation set (eng.testa).
6. Reporting the results on a testset (eng.testb)
        

### Conll 2003 dataset contains the following sub-datasets:
  1. eng.train : Used as a  training set.
  2. eng.testa : Used as a validation set.
  3. eng.testb : Used as a test set.

In [87]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite



#### Since we have downloaded the CoNLL-2003 dataset ourselves and are not using nltk.corpus to load it directly, let's write a function to read the datasets from the directory. 


In [88]:
import sys
def read_data(filename):
    """Read a CoNLL-style, space-separated file into a list of sentences.

    The first two lines of the file are always skipped (presumably the
    leading ``-DOCSTART-`` header and the blank line after it -- confirm
    against the data files). Every later line is split on single spaces:

    * a line with fewer than two fields (e.g. a blank line) ends the
      current sentence;
    * a line with exactly four fields contributes the tuple
      ``(field 0, field 1, field 3)``, i.e. ``(word, postag, label)``;
    * any other line is treated as malformed.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of sentences, each a non-empty list of
        ``(word, postag, label)`` tuples. A sentence that is not followed
        by a blank line at the end of the file is still returned.

    Raises:
        ValueError: If a line has neither fewer than two nor exactly four
            fields. The message contains the 1-based line number.
    """
    data = []
    sentence = []
    with open(filename) as f:
        for line_number, line in enumerate(f, start=1):
            if line_number < 3:
                continue
            fields = line.strip().split(" ")
            if len(fields) < 2:
                # Sentence separator. Only flush when something was collected
                # so that repeated separators do not create empty sentences.
                if sentence:
                    data.append(sentence)
                    sentence = []
            elif len(fields) == 4:
                sentence.append((fields[0], fields[1], fields[3]))
            else:
                # Raise instead of sys.exit(0): exiting would report success
                # and abort the whole notebook run without a usable error.
                raise ValueError(
                    "%s: line %d has %d fields, expected 4"
                    % (filename, line_number, len(fields))
                )
    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        data.append(sentence)
    return data


        
        

    

In [89]:
# Load the two splits used for model selection:
# eng.train fits the model, eng.testa serves as the validation set.
train_set, validation_set = [
    read_data(split_path) for split_path in ("eng.train", "eng.testa")
]

In [90]:
# For our purpose, only the word, its POS tag, and its label are important.
# Here ORG refers to "organization", LOC - "location", PER - "Person",
# B - "Beginning", E - "End".
validation_set[0]

[('CRICKET', 'NNP', 'O'),
 ('-', ':', 'O'),
 ('LEICESTERSHIRE', 'NNP', 'B-ORG'),
 ('TAKE', 'NNP', 'O'),
 ('OVER', 'IN', 'O'),
 ('AT', 'NNP', 'O'),
 ('TOP', 'NNP', 'O'),
 ('AFTER', 'NNP', 'O'),
 ('INNINGS', 'NNP', 'O'),
 ('VICTORY', 'NN', 'O'),
 ('.', '.', 'O')]

#### Below is the feature extractor. For a word, we extract its suffix (character n-grams), postag, case (lower or upper), and previous two and next two neighboring words as features. Special handling is done for boundary words. 

In [91]:
def _context_features(offset, word, postag):
    """Return the feature strings for a neighboring token.

    Args:
        offset: Signed position label used as the feature prefix,
            e.g. ``'-1'`` or ``'+2'``.
        word: Surface form of the neighboring token.
        postag: POS tag of the neighboring token.

    Returns:
        A list of five ``'<offset>:<name>=<value>'`` strings.
    """
    return [
        offset + ':word.lower=' + word.lower(),
        offset + ':word.istitle=%s' % word.istitle(),
        offset + ':word.isupper=%s' % word.isupper(),
        offset + ':postag=' + postag,
        offset + ':postag[:2]=' + postag[:2],
    ]


def word2features(sentence, i):
    """Build the list of string features for the token at position ``i``.

    Features cover the token itself (lower-cased form, 3- and 2-character
    suffixes, upper/title/digit flags, POS tag and its 2-character prefix)
    and up to two tokens of context on each side. ``'BOS'`` is added when
    there is no previous token and ``'EOS'`` when there is no next token.

    Args:
        sentence: Sequence of tokens; element 0 of each token is the word
            and element 1 is its POS tag.
        i: Index of the token to describe.

    Returns:
        A list of feature strings.
    """
    word = sentence[i][0]
    postag = sentence[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]

    # Left context: one token back, then two tokens back when available.
    if i > 0:
        features.extend(_context_features('-1', sentence[i-1][0], sentence[i-1][1]))
        if i > 1:
            # All context features share the '<offset>:' prefix, including
            # isupper (previously emitted as '-2.word.isupper').
            features.extend(_context_features('-2', sentence[i-2][0], sentence[i-2][1]))
    else:
        features.append('BOS')

    # Right context: one token ahead, then two tokens ahead when available.
    if i < len(sentence)-1:
        features.extend(_context_features('+1', sentence[i+1][0], sentence[i+1][1]))
        if i < len(sentence)-2:
            features.extend(_context_features('+2', sentence[i+2][0], sentence[i+2][1]))
    else:
        features.append('EOS')
    return features




In [92]:
def define_features(sentence):
    """Return one feature list per token of ``sentence``, in token order."""
    per_token = []
    for position in range(len(sentence)):
        per_token.append(word2features(sentence, position))
    return per_token

def extract_labels(sentence):
    """Return the third element (the label) of every 3-tuple in ``sentence``."""
    labels = []
    for _token, _postag, label in sentence:
        labels.append(label)
    return labels

def extract_tokens(sentence):
    """Return the first element (the token) of every 3-tuple in ``sentence``."""
    tokens = []
    for token, _postag, _label in sentence:
        tokens.append(token)
    return tokens

In [93]:
# Feature sequences (X) and label sequences (y) for the training split.
X_train = list(map(define_features, train_set))
y_train = list(map(extract_labels, train_set))

# Same representation for the validation split.
X_valid = list(map(define_features, validation_set))
y_valid = list(map(extract_labels, validation_set))

Let's look at one sample to see what X_train looks like.

In [94]:
# Display the first training sentence: one list of feature strings per token.
X_train[0]

[['bias',
  'word.lower=eu',
  'word[-3:]=EU',
  'word[-2:]=EU',
  'word.isupper=True',
  'word.istitle=False',
  'word.isdigit=False',
  'postag=NNP',
  'postag[:2]=NN',
  'BOS',
  '+1:word.lower=rejects',
  '+1:word.istitle=False',
  '+1:word.isupper=False',
  '+1:postag=VBZ',
  '+1:postag[:2]=VB',
  '+2:word.lower=german',
  '+2:word.istitle=True',
  '+2:word.isupper=False',
  '+2:postag=JJ',
  '+2:postag[:2]=JJ'],
 ['bias',
  'word.lower=rejects',
  'word[-3:]=cts',
  'word[-2:]=ts',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'postag=VBZ',
  'postag[:2]=VB',
  '-1:word.lower=eu',
  '-1:word.istitle=False',
  '-1:word.isupper=True',
  '-1:postag=NNP',
  '-1:postag[:2]=NN',
  '+1:word.lower=german',
  '+1:word.istitle=True',
  '+1:word.isupper=False',
  '+1:postag=JJ',
  '+1:postag[:2]=JJ',
  '+2:word.lower=call',
  '+2:word.istitle=False',
  '+2:word.isupper=False',
  '+2:postag=NN',
  '+2:postag[:2]=NN'],
 ['bias',
  'word.lower=german',
  'word[-3:]=

#### The code below prepares training with pycrfsuite: first a pycrfsuite.Trainer is created, and then the training data is loaded into it.

In [95]:
import pycrfsuite
# Create the trainer that the tuning cell below configures and runs.
# NOTE(review): verbose=False presumably silences the training log -- confirm.
trainer = pycrfsuite.Trainer(verbose=False)

# Add each training sentence as a (feature sequence, label sequence) pair.
for x_sent, y_sent in zip(X_train, y_train):
    trainer.append(x_sent, y_sent)

#### This function computes the weighted F1 score over all entity labels (every label except 'O') for a set of predicted label sequences. 

In [96]:
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from collections import Counter
from sklearn.preprocessing import LabelBinarizer
def F1_classification_report(y_true, y_pred):
    """Return the weighted F1 score over every label except ``'O'``.

    Both arguments are iterables of label sequences (one sequence per
    sentence). They are flattened, binarized with a ``LabelBinarizer``
    fitted on the true labels only, and scored with
    ``f1_score(..., average='weighted')`` restricted to the columns of the
    non-``'O'`` classes.

    Note that, despite its name, this function returns the value produced
    by ``f1_score`` and does not print or return a classification report.

    Args:
        y_true: Gold label sequences.
        y_pred: Predicted label sequences.

    Returns:
        The result of ``f1_score`` for the selected labels.
    """
    binarizer = LabelBinarizer()
    true_matrix = binarizer.fit_transform(list(chain.from_iterable(y_true)))
    pred_matrix = binarizer.transform(list(chain.from_iterable(y_pred)))

    # Column index of each class in the binarized matrices.
    column_of = {}
    for column, cls in enumerate(binarizer.classes_):
        column_of[cls] = column

    # Entity tags only, ordered by entity type first and prefix second
    # (the key reverses the parts of e.g. 'B-ORG' into ['ORG', 'B']).
    entity_tags = sorted(
        set(binarizer.classes_) - {'O'},
        key=lambda tag: tag.split('-', 1)[::-1],
    )
    entity_columns = [column_of[tag] for tag in entity_tags]

    return f1_score(true_matrix, pred_matrix, labels = entity_columns, average='weighted')



#### The code below performs the parameter tuning and prints the F1 scores for each parameter setting, evaluated on the validation set, in ascending order. The parameter setting with the maximum F1 score is considered the best setting achieved on the validation set (eng.testa).

In [97]:
import itertools
import collections
import operator
# Maps "<l1>_<l2>_<max_iter>" to the weighted F1 score on the validation set.
F1_scores = {}

# Grid of candidate values; every combination is trained and evaluated.
l1_penalty = [0.5, 1, 1.5]
l2_penalty = [0.001, 0.005, 0.01]
max_iterations = [50, 100, 150]
for (l1, l2, max_iter) in itertools.product(l1_penalty, l2_penalty, max_iterations):
    print("Setting: l1 = ",l1,"l2 =", l2, "max_iter = ", max_iter)
    trainer.set_params({
        'c1': l1,   # coefficient for L1 penalty
        'c2': l2,  # coefficient for L2 penalty
        'max_iterations': max_iter,  # stop earlier
        'feature.possible_transitions': True # include transitions that are possible, but not observed
        })
    # Every setting writes to the same model file, overwriting the previous one.
    trainer.train('crf-conll2003.model')

    tagger = pycrfsuite.Tagger()
    # Tagger.open() returns a closing context manager; using it closes the
    # tagger after prediction instead of leaving one open tagger per setting.
    with tagger.open('crf-conll2003.model'):
        y_pred = [tagger.tag(xseq) for xseq in X_valid]

    key = "_".join([str(l1), str(l2), str(max_iter)])
    F1_scores[key] = F1_classification_report(y_valid, y_pred)

# Ascending by score, so the best setting is the last entry.
sorted_F1 = sorted(F1_scores.items(), key=operator.itemgetter(1))
print(sorted_F1)
print("Best setting:", sorted_F1[-1])





Setting: l1 =  0.5 l2 = 0.001 max_iter =  50
Setting: l1 =  0.5 l2 = 0.001 max_iter =  100
Setting: l1 =  0.5 l2 = 0.001 max_iter =  150
Setting: l1 =  0.5 l2 = 0.005 max_iter =  50
Setting: l1 =  0.5 l2 = 0.005 max_iter =  100
Setting: l1 =  0.5 l2 = 0.005 max_iter =  150
Setting: l1 =  0.5 l2 = 0.01 max_iter =  50
Setting: l1 =  0.5 l2 = 0.01 max_iter =  100
Setting: l1 =  0.5 l2 = 0.01 max_iter =  150
Setting: l1 =  1 l2 = 0.001 max_iter =  50
Setting: l1 =  1 l2 = 0.001 max_iter =  100
Setting: l1 =  1 l2 = 0.001 max_iter =  150
Setting: l1 =  1 l2 = 0.005 max_iter =  50
Setting: l1 =  1 l2 = 0.005 max_iter =  100
Setting: l1 =  1 l2 = 0.005 max_iter =  150
Setting: l1 =  1 l2 = 0.01 max_iter =  50
Setting: l1 =  1 l2 = 0.01 max_iter =  100
Setting: l1 =  1 l2 = 0.01 max_iter =  150
Setting: l1 =  1.5 l2 = 0.001 max_iter =  50
Setting: l1 =  1.5 l2 = 0.001 max_iter =  100
Setting: l1 =  1.5 l2 = 0.001 max_iter =  150
Setting: l1 =  1.5 l2 = 0.005 max_iter =  50
Setting: l1 =  1.5 l

#### Below given code reports the F1 score on the best parameter setting obtained on the validation set above. We want to see the difference in accuracies on validation set and the test set in order to evaluate the parameter settings. 

In [53]:
# The dataset has two held-out files: eng.testa was used above as the
# validation set, and eng.testb (loaded here) is the final test set.
test_set = read_data("eng.testb")



In [54]:
# Display the first test sentence as (word, postag, label) tuples.
test_set[0]

[('SOCCER', 'NN', 'O'),
 ('-', ':', 'O'),
 ('JAPAN', 'NNP', 'B-LOC'),
 ('GET', 'VB', 'O'),
 ('LUCKY', 'NNP', 'O'),
 ('WIN', 'NNP', 'O'),
 (',', ',', 'O'),
 ('CHINA', 'NNP', 'B-PER'),
 ('IN', 'IN', 'O'),
 ('SURPRISE', 'DT', 'O'),
 ('DEFEAT', 'NN', 'O'),
 ('.', '.', 'O')]

In [55]:
# Using the same procedure as above, extract the features for every sentence
# of the test set and separate the labels from them.
X_test = list(map(define_features, test_set))
y_test = list(map(extract_labels, test_set))

# Start from a fresh trainer and load the training sentences into it again.
trainer = pycrfsuite.Trainer(verbose=False)
for x_sent, y_sent in zip(X_train, y_train):
    trainer.append(x_sent, y_sent)

#### During hyperparameter tuning on the validation set, we found our best setting to be: l1 = 1.5, l2 = 0.005, max_iterations = 150. Thus, we will apply this setting to our test set. 

In [84]:
# Setting used for the final model. The next cell reads these three names to
# build its result key.
# NOTE: this rebinds l1_penalty / l2_penalty / max_iterations, which the
# tuning cell defines as lists of candidates, to single values.
l1_penalty,l2_penalty,max_iterations  = 1.5, 0.005, 150
trainer.set_params({
    'c1': l1_penalty,   # coefficient for L1 penalty
    'c2': l2_penalty,  # coefficient for L2 penalty
    'max_iterations': max_iterations,  # stop earlier
    'feature.possible_transitions': True # include transitions that are possible, but not observed
    })
# Train and write the model to the same file name used during tuning.
trainer.train('crf-conll2003.model')
# Bare expression: the value is read and discarded (nothing is displayed,
# because it is not the last expression of the cell).
trainer.logparser.last_iteration
# Open the trained model; the tagger stays open for the scoring cell below.
# The object returned by open() is the cell's displayed output.
tagger = pycrfsuite.Tagger()
tagger.open('crf-conll2003.model')

   

<contextlib.closing at 0x7f91f60d8048>

#### Model will be trained on the same set (eng.train), which is used for training above. 

In [85]:
# Tag every test sentence with the tagger opened in the previous cell.
y_predb = [tagger.tag(x_sent) for x_sent in X_test]

# Store the score under the same "<l1>_<l2>_<max_iter>" key format that the
# validation runs use.
key = "_".join(str(value) for value in (l1_penalty, l2_penalty, max_iterations))
F1_scores_test_set = {key: F1_classification_report(y_test, y_predb)}

sorted_F1 = sorted(F1_scores_test_set.items(), key=lambda item: item[1])
print(sorted_F1)


[('1.5_0.005_150', 0.7899300685849052)]


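### Conclusion: Thus we find that parameter tuning on the validation set gave us a weighted F1 score of 88.423%, and applying the best setting obtained from the validation set to the test set gave us a score of 80.504% (note: the run recorded in the output above shows a weighted F1 score of 78.99% on the test set). 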

### Thus we see that there is an improvement of approximately 17 percentage points (HMM: 61.81% to CRF: 78.78%) in the classification score. 