In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report

from nltk.tokenize.treebank import TreebankWordDetokenizer
%matplotlib inline

# Downloading and Preprocessing the Data

In [2]:
df = pd.read_csv('../data/ner_dataset.csv', encoding = "ISO-8859-1")
df.describe()

Unnamed: 0,Sentence #,Word,POS,Tag
count,47959,1048575,1048575,1048575
unique,47959,35178,42,17
top,Sentence: 46488,the,NN,O
freq,1,52573,145807,887908


In [3]:
df.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [4]:
print(df['Tag'].unique())

['O' 'B-geo' 'B-gpe' 'B-per' 'I-geo' 'B-org' 'I-org' 'B-tim' 'B-art'
 'I-art' 'I-per' 'I-gpe' 'I-tim' 'B-nat' 'B-eve' 'I-eve' 'I-nat']


In [5]:
df = df.fillna(method = 'ffill')

In [6]:
# This is a class te get sentence. The each sentence will be list of tuples with its tag and pos.
class sentence(object):
    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        self.empty = False
        agg = lambda s : [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                       s['POS'].values.tolist(),
                                                       s['Tag'].values.tolist())]
        self.grouped = self.df.groupby("Sentence #").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent +=1
            return s
        except:
            return None

In [7]:
#Displaying one full sentence
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [8]:
sent = getter.get_text()
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [9]:
sentences = getter.sentences

In [10]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [11]:
reduced_tag_set = ['B-geo', 'B-gpe', 'B-org', 'B-per', 'B-tim', 'I-geo',
                   'I-gpe', 'I-org', 'I-per', 'I-tim', 'O']

In [12]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
y = [[label if label in reduced_tag_set else 'O' for label in y_i] for y_i in y]  # reduce tag set

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.025)

In [14]:
crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         max_iterations = 100,
         all_possible_transitions = False)

# GMB-Simple-Blurry

In [15]:
import copy, random

def blur_labels(y, frac=0.25):
    y_new = []
    error_array = []
    
    counter = 0
    
    for i in range(len(y)):
        error_array.append(list())
        y_new.append(list())
        change_steps = 0
        change_to = None
        
        for j in range(len(y[i])):
            
            current_tag = y[i][j]
            if current_tag=='B-geo' and  j >= 3:
                for k in range(3):
                    y_new[i][j - k - 1] = current_tag
                    error_array[i][j - k - 1] = True
                
                y_new[i].append(current_tag)
                error_array[i].append(False)
            else:
                error_array[i].append(False)
                y_new[i].append(current_tag)
        
    return y_new, error_array

In [16]:
y_train_new, error_train_array = blur_labels(y_train)

In [17]:
t = sum([len(a) for a in error_train_array]); print('Num tags:', t)
e = sum([sum(a) for a in error_train_array]); print('Num errs:', e)
e/t

Num tags: 818271
Num errs: 72069


0.0880747331874159

In [None]:
crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)
f1_score = flat_f1_score(y_test, y_pred, average = 'macro')
print(f1_score)

In [None]:
report = flat_classification_report(y_test, y_pred)
print(report)

In [None]:
crf.fit(X_train, y_train_new)
y_pred = crf.predict(X_test)
f1_score = flat_f1_score(y_test, y_pred, average = 'macro')
print(f1_score)

In [None]:
report = flat_classification_report(y_test, y_pred)
print(report)

# Understanding the Mistakes

### Seeing the training data

In [None]:
for i in range(10):
    print("-", end="")
    for w, word in enumerate(X_train[i]):
        word = word['word.lower()']
        if error_train_array[i][w]:
            print('*', end='')            
        if y_train_new[i][w] == 'O':
            print(word.lower(), end=' ')
        else:
            print(word.upper(), end=' ')            
    print()

### Seeing the predictions on the test data

In [None]:
for i in range(10):
    print("-", end="")
    for w, word in enumerate(X_test[i]):
        word = word['word.lower()']
        if y_pred[i][w] != y_test[i][w]:
            print('*', end='')
            
        if y_pred[i][w] == 'O':
            print(word.lower(), end=' ')
        else:
            print(word.upper(), end=' ')            
    print()

# Let's Try to Fix the Mistakes

For reference, the clean F1 is: **0.84** 

In [None]:
def measure_method(error_pred, error_array, X, y_corrected):
    # measure what percent of errors are fixed
    np = 0; nn=0; tp = 0; fp = 0;
    for i in range(len(error_pred)):
        for j in range(len(error_pred[i])):
            if error_pred[i][j] and error_pred[i][j] == error_array[i][j]:
                tp += 1
            elif error_pred[i][j]:
                fp += 1
            if error_array[i][j]:
                np += 1
            else:
                nn += 1

    
    print("TP errors detected: {}".format(tp/np))
    print("FP errors detected: {}".format(fp/nn)) 

    # measure accuracy
    crf.fit(X, y_corrected)
    y_pred = crf.predict(X_test)
    f1_score = flat_f1_score(y_test, y_pred, average = 'macro')
    print("F1 score on trained model: {}".format(f1_score))
    
    report = flat_classification_report(y_test, y_pred)
    print(report)
    
    return tp/np, fp/nn, f1_score

### K-Fold

In [43]:
n = len(X_train)    
X1 = X_train[:n//2]; y1 = y_train_new[:n//2]
X2 = X_train[n//2:]; y2 = y_train_new[n//2:]

crf.fit(X2, y2)
y1_pred = crf.predict(X1)

error_pred = []

for i in range(len(y1_pred)):    
    error_pred.append([])
    
    for j in range(len(y1_pred[i])):
        if not(y1_pred[i][j]==y1[i][j]):
            error_pred[i].append(True)
        else:
            error_pred[i].append(False)

crf.fit(X1, y1)
y2_pred = crf.predict(X2)

for i in range(len(y1_pred)):    
    error_pred.append([])
    
    for j in range(len(y2_pred[i])):
        if not(y2_pred[i][j]==y2[i][j]):
            error_pred[n//2 + i].append(True)
        else:
            error_pred[n//2 + i].append(False)

y_pred = y1_pred + y2_pred

In [44]:
measure_method(error_pred, error_train_array, X_train, y_pred)

TP errors detected: 0.2863837506527774
FP errors detected: 0.05513550911282526
F1 score on trained model: 0.7485058716596803
              precision    recall  f1-score   support

       B-geo       0.27      0.85      0.40      7421
       B-gpe       0.95      0.88      0.92      3237
       B-org       0.73      0.69      0.71      3989
       B-per       0.81      0.77      0.79      3327
       B-tim       0.91      0.79      0.85      4076
       I-geo       0.86      0.55      0.67      1482
       I-gpe       0.95      0.47      0.63        38
       I-org       0.74      0.72      0.73      3287
       I-per       0.81      0.87      0.84      3353
       I-tim       0.85      0.66      0.74      1314
           O       0.99      0.91      0.95    177169

   micro avg       0.89      0.89      0.89    208693
   macro avg       0.81      0.74      0.75    208693
weighted avg       0.95      0.89      0.91    208693



## Trained on Validation

In [45]:
crf.fit(X_val, y_val)
y_pred = crf.predict(X_train)

error_pred = []
y_corrected = []

for i in range(len(y_pred)):    
    error_pred.append([])
    y_corrected.append([])
    
    for j in range(len(y_pred[i])):
        if not(y_pred[i][j]==y_train_new[i][j]):
            error_pred[i].append(True)
        else:
            error_pred[i].append(False)

In [46]:
measure_method(error_pred, error_train_array, X_train, y_pred)

TP errors detected: 0.9529450567572767
FP errors detected: 0.04436393767488882
F1 score on trained model: 0.7027222300485211
              precision    recall  f1-score   support

       B-geo       0.78      0.87      0.82      7421
       B-gpe       0.92      0.89      0.90      3237
       B-org       0.71      0.62      0.66      3989
       B-per       0.77      0.70      0.73      3327
       B-tim       0.89      0.76      0.82      4076
       I-geo       0.75      0.63      0.69      1482
       I-gpe       0.00      0.00      0.00        38
       I-org       0.66      0.67      0.67      3287
       I-per       0.78      0.86      0.82      3353
       I-tim       0.81      0.52      0.63      1314
           O       0.99      0.99      0.99    177169

   micro avg       0.96      0.96      0.96    208693
   macro avg       0.73      0.68      0.70    208693
weighted avg       0.96      0.96      0.95    208693



_What about just the val model?_

In [64]:
crf.fit(X_val, y_val)
y_pred = crf.predict(X_test)
f1_score = flat_f1_score(y_test, y_pred, average = 'macro')
print("F1 score on trained model: {}".format(f1_score))
report = flat_classification_report(y_test, y_pred)
print(report)

F1 score on trained model: 0.4749235730245637
              precision    recall  f1-score   support

       B-geo       0.72      0.57      0.64      7421
       B-gpe       0.68      0.59      0.63      3237
       B-org       0.28      0.44      0.34      3989
       B-per       0.53      0.65      0.59      3327
       B-tim       0.69      0.59      0.64      4076
       I-geo       0.44      0.40      0.42      1482
       I-gpe       0.00      0.00      0.00        38
       I-org       0.06      0.65      0.11      3287
       I-per       0.56      0.82      0.67      3353
       I-tim       0.24      0.45      0.31      1314
           O       0.99      0.79      0.88    177169

   micro avg       0.76      0.76      0.76    208693
   macro avg       0.47      0.54      0.47    208693
weighted avg       0.92      0.76      0.83    208693



### GTC with X, y

In [50]:
crf.fit(X_train, y_train_new)
y_val_pred = crf.predict(X_val)

In [51]:
correction_network_input = []

for i in range(len(y_val_pred)):
    correction_network_input.append([])
    for j in range(len(y_val_pred[i])):
        correction_network_input[i].append(X_val[i][j])
        correction_network_input[i][j]['y'] = y_val_pred[i][j]     

In [52]:
crf.fit(correction_network_input, y_val)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [53]:
X_expanded = []

for i in range(len(y_train_new)):
    X_expanded.append([])
    for j in range(len(y_train_new[i])):
        X_expanded[i].append(X_train[i][j])
        X_expanded[i][j]['y'] = y_train_new[i][j]     

In [54]:
# Go from X_expanded to X_corrected
y_corrected = crf.predict(X_expanded)

In [55]:
error_pred = []

for i in range(len(y_corrected)):    
    error_pred.append([])    
    for j in range(len(y_corrected[i])):
        if not(y_corrected[i][j]==y_train_new[i][j]):
            error_pred[i].append(True)
        else:
            error_pred[i].append(False)

In [56]:
measure_method(error_pred, error_train_array, X_train, y_corrected)

TP errors detected: 0.9465959376631944
FP errors detected: 0.015689881487897392
F1 score on trained model: 0.5825781856981149
              precision    recall  f1-score   support

       B-geo       0.75      0.79      0.77      7421
       B-gpe       0.76      0.64      0.70      3237
       B-org       0.54      0.59      0.57      3989
       B-per       0.72      0.63      0.67      3327
       B-tim       0.65      0.69      0.67      4076
       I-geo       0.57      0.48      0.52      1482
       I-gpe       0.00      0.00      0.00        38
       I-org       0.41      0.70      0.52      3287
       I-per       0.72      0.77      0.74      3353
       I-tim       0.18      0.61      0.28      1314
           O       0.99      0.96      0.97    177169

   micro avg       0.92      0.92      0.92    208693
   macro avg       0.57      0.62      0.58    208693
weighted avg       0.93      0.92      0.92    208693



### GTC with X, y, neighboring y

In [57]:
crf.fit(X_train, y_train_new)
y_val_pred = crf.predict(X_val)

In [58]:
correction_network_input = []

for i in range(len(y_val_pred)):
    correction_network_input.append([])
    for j in range(len(y_val_pred[i])):
        correction_network_input[i].append(X_val[i][j])
        correction_network_input[i][j]['y'] = y_val_pred[i][j]     
        if j >= 1:
            correction_network_input[i][j]['y-1'] = y_val_pred[i][j-1]
        else:
            correction_network_input[i][j]['y-1'] = 'N'
        if j >= 2:
            correction_network_input[i][j]['y-2'] = y_val_pred[i][j-2]
        else:
            correction_network_input[i][j]['y-2'] = 'N'
        if j < len(y_val_pred[i]) - 1:
            correction_network_input[i][j]['y+1'] = y_val_pred[i][j+1]
        else:
            correction_network_input[i][j]['y+1'] = 'N'
        if j < len(y_val_pred[i]) - 2:
            correction_network_input[i][j]['y+2'] = y_val_pred[i][j+2]
        else:
            correction_network_input[i][j]['y+2'] = 'N'

In [59]:
crf.fit(correction_network_input, y_val)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [60]:
X_expanded = []

for i in range(len(y_train_new)):
    X_expanded.append([])
    for j in range(len(y_train_new[i])):
        X_expanded[i].append(X_train[i][j])
        X_expanded[i][j]['y'] = y_train_new[i][j]     
        if j >= 1:
            X_expanded[i][j]['y-1'] = y_train_new[i][j-1]
        else:
            X_expanded[i][j]['y-1'] = 'N'
        if j >= 2:
            X_expanded[i][j]['y-2'] = y_train_new[i][j-2]
        else:
            X_expanded[i][j]['y-2'] = 'N'
        if j < len(y_train_new[i]) - 1:
            X_expanded[i][j]['y+1'] = y_train_new[i][j+1]
        else:
            X_expanded[i][j]['y+1'] = 'N'
        if j < len(y_train_new[i]) - 2:
            X_expanded[i][j]['y+2'] = y_train_new[i][j+2]
        else:
            X_expanded[i][j]['y+2'] = 'N'

In [61]:
# Go from X_expanded to X_corrected
y_corrected = crf.predict(X_expanded)

In [62]:
error_pred = []

for i in range(len(y_corrected)):    
    error_pred.append([])    
    for j in range(len(y_corrected[i])):
        if not(y_corrected[i][j]==y_train_new[i][j]):
            error_pred[i].append(True)
        else:
            error_pred[i].append(False)

In [63]:
measure_method(error_pred, error_train_array, X_train, y_corrected)

TP errors detected: 0.948341258279966
FP errors detected: 0.015313120532667588


  'precision', 'predicted', average, warn_for)


F1 score on trained model: 0.5075486025751861


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-geo       0.72      0.70      0.71      7421
       B-gpe       0.67      0.59      0.63      3237
       B-org       0.37      0.51      0.43      3989
       B-per       0.59      0.64      0.61      3327
       B-tim       0.64      0.65      0.65      4076
       I-geo       0.55      0.39      0.45      1482
       I-gpe       0.00      0.00      0.00        38
       I-org       0.11      0.69      0.19      3287
       I-per       0.59      0.83      0.69      3353
       I-tim       0.20      0.55      0.29      1314
           O       0.99      0.87      0.93    177169

   micro avg       0.84      0.84      0.84    208693
   macro avg       0.49      0.58      0.51    208693
weighted avg       0.92      0.84      0.87    208693



### GTC with y, neighboring y only

In [49]:
crf.fit(X_train, y_train_new)
y_val_pred = crf.predict(X_val)

correction_network_input = []

for i in range(len(y_val_pred)):
    correction_network_input.append([])
    for j in range(len(y_val_pred[i])):
        correction_network_input[i].append(dict())
        correction_network_input[i][j]['y'] = y_val_pred[i][j]     
        if j >= 1:
            correction_network_input[i][j]['y-1'] = y_val_pred[i][j-1]
        else:
            correction_network_input[i][j]['y-1'] = 'N'
        if j >= 2:
            correction_network_input[i][j]['y-2'] = y_val_pred[i][j-2]
        else:
            correction_network_input[i][j]['y-2'] = 'N'
        if j < len(y_val_pred[i]) - 1:
            correction_network_input[i][j]['y+1'] = y_val_pred[i][j+1]
        else:
            correction_network_input[i][j]['y+1'] = 'N'
        if j < len(y_val_pred[i]) - 2:
            correction_network_input[i][j]['y+2'] = y_val_pred[i][j+2]
        else:
            correction_network_input[i][j]['y+2'] = 'N'
            
crf.fit(correction_network_input, y_val)

X_expanded = []

for i in range(len(y_train_new)):
    X_expanded.append([])
    for j in range(len(y_train_new[i])):
        X_expanded[i].append(dict())
        X_expanded[i][j]['y'] = y_train_new[i][j]     
        if j >= 1:
            X_expanded[i][j]['y-1'] = y_train_new[i][j-1]
        else:
            X_expanded[i][j]['y-1'] = 'N'
        if j >= 2:
            X_expanded[i][j]['y-2'] = y_train_new[i][j-2]
        else:
            X_expanded[i][j]['y-2'] = 'N'
        if j < len(y_train_new[i]) - 1:
            X_expanded[i][j]['y+1'] = y_train_new[i][j+1]
        else:
            X_expanded[i][j]['y+1'] = 'N'
        if j < len(y_train_new[i]) - 2:
            X_expanded[i][j]['y+2'] = y_train_new[i][j+2]
        else:
            X_expanded[i][j]['y+2'] = 'N'
            
# Go from X_expanded to X_corrected
y_corrected = crf.predict(X_expanded)

error_pred = []

for i in range(len(y_corrected)):    
    error_pred.append([])    
    for j in range(len(y_corrected[i])):
        if not(y_corrected[i][j]==y_train_new[i][j]):
            error_pred[i].append(True)
        else:
            error_pred[i].append(False)
            
measure_method(error_pred, error_train_array, X_train, y_corrected)

TP errors detected: 1.0
FP errors detected: 0.0033171053496033284
F1 score on trained model: 0.8202294192925398
              precision    recall  f1-score   support

       B-geo       0.86      0.83      0.84      7421
       B-gpe       0.97      0.91      0.94      3237
       B-org       0.81      0.68      0.74      3989
       B-per       0.85      0.79      0.82      3327
       B-tim       0.92      0.83      0.87      4076
       I-geo       0.83      0.73      0.77      1482
       I-gpe       0.91      0.53      0.67        38
       I-org       0.79      0.73      0.76      3287
       I-per       0.84      0.87      0.85      3353
       I-tim       0.84      0.72      0.77      1314
           O       0.98      0.99      0.99    177169

   micro avg       0.96      0.96      0.96    208693
   macro avg       0.87      0.78      0.82    208693
weighted avg       0.96      0.96      0.96    208693



### How do results change based on validation data size?