In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report

from nltk.tokenize.treebank import TreebankWordDetokenizer
%matplotlib inline

# Downloading and Preprocessing the Data

In [2]:
df = pd.read_csv('../data/ner_dataset.csv', encoding = "ISO-8859-1")
df.describe()

Unnamed: 0,Sentence #,Word,POS,Tag
count,47959,1048575,1048575,1048575
unique,47959,35178,42,17
top,Sentence: 46366,the,NN,O
freq,1,52573,145807,887908


In [3]:
df.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [4]:
print(df['Tag'].unique())

['O' 'B-geo' 'B-gpe' 'B-per' 'I-geo' 'B-org' 'I-org' 'B-tim' 'B-art'
 'I-art' 'I-per' 'I-gpe' 'I-tim' 'B-nat' 'B-eve' 'I-eve' 'I-nat']


In [5]:
df = df.fillna(method = 'ffill')

In [6]:
# This is a class te get sentence. The each sentence will be list of tuples with its tag and pos.
class sentence(object):
    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        self.empty = False
        agg = lambda s : [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                       s['POS'].values.tolist(),
                                                       s['Tag'].values.tolist())]
        self.grouped = self.df.groupby("Sentence #").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent +=1
            return s
        except:
            return None

In [7]:
#Displaying one full sentence
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [8]:
sent = getter.get_text()
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [9]:
sentences = getter.sentences

In [10]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [98]:
reduced_tag_set = ['B-geo', 'B-gpe', 'B-org', 'B-per', 'B-tim', 'I-geo',
                   'I-gpe', 'I-org', 'I-per', 'I-tim', 'O']

In [107]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
y = [[label if label in reduced_tag_set else 'O' for label in y_i] for y_i in y]  # reduce tag set

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.1)

In [109]:
crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         max_iterations = 100,
         all_possible_transitions = False)

# GMB-Blurry

In [110]:
import copy, random

def blur_labels(y, frac=0.25):
    y_new = []
    error_array = []
    
    counter = 0
    
    for i in range(len(y)):
        error_array.append(list())
        y_new.append(list())
        change_steps = 0
        change_to = None
        
        for j in range(len(y[i])):
            current_tag = y[i][j]
            if change_steps > 0:
                change_steps -= 1
                y_new[i].append(change_to)
                error_array[i].append(True)
                continue
            
            if current_tag.startswith('B') and np.random.random() < frac and j >= 3:
                new_tag = current_tag.replace('B', 'I')
                y_new[i][j - 3] = current_tag; error_array[i][j-3] = True
                y_new[i][j - 2] = new_tag; error_array[i][j-2] = True
                y_new[i][j - 1] = new_tag; error_array[i][j-1] = True
                y_new[i].append(new_tag); 
                error_array[i].append(False)
                
                if (len(y[i])-j)>3 and not(current_tag=='O') and y[i][j+1] == 'O' and np.random.random() < frac:
                    change_to = current_tag.replace('B', 'I')
                    change_steps = 3
                
                continue
            
            if (len(y[i])-j)>3 and not(current_tag=='O') and y[i][j+1] == 'O' and np.random.random() < frac:
                change_to = current_tag.replace('B', 'I')
                change_steps = 3
                y_new[i].append(current_tag); 
                error_array[i].append(False)
                continue
            
            error_array[i].append(False)
            y_new[i].append(current_tag)
        
    return y_new, error_array

In [111]:
y_train_new, error_train_array = blur_labels(y_train, frac=0.25)

In [112]:
t = sum([len(a) for a in error_train_array]); print('Num tags:', t)
e = sum([sum(a) for a in error_train_array]); print('Num errs:', e)
e/t

Num tags: 755410
Num errs: 86447


0.11443719304748415

In [119]:
crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)
f1_score = flat_f1_score(y_test, y_pred, average = 'macro')
print(f1_score)

0.8350112325620827


In [120]:
report = flat_classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       B-geo       0.86      0.91      0.89      7582
       B-gpe       0.97      0.94      0.95      3254
       B-org       0.81      0.75      0.78      3966
       B-per       0.86      0.82      0.84      3378
       B-tim       0.93      0.89      0.91      4058
       I-geo       0.84      0.79      0.81      1571
       I-gpe       0.87      0.38      0.53        34
       I-org       0.80      0.82      0.81      3256
       I-per       0.85      0.90      0.87      3499
       I-tim       0.82      0.78      0.80      1209
           O       0.99      0.99      0.99    176872

   micro avg       0.97      0.97      0.97    208679
   macro avg       0.87      0.81      0.84    208679
weighted avg       0.97      0.97      0.97    208679



In [113]:
crf.fit(X_train, y_train_new)
y_pred = crf.predict(X_test)
f1_score = flat_f1_score(y_test, y_pred, average = 'macro')
print(f1_score)

0.6137694734502105


# Understanding the Mistakes

### Seeing the training data

In [114]:
for i in range(10):
    print("-", end="")
    for w, word in enumerate(X_train[i]):
        word = word['word.lower()']
        if error_train_array[i][w]:
            print('*', end='')            
        if y_train_new[i][w] == 'O':
            print(word.lower(), end=' ')
        else:
            print(word.upper(), end=' ')            
    print()

-meanwhile , political *SOURCES *SAY *IRAQI PRIME *MINISTER *NURI *AL-MALIKI will announce his national reconciliation *PLAN *TO *PARLIAMENT SUNDAY . 
-she is alleged to have passed *CLASSIFIED *INFORMATION *TO PAKISTAN 's intelligence agency . 
-ZIEMER *, *A *RETIRED U.S. NAVY rear admiral , was previously *EXECUTIVE *DIRECTOR *OF WORLD RELIEF *, *A *PRIVATE AMERICAN disaster relief organization . 
-the STATE DEPARTMENT says the U.S. ambassador in NICARAGUA is meeting with all parties that have expressed an interest in a " democratic electoral process . " 
-the military had held DR. BESIGYE SINCE NOVEMBER , when he returned from self-imposed exile to run in next month 's elections against PRESIDENT YOWERI MUSEVENI . 
-they arranged to meet , and the thief was arrested . 
-however , FIFA officials say the horns *ARE *A *SOUTH AFRICAN tradition and will not be banned . 
-the PAKISTANI military launched *ITS *OFFENSIVE *IN ORAKZAI to hunt TALIBAN insurgents . 
-in a separate development 

### Seeing the predictions on the test data

In [115]:
for i in range(10):
    print("-", end="")
    for w, word in enumerate(X_test[i]):
        word = word['word.lower()']
        if y_pred[i][w] != y_test[i][w]:
            print('*', end='')
            
        if y_pred[i][w] == 'O':
            print(word.lower(), end=' ')
        else:
            print(word.upper(), end=' ')            
    print()

-the two reporters were kidnapped AUGUST 19 by an insurgent group calling itself the ISLAMIC ARMY *of IRAQ . 
-last month , the U.N. SECURITY COUNCIL adopted a resolution demanding that SYRIA cooperate with the investigation into the FEBRUARY assassination or face possible sanctions . 
-poor soils and inadequate water supplies hamper the development of agriculture . 
-IRANIAN authorities arrested BAUER *, *FATTAL *AND *SARAH SHOURD last year on charges of unlawfully crossing into IRANIAN territory from IRAQ . 
-it will be his first trip abroad since winning *re-election last week . 
-elsewhere in PAKISTAN 's tribal region , suspected militants released 50 of the 60 people kidnapped at gunpoint in KURRAM on SATURDAY . 
-ahead of his trip to the region , the military released NATIONAL LEAGUE FOR DEMOCRACY vice chairman TIN OO , after SEVEN years *of *detention . 
-in *YARACUY , troops surrounded the office of opposition governor EDUARDO LAPI , who vowed not to leave his post until final 

# Let's Try to Fix the Mistakes

For reference, the clean F1 is: **0.83** 

In [116]:
def measure_method(error_pred, error_array, X, y_corrected):
    # measure what percent of errors are fixed
    np = 0; nn=0; tp = 0; fp = 0;
    for i in range(len(error_pred)):
        for j in range(len(error_pred[i])):
            if error_pred[i][j] and error_pred[i][j] == error_array[i][j]:
                tp += 1
            elif error_pred[i][j]:
                fp += 1
            if error_array[i][j]:
                np += 1
            else:
                nn += 1

    
    print("TP errors detected: {}".format(tp/np))
    print("FP errors detected: {}".format(fp/nn)) 

    # measure accuracy
    crf.fit(X, y_corrected)
    y_pred = crf.predict(X_test)
    f1_score = flat_f1_score(y_test, y_pred, average = 'macro')
    print("F1 score on trained model: {}".format(f1_score))
    
    report = flat_classification_report(y_test, y_pred)
    print(report)

### No fix

In [117]:
error_pred = [[False]*len(i) for i in error_train_array]

In [118]:
measure_method(error_pred, error_train_array, X_train, y_train_new)

TP errors detected: 0.0
FP errors detected: 0.0
F1 score on trained model: 0.6137694734502105
              precision    recall  f1-score   support

       B-geo       0.83      0.70      0.76      7582
       B-gpe       0.93      0.80      0.86      3254
       B-org       0.78      0.59      0.67      3966
       B-per       0.82      0.65      0.73      3378
       B-tim       0.86      0.72      0.78      4058
       I-geo       0.19      0.80      0.31      1571
       I-gpe       0.01      0.47      0.02        34
       I-org       0.44      0.77      0.56      3256
       I-per       0.54      0.86      0.66      3499
       I-tim       0.29      0.77      0.43      1209
           O       0.99      0.94      0.97    176872

   micro avg       0.91      0.91      0.91    208679
   macro avg       0.61      0.73      0.61    208679
weighted avg       0.95      0.91      0.92    208679



### K-Fold

In [163]:
n = len(X_train)    
X1 = X_train[:n//2]; y1 = y_train_new[:n//2]
X2 = X_train[n//2:]; y2 = y_train_new[n//2:]

crf.fit(X2, y2)
y1_pred = crf.predict(X1)

error_pred = []

for i in range(len(y1_pred)):    
    error_pred.append([])
    
    for j in range(len(y1_pred[i])):
        if not(y1_pred[i][j]==y1[i][j]):
            error_pred[i].append(True)
        else:
            error_pred[i].append(False)

crf.fit(X1, y1)
y2_pred = crf.predict(X2)

for i in range(len(y1_pred)):    
    error_pred.append([])
    
    for j in range(len(y2_pred[i])):
        if not(y2_pred[i][j]==y2[i][j]):
            error_pred[n//2 + i].append(True)
        else:
            error_pred[n//2 + i].append(False)

y_pred = y1_pred + y2_pred

In [164]:
measure_method(error_pred, error_train_array, X_train, y_pred)

TP errors detected: 0.0
FP errors detected: 0.0
F1 score on trained model: 0.11501441942759749
              precision    recall  f1-score   support

       B-geo       0.00      0.00      0.00      7582
       B-gpe       0.12      0.23      0.16      3254
       B-org       0.27      0.03      0.06      3966
       B-per       0.39      0.29      0.33      3378
       B-tim       0.35      0.24      0.29      4058
       I-geo       0.00      0.00      0.00      1571
       I-gpe       0.00      0.50      0.00        34
       I-org       0.08      0.14      0.10      3256
       I-per       0.19      0.67      0.30      3499
       I-tim       0.01      0.72      0.03      1209
           O       1.00      0.00      0.00    176872

   micro avg       0.03      0.03      0.03    208679
   macro avg       0.22      0.26      0.12    208679
weighted avg       0.87      0.03      0.02    208679



## Trained on Validation

In [147]:
crf.fit(X_val, y_val)
y_pred = crf.predict(X_train)

error_pred = []
y_corrected = []

for i in range(len(y_pred)):    
    error_pred.append([])
    y_corrected.append([])
    
    for j in range(len(y_pred[i])):
        if not(y_pred[i][j]==y_train_new[i][j]):
            error_pred[i].append(True)
        else:
            error_pred[i].append(False)

In [148]:
measure_method(error_pred, error_train_array, X_train, y_pred)

TP errors detected: 0.9828681157240853
FP errors detected: 0.03904999230151742
F1 score on trained model: 0.7130694398505978
              precision    recall  f1-score   support

       B-geo       0.77      0.86      0.81      7582
       B-gpe       0.72      0.90      0.80      3254
       B-org       0.70      0.59      0.64      3966
       B-per       0.68      0.71      0.70      3378
       B-tim       0.86      0.79      0.82      4058
       I-geo       0.66      0.69      0.67      1571
       I-gpe       0.80      0.24      0.36        34
       I-org       0.63      0.70      0.66      3256
       I-per       0.66      0.92      0.77      3499
       I-tim       0.60      0.64      0.62      1209
           O       0.99      0.98      0.98    176872

   micro avg       0.95      0.95      0.95    208679
   macro avg       0.73      0.73      0.71    208679
weighted avg       0.95      0.95      0.95    208679



_What about just the val model?_

In [162]:
crf.fit(X_val, y_val)
y_pred = crf.predict(X_test)
f1_score = flat_f1_score(y_test, y_pred, average = 'macro')
print("F1 score on trained model: {}".format(f1_score))
report = flat_classification_report(y_test, y_pred)

F1 score on trained model: 0.5412043852859015


### GTC with X, y

In [149]:
crf.fit(X_train, y_train_new)
y_val_pred = crf.predict(X_val)

In [150]:
correction_network_input = []

for i in range(len(y_val_pred)):
    correction_network_input.append([])
    for j in range(len(y_val_pred[i])):
        correction_network_input[i].append(X_val[i][j])
        correction_network_input[i][j]['y'] = y_val_pred[i][j]     

In [151]:
crf.fit(correction_network_input, y_val)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [152]:
X_expanded = []

for i in range(len(y_train_new)):
    X_expanded.append([])
    for j in range(len(y_train_new[i])):
        X_expanded[i].append(X_train[i][j])
        X_expanded[i][j]['y'] = y_train_new[i][j]     

In [153]:
# Go from X_expanded to X_corrected
y_corrected = crf.predict(X_expanded)

In [154]:
error_pred = []

for i in range(len(y_corrected)):    
    error_pred.append([])    
    for j in range(len(y_corrected[i])):
        if not(y_corrected[i][j]==y_train_new[i][j]):
            error_pred[i].append(True)
        else:
            error_pred[i].append(False)

In [155]:
measure_method(error_pred, error_train_array, X_train, y_corrected)

TP errors detected: 0.9828681157240853
FP errors detected: 0.03904999230151742
F1 score on trained model: 0.7130694398505978
              precision    recall  f1-score   support

       B-geo       0.77      0.86      0.81      7582
       B-gpe       0.72      0.90      0.80      3254
       B-org       0.70      0.59      0.64      3966
       B-per       0.68      0.71      0.70      3378
       B-tim       0.86      0.79      0.82      4058
       I-geo       0.66      0.69      0.67      1571
       I-gpe       0.80      0.24      0.36        34
       I-org       0.63      0.70      0.66      3256
       I-per       0.66      0.92      0.77      3499
       I-tim       0.60      0.64      0.62      1209
           O       0.99      0.98      0.98    176872

   micro avg       0.95      0.95      0.95    208679
   macro avg       0.73      0.73      0.71    208679
weighted avg       0.95      0.95      0.95    208679



### GTC with X, y, neighboring y

In [156]:
correction_network_input = []

for i in range(len(y_val_pred)):
    correction_network_input.append([])
    for j in range(len(y_val_pred[i])):
        correction_network_input[i].append(X_val[i][j])
        correction_network_input[i][j]['y'] = y_val_pred[i][j]     
        if j >= 1:
            correction_network_input[i][j]['y-1'] = y_val_pred[i][j-1]
        else:
            correction_network_input[i][j]['y-1'] = 'N'
        if j >= 2:
            correction_network_input[i][j]['y-2'] = y_val_pred[i][j-2]
        else:
            correction_network_input[i][j]['y-2'] = 'N'
        if j < len(y_val_pred[i]) - 1:
            correction_network_input[i][j]['y+1'] = y_val_pred[i][j+1]
        else:
            correction_network_input[i][j]['y+1'] = 'N'
        if j < len(y_val_pred[i]) - 2:
            correction_network_input[i][j]['y+2'] = y_val_pred[i][j+2]
        else:
            correction_network_input[i][j]['y+2'] = 'N'

In [157]:
crf.fit(correction_network_input, y_val)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [158]:
X_expanded = []

for i in range(len(y_train_new)):
    X_expanded.append([])
    for j in range(len(y_train_new[i])):
        X_expanded[i].append(X_train[i][j])
        X_expanded[i][j]['y'] = y_train_new[i][j]     
        if j >= 1:
            X_expanded[i][j]['y-1'] = y_train_new[i][j-1]
        else:
            X_expanded[i][j]['y-1'] = 'N'
        if j >= 2:
            X_expanded[i][j]['y-2'] = y_train_new[i][j-2]
        else:
            X_expanded[i][j]['y-2'] = 'N'
        if j < len(y_train_new[i]) - 1:
            X_expanded[i][j]['y+1'] = y_train_new[i][j+1]
        else:
            X_expanded[i][j]['y+1'] = 'N'
        if j < len(y_train_new[i]) - 2:
            X_expanded[i][j]['y+2'] = y_train_new[i][j+2]
        else:
            X_expanded[i][j]['y+2'] = 'N'

In [159]:
# Go from X_expanded to X_corrected
y_corrected = crf.predict(X_expanded)

In [160]:
error_pred = []

for i in range(len(y_corrected)):    
    error_pred.append([])    
    for j in range(len(y_corrected[i])):
        if not(y_corrected[i][j]==y_train_new[i][j]):
            error_pred[i].append(True)
        else:
            error_pred[i].append(False)

In [161]:
measure_method(error_pred, error_train_array, X_train, y_corrected)

TP errors detected: 0.9828681157240853
FP errors detected: 0.03904999230151742
F1 score on trained model: 0.7130694398505978
              precision    recall  f1-score   support

       B-geo       0.77      0.86      0.81      7582
       B-gpe       0.72      0.90      0.80      3254
       B-org       0.70      0.59      0.64      3966
       B-per       0.68      0.71      0.70      3378
       B-tim       0.86      0.79      0.82      4058
       I-geo       0.66      0.69      0.67      1571
       I-gpe       0.80      0.24      0.36        34
       I-org       0.63      0.70      0.66      3256
       I-per       0.66      0.92      0.77      3499
       I-tim       0.60      0.64      0.62      1209
           O       0.99      0.98      0.98    176872

   micro avg       0.95      0.95      0.95    208679
   macro avg       0.73      0.73      0.71    208679
weighted avg       0.95      0.95      0.95    208679



### GTC with y, neighboring y only

In [142]:
correction_network_input = []

for i in range(len(y_val_pred)):
    correction_network_input.append([])
    for j in range(len(y_val_pred[i])):
        correction_network_input[i].append(dict())
        correction_network_input[i][j]['y'] = y_val_pred[i][j]     
        if j >= 1:
            correction_network_input[i][j]['y-1'] = y_val_pred[i][j-1]
        else:
            correction_network_input[i][j]['y-1'] = 'N'
        if j >= 2:
            correction_network_input[i][j]['y-2'] = y_val_pred[i][j-2]
        else:
            correction_network_input[i][j]['y-2'] = 'N'
        if j < len(y_val_pred[i]) - 1:
            correction_network_input[i][j]['y+1'] = y_val_pred[i][j+1]
        else:
            correction_network_input[i][j]['y+1'] = 'N'
        if j < len(y_val_pred[i]) - 2:
            correction_network_input[i][j]['y+2'] = y_val_pred[i][j+2]
        else:
            correction_network_input[i][j]['y+2'] = 'N'
            
crf.fit(correction_network_input, y_val)

X_expanded = []

for i in range(len(y_train_new)):
    X_expanded.append([])
    for j in range(len(y_train_new[i])):
        X_expanded[i].append(dict())
        X_expanded[i][j]['y'] = y_train_new[i][j]     
        if j >= 1:
            X_expanded[i][j]['y-1'] = y_train_new[i][j-1]
        else:
            X_expanded[i][j]['y-1'] = 'N'
        if j >= 2:
            X_expanded[i][j]['y-2'] = y_train_new[i][j-2]
        else:
            X_expanded[i][j]['y-2'] = 'N'
        if j < len(y_train_new[i]) - 1:
            X_expanded[i][j]['y+1'] = y_train_new[i][j+1]
        else:
            X_expanded[i][j]['y+1'] = 'N'
        if j < len(y_train_new[i]) - 2:
            X_expanded[i][j]['y+2'] = y_train_new[i][j+2]
        else:
            X_expanded[i][j]['y+2'] = 'N'
            
# Go from X_expanded to X_corrected
y_corrected = crf.predict(X_expanded)

error_pred = []

for i in range(len(y_corrected)):    
    error_pred.append([])    
    for j in range(len(y_corrected[i])):
        if not(y_corrected[i][j]==y_train_new[i][j]):
            error_pred[i].append(True)
        else:
            error_pred[i].append(False)
            
measure_method(error_pred, error_train_array, X_train, y_corrected)

TP errors detected: 0.46778951264936897
FP errors detected: 0.012821336905030623
F1 score on trained model: 0.010681488434638484
              precision    recall  f1-score   support

       B-geo       0.00      0.00      0.00      7582
       B-gpe       0.00      0.00      0.00      3254
       B-org       0.06      0.10      0.07      3966
       B-per       0.10      0.00      0.00      3378
       B-tim       0.00      0.00      0.00      4058
       I-geo       0.00      0.00      0.00      1571
       I-gpe       0.00      0.00      0.00        34
       I-org       0.02      0.99      0.03      3256
       I-per       0.11      0.00      0.00      3499
       I-tim       0.00      0.00      0.00      1209
           O       0.85      0.00      0.01    176872

   micro avg       0.02      0.02      0.02    208679
   macro avg       0.10      0.10      0.01    208679
weighted avg       0.72      0.02      0.01    208679



### Step Back and Analyze What's Going On

Possible problems:
* Debug Kfold
* Should do on X_train predictions, not X_train itself
* Should decrease size of validation dataset
* Try with much simpler