In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report

from nltk.tokenize.treebank import TreebankWordDetokenizer
%matplotlib inline

# Downloading and Preprocessing the Data

In [2]:
df = pd.read_csv('../data/ner_dataset.csv', encoding = "ISO-8859-1")
df.describe()

Unnamed: 0,Sentence #,Word,POS,Tag
count,47959,1048575,1048575,1048575
unique,47959,35178,42,17
top,Sentence: 34297,the,NN,O
freq,1,52573,145807,887908


In [3]:
df.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [4]:
print(df['Tag'].unique())

['O' 'B-geo' 'B-gpe' 'B-per' 'I-geo' 'B-org' 'I-org' 'B-tim' 'B-art'
 'I-art' 'I-per' 'I-gpe' 'I-tim' 'B-nat' 'B-eve' 'I-eve' 'I-nat']


In [5]:
df = df.fillna(method = 'ffill')

In [6]:
# This is a class te get sentence. The each sentence will be list of tuples with its tag and pos.
class sentence(object):
    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        self.empty = False
        agg = lambda s : [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                       s['POS'].values.tolist(),
                                                       s['Tag'].values.tolist())]
        self.grouped = self.df.groupby("Sentence #").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent +=1
            return s
        except:
            return None

In [7]:
#Displaying one full sentence
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [8]:
sent = getter.get_text()
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [9]:
sentences = getter.sentences

In [10]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [11]:
reduced_tag_set = ['B-geo', 'B-gpe', 'B-org', 'B-per', 'B-tim', 'I-geo',
                   'I-gpe', 'I-org', 'I-per', 'I-tim', 'O']

In [12]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
y = [[label if label in reduced_tag_set else 'O' for label in y_i] for y_i in y]  # reduce tag set

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.025)

In [14]:
crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         max_iterations = 100,
         all_possible_transitions = False)

# GMB-Simple-Blurry

In [15]:
def blur_labels(y, frac):
    y_new = []
    error_array = []
    
    counter = 0
    
    for i in range(len(y)):
        error_array.append(list())
        y_new.append(list())
        change_steps = 0
        change_to = None
        
        for j in range(len(y[i])):
            
            current_tag = y[i][j]
            if current_tag=='B-geo' and  j >= 3 and np.random.random() < frac:
                for k in range(3):
                    y_new[i][j - k - 1] = current_tag
                    error_array[i][j - k - 1] = True
                
                y_new[i].append(current_tag)
                error_array[i].append(False)
            else:
                error_array[i].append(False)
                y_new[i].append(current_tag)
        
    return y_new, error_array

In [16]:
y_train_new, error_train_array = blur_labels(y_train, frac=0.67)

In [17]:
t = sum([len(a) for a in error_train_array]); print('Num tags:', t)
e = sum([sum(a) for a in error_train_array]); print('Num errs:', e)
e/t

Num tags: 818288
Num errs: 49359


0.06031983849207125

In [18]:
print(len(X_train))

37407


In [19]:
crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)
f1_score = flat_f1_score(y_test, y_pred, average = 'macro')
print(f1_score)

0.8357404453107364


In [20]:
report = flat_classification_report(y_test, y_pred, output_dict=True)
clean_precision, clean_recall, clean_f1 = report['B-geo']['precision'], report['B-geo']['recall'], report['B-geo']['f1-score'] 
print(flat_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       B-geo       0.86      0.91      0.89      7440
       B-gpe       0.96      0.94      0.95      3125
       B-org       0.82      0.74      0.78      4049
       B-per       0.85      0.84      0.84      3506
       B-tim       0.92      0.88      0.90      4072
       I-geo       0.82      0.78      0.80      1486
       I-gpe       0.78      0.42      0.55        43
       I-org       0.82      0.79      0.81      3410
       I-per       0.86      0.90      0.88      3583
       I-tim       0.86      0.76      0.81      1352
           O       0.99      0.99      0.99    177386

   micro avg       0.97      0.97      0.97    209452
   macro avg       0.87      0.81      0.84    209452
weighted avg       0.97      0.97      0.97    209452



In [21]:
crf.fit(X_train, y_train_new)
y_pred = crf.predict(X_test)
f1_score = flat_f1_score(y_test, y_pred, average = 'macro')
print(f1_score)

0.787772411624082


In [22]:
report = flat_classification_report(y_test, y_pred, output_dict=True)
dirty_precision, dirty_recall, dirty_f1 = report['B-geo']['precision'], report['B-geo']['recall'], report['B-geo']['f1-score'] 
print(flat_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       B-geo       0.38      0.89      0.53      7440
       B-gpe       0.96      0.91      0.93      3125
       B-org       0.79      0.74      0.76      4049
       B-per       0.85      0.83      0.84      3506
       B-tim       0.92      0.84      0.88      4072
       I-geo       0.83      0.66      0.73      1486
       I-gpe       0.86      0.42      0.56        43
       I-org       0.81      0.78      0.79      3410
       I-per       0.85      0.89      0.87      3583
       I-tim       0.87      0.73      0.79      1352
           O       0.99      0.94      0.97    177386

   micro avg       0.93      0.93      0.93    209452
   macro avg       0.83      0.78      0.79    209452
weighted avg       0.95      0.93      0.94    209452



# Understanding the Mistakes

### Seeing the training data

In [23]:
for i in range(10):
    print("-", end="")
    for w, word in enumerate(X_train[i]):
        word = word['word.lower()']
        if error_train_array[i][w]:
            print('*', end='')            
        if y_train_new[i][w] == 'O':
            print(word.lower(), end=' ')
        else:
            print(word.upper(), end=' ')            
    print()

-the press freedom group reporters without borders has condemned the decision by VENEZUELAN PRESIDENT HUGH CHAVEZ to shut down one of the country 's oldest television stations the group issued a statement FRIDAY calling the move " a serious attack against editorial pluralism . " 
-a group spokesman said the project was about 85 percent complete . 
-his comments came as RUSSIAN PRESIDENT VLADIMIR PUTIN and VENEZUELAN leader HUGO CHAVEZ met in MOSCOW . 
-before the center opened THURSDAY , *CUSTOMS *OFFICES *ALONG RUSSIA 's expansive border regions *COMMUNICATED *WITH *THEIR MOSCOW headquarters primarily by telephone . 
-he blames ICELAND 's economic calamity on commercial bankers . 
-a police spokeswoman said a 46-year-old man had been arrested APRIL 25 and questioned over an assault in west LONDON . 
-IRAN says its nuclear program is only for peaceful purposes . 
-the KYRGYZ staged a major revolt against the TSARIST EMPIRE in 1916 in which almost one-sixth of the KYRGYZ population was 

### Seeing the predictions on the test data

In [24]:
for i in range(10):
    print("-", end="")
    for w, word in enumerate(X_test[i]):
        word = word['word.lower()']
        if y_pred[i][w] != y_test[i][w]:
            print('*', end='')
            
        if y_pred[i][w] == 'O':
            print(word.lower(), end=' ')
        else:
            print(word.upper(), end=' ')            
    print()

-*city officials say the students ' return will sizably increase the city 's population and bring in new consumer dollars . 
-the medicines attack the aids virus at different points in its replication cycle . 
-MR. BUSH recently wrapped up a nationwide tour promoting his plans for SOCIAL SECURITY . 
-south KOREAN media report MONDAY that a recent *POLITICAL *COMMENTARY *ON *NORTH KOREAN radio , heard last THURSDAY , referred to comments by the elder KIM saying his son or even his grandson should complete his work if he falls short of completing the country 's " revolution . " 
-PROFESSOR COMERIO talks about the scope of re-building , the *LONG-RANGE *OUTLOOK *FOR *RECOVERY *AND *HOW HAITI 's devastation compares with other similar earthquake disasters . 
-last year , *AFGHANISTAN discovered cases of the h5n1 virus in birds , but not humans . 
-U.S. officials do not publicly comment on the drone strikes , which have *RAISED *TENSIONS *BETWEEN PAKISTAN *AND *THE UNITED STATES in the *pas

# Let's Try to Fix the Mistakes

For reference, the clean F1 is: **0.84** 

In [25]:
def measure_method(error_pred, error_array, X, y_corrected):
    # measure what percent of errors are fixed
    np = 0; nn=0; tp = 0; fp = 0;
    for i in range(len(error_pred)):
        for j in range(len(error_pred[i])):
            if error_pred[i][j] and error_pred[i][j] == error_array[i][j]:
                tp += 1
            elif error_pred[i][j]:
                fp += 1
            if error_array[i][j]:
                np += 1
            else:
                nn += 1

    
    print("TP errors detected: {}".format(tp/np))
    print("FP errors detected: {}".format(fp/nn)) 

    # measure accuracy
    crf.fit(X, y_corrected)
    y_pred = crf.predict(X_test)
    f1_score = flat_f1_score(y_test, y_pred, average = 'macro')
    print("F1 score on trained model: {}".format(f1_score))
    
    report = flat_classification_report(y_test, y_pred, output_dict=True)
    precision, recall, f1 = report['B-geo']['precision'], report['B-geo']['recall'], report['B-geo']['f1-score'] 
    print(flat_classification_report(y_test, y_pred))

    return precision, recall, f1

## Pseudolabeled from Validation

In [26]:
def pseudolabeled_on_validation(X_val, y_val):
    
    crf.fit(X_val, y_val)
    y_pred = crf.predict(X_train)

    error_pred = []
    y_corrected = []

    for i in range(len(y_pred)):    
        error_pred.append([])
        y_corrected.append([])

        for j in range(len(y_pred[i])):
            if not(y_pred[i][j]==y_train_new[i][j]):
                error_pred[i].append(True)
            else:
                error_pred[i].append(False)

    return measure_method(error_pred, error_train_array, X_train, y_pred)

### GTC with X, y, neighboring y

In [27]:
def gtc_with_x_y_neighboring_y(X_val, y_val, num_x_keys=1):
    keys_prioritized = ['word.isupper()', 'word.istitle()', 'word.isdigit()', '+1:word.istitle()', 
                        '+1:word.isupper()', 'BOS', 'bias',  'word.lower()', 'word[-3:]', 'word[-2:]',
                        '+1:word.lower()', 'postag', 'postag[:2]', '+1:postag', '+1:postag[:2]', 
                        '-1:postag', '-1:postag[:2]', '-1:word.istitle()', '-1:word.lower()',
                        '-1:word.isupper()', 'EOS']
    
    crf.fit(X_train, y_train_new)
    y_val_pred = crf.predict(X_val)

    correction_network_input = []

    for i in range(len(y_val_pred)):
        correction_network_input.append([])
        for j in range(len(y_val_pred[i])):
            if num_x_keys is None:
                X_val_sub = X_val[i][j].copy()
            else:
                X_val_sub = {k: X_val[i][j][k] for k in keys_prioritized[:num_x_keys] if k in X_val[i][j]}
            correction_network_input[i].append(X_val_sub)
            correction_network_input[i][j]['y'] = y_val_pred[i][j]     
            if j >= 1:
                correction_network_input[i][j]['y-1'] = y_val_pred[i][j-1]
            else:
                correction_network_input[i][j]['y-1'] = 'N'
            if j >= 2:
                correction_network_input[i][j]['y-2'] = y_val_pred[i][j-2]
            else:
                correction_network_input[i][j]['y-2'] = 'N'
            if j < len(y_val_pred[i]) - 1:
                correction_network_input[i][j]['y+1'] = y_val_pred[i][j+1]
            else:
                correction_network_input[i][j]['y+1'] = 'N'
            if j < len(y_val_pred[i]) - 2:
                correction_network_input[i][j]['y+2'] = y_val_pred[i][j+2]
            else:
                correction_network_input[i][j]['y+2'] = 'N'

    crf.fit(correction_network_input, y_val)

    X_expanded = []

    for i in range(len(y_train_new)):
        X_expanded.append([])
        for j in range(len(y_train_new[i])):
            if num_x_keys is None:
                X_train_sub = X_train[i][j].copy()
            else:
                X_train_sub = {k: X_train[i][j][k] for k in keys_prioritized[:num_x_keys] if k in X_train[i][j]}
            X_expanded[i].append(X_train_sub)
            X_expanded[i][j]['y'] = y_train_new[i][j]     
            if j >= 1:
                X_expanded[i][j]['y-1'] = y_train_new[i][j-1]
            else:
                X_expanded[i][j]['y-1'] = 'N'
            if j >= 2:
                X_expanded[i][j]['y-2'] = y_train_new[i][j-2]
            else:
                X_expanded[i][j]['y-2'] = 'N'
            if j < len(y_train_new[i]) - 1:
                X_expanded[i][j]['y+1'] = y_train_new[i][j+1]
            else:
                X_expanded[i][j]['y+1'] = 'N'
            if j < len(y_train_new[i]) - 2:
                X_expanded[i][j]['y+2'] = y_train_new[i][j+2]
            else:
                X_expanded[i][j]['y+2'] = 'N'

    # Go from X_expanded to X_corrected
    y_corrected = crf.predict(X_expanded)

    error_pred = []

    for i in range(len(y_corrected)):    
        error_pred.append([])    
        for j in range(len(y_corrected[i])):
            if not(y_corrected[i][j]==y_train_new[i][j]):
                error_pred[i].append(True)
            else:
                error_pred[i].append(False)

    return measure_method(error_pred, error_train_array, X_train, y_corrected)

### GTC with y, neighboring y only

In [28]:
def gtc_with_y_neighboring_y(X_val, y_val, num_ys=1):
    
    crf.fit(X_train, y_train_new)
    y_val_pred = crf.predict(X_val)

    correction_network_input = []

    for i in range(len(y_val_pred)):
        correction_network_input.append([])
        for j in range(len(y_val_pred[i])):
            correction_network_input[i].append(dict())
            correction_network_input[i][j]['y'] = y_val_pred[i][j] 
            for k in range(1, 1+num_ys):
                if j >= k:
                    correction_network_input[i][j]['y-{}'.format(k)] = y_val_pred[i][j-k]
                else:
                    correction_network_input[i][j]['y-{}'.format(k)] = 'N'
                if j < len(y_val_pred[i]) - k:
                    correction_network_input[i][j]['y+{}'.format(k)] = y_val_pred[i][j+1]
                else:
                    correction_network_input[i][j]['y+{}'.format(k)] = 'N'
    
    crf.fit(correction_network_input, y_val)

    X_expanded = []

    for i in range(len(y_train_new)):
        X_expanded.append([])
        for j in range(len(y_train_new[i])):
            X_expanded[i].append(dict())
            X_expanded[i][j]['y'] = y_train_new[i][j]     
            for k in range(1, 1+num_ys):
                if j >= k:
                    X_expanded[i][j]['y-{}'.format(k)] = y_train_new[i][j-k]
                else:
                    X_expanded[i][j]['y-{}'.format(k)] = 'N'
                if j < len(y_train_new[i]) - k:
                    X_expanded[i][j]['y+{}'.format(k)] = y_train_new[i][j+k]
                else:
                    X_expanded[i][j]['y+{}'.format(k)] = 'N'

    # Go from X_expanded to X_corrected
    y_corrected = crf.predict(X_expanded)

    error_pred = []

    for i in range(len(y_corrected)):    
        error_pred.append([])    
        for j in range(len(y_corrected[i])):
            if not(y_corrected[i][j]==y_train_new[i][j]):
                error_pred[i].append(True)
            else:
                error_pred[i].append(False)

    return measure_method(error_pred, error_train_array, X_train, y_corrected)

### How do results change based on markov blanket size?

In [29]:
n = 100
X_val_ = X_val[:n]
y_val_ = y_val[:n]

In [30]:
ppr, pre, pf1 = pseudolabeled_on_validation(X_val_, y_val_)

TP errors detected: 0.9476893778236999
FP errors detected: 0.06741584723687102


  'precision', 'predicted', average, warn_for)


F1 score on trained model: 0.5963959435549879


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-geo       0.69      0.84      0.76      7440
       B-gpe       0.89      0.62      0.73      3125
       B-org       0.53      0.44      0.48      4049
       B-per       0.74      0.61      0.67      3506
       B-tim       0.93      0.57      0.71      4072
       I-geo       0.48      0.62      0.54      1486
       I-gpe       0.00      0.00      0.00        43
       I-org       0.54      0.47      0.51      3410
       I-per       0.77      0.72      0.75      3583
       I-tim       0.76      0.31      0.44      1352
           O       0.97      0.99      0.98    177386

   micro avg       0.93      0.93      0.93    209452
   macro avg       0.66      0.56      0.60    209452
weighted avg       0.93      0.93      0.93    209452



In [31]:
pr, re, f1 = gtc_with_y_neighboring_y(X_val_, y_val_, num_ys=3)

TP errors detected: 0.9365262667396017
FP errors detected: 0.007696419305293466
F1 score on trained model: 0.7658431168329041
              precision    recall  f1-score   support

       B-geo       0.85      0.88      0.86      7440
       B-gpe       0.97      0.92      0.94      3125
       B-org       0.82      0.70      0.75      4049
       B-per       0.86      0.82      0.84      3506
       B-tim       0.92      0.82      0.87      4072
       I-geo       0.78      0.73      0.76      1486
       I-gpe       0.00      0.00      0.00        43
       I-org       0.83      0.75      0.79      3410
       I-per       0.85      0.88      0.86      3583
       I-tim       0.82      0.71      0.76      1352
           O       0.99      0.99      0.99    177386

   micro avg       0.97      0.97      0.97    209452
   macro avg       0.79      0.75      0.77    209452
weighted avg       0.97      0.97      0.97    209452



In [32]:
pr, re, f1 = gtc_with_x_y_neighboring_y(X_val_, y_val_, num_x_keys=None)

TP errors detected: 0.9195486132214996
FP errors detected: 0.006711933091351738
F1 score on trained model: 0.7683004293936396
              precision    recall  f1-score   support

       B-geo       0.83      0.91      0.87      7440
       B-gpe       0.97      0.92      0.94      3125
       B-org       0.79      0.71      0.75      4049
       B-per       0.86      0.82      0.84      3506
       B-tim       0.93      0.81      0.87      4072
       I-geo       0.77      0.76      0.76      1486
       I-gpe       0.00      0.00      0.00        43
       I-org       0.83      0.75      0.79      3410
       I-per       0.86      0.89      0.88      3583
       I-tim       0.83      0.73      0.78      1352
           O       0.99      0.99      0.99    177386

   micro avg       0.97      0.97      0.97    209452
   macro avg       0.79      0.75      0.77    209452
weighted avg       0.97      0.97      0.97    209452

