In [1]:
import nltk

from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer
from nltk.corpus import  conll2002

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import GridSearchCV, StratifiedKFold


from sklearn.metrics import classification_report

import ast 

# LOC NE Identifier-classifier 

In [105]:
# read the data
articles_df = pd.read_csv("../files/pos_articles.csv")
print articles_df.shape
articles_df.head(1)
# articles_df.article_id.unique()

(208, 4)


Unnamed: 0.1,Unnamed: 0,article_id,tagged_title,tagged_content
0,0,3,"[[u'Citan', 'NC', u'none'], [u'a', u'SP', u'no...","[[[u'SAN', u'AQ', u'B-Loc'], [u'PEDRO', 'NC', ..."


## Convert sentences to df

In [64]:
def convert_df(content_vals):
    print "- convert_df"
    s_id = []
    s_word = []
    s_tag = []
    s_iob =[]
    s_pos = []
    
    art_id = []
    corpus_sent_id = []
    
    entity_count = []
    token_count = []
    csent_id = -1
    artid = -1
    for sentences in content_vals:
        artid += 1
        for sent_num in range(len(sentences)):
            csent_id += 1
            sent = sentences[sent_num]
            entity_count.append(0)
            token_count.append(0)
            for pos in range(len(sent)):
                word = sent[pos]
                s_id.append(sent_num)
                s_word.append(word[0])
                s_tag.append(word[1])
                s_iob.append(word[2])
                s_pos.append(pos)
                
                art_id.append(artid)
                corpus_sent_id.append(csent_id)
                if word[2] == "B-Loc":
                    entity_count[-1] += 1
                    token_count[-1]  += 1
                elif word[2] == "I-Loc":
                    token_count[-1] += 1
                
        
    df = pd.DataFrame({
            "sentence": s_id,
            "word": s_word,
            "tag": s_tag,
            "iob": s_iob,
            "pos": s_pos,
            "cs_id": corpus_sent_id,
            "art_id" : art_id
        })
    
    return df, entity_count, token_count

## Get Features

In [65]:
def get_ident_features(base_df):
    print "- get features"
    df = pd.DataFrame()
    # iob
    #clean the iob
    base_df.loc[:,"iob"] = base_df.iob.apply(lambda x: x if x!="B-Org" else "none" )
    
    le_iob = LabelEncoder()
    df.loc[:,"iob"] = le_iob.fit_transform(base_df.iob)

    #tag
    le_tag = LabelEncoder()
    df.loc[:,"tag"] = le_tag.fit_transform(base_df.tag)

    # Uppercase
    df.loc[:,"upper"] = base_df.word.apply(lambda x: x[0].isupper())
    
    # Pos
    df.loc[:,"pos"] = base_df.pos
    
    #first 
    df.loc[:,"first"] = base_df.pos.apply(lambda x: int(x == 0) )
    #size
    df.loc[:,"size"] = base_df.word.apply(lambda x: len(x))
    
    #word
    df.loc[:,"word"] = base_df.word.values
    
    #sentence
    df.loc[:,"first_sent"] = base_df.sentence.apply(lambda x: int(x==0)).values
#     df.loc[:,"sent_id"] = base_df.sentence.values


    #corpus_sent_id  this is not a feature but it will be used as reference
    df.loc[:,"cs_id"] = base_df.cs_id.values
    df.loc[:,"art_id"] = base_df.art_id.values
    
    
    return df, le_iob, le_tag

In [66]:
def extract_simple_features(df):
    
    # add tag features by shifting the list by one or two
    tags = df.tag.values.tolist()
    df.loc[:,"prev_1"] = tags[-1:] + tags[:-1]
#     df.at[df["pos"] < 1,'prev_1'] = -1
    df.loc[:,"prev_2"] = tags[-2:] + tags[:-2]
    df.at[df["pos"] < 2,'prev_2'] = -1
    
#     df.loc[:,"prev_3"] = tags[-3:] + tags[:-3]
#     df.at[df["pos"] < 3,'prev_3'] = -1
    
#     df.loc[:,"prev_4"] = tags[-4:] + tags[:-4]
#     df.at[df["pos"] < 4,'prev_4'] = -1
    
#     df.loc[:,"prev_5"] = tags[-5:] + tags[:-5]
#     df.at[df["pos"] < 5,'prev_5'] = -1
    
    df.loc[:,"next_1"] = tags[1:] + tags[:1]
    df.loc[:,"next_2"] = tags[2:] + tags[:2]
#     df.loc[:,"next_3"] = tags[3:] + tags[:3]
#     df.loc[:,"next_4"] = tags[4:] + tags[:4]
#     df.loc[:,"next_5"] = tags[5:] + tags[:5]
    
    prefixes = ["colonia", "barrio", "residencial","ciudad", "aldea","zona","puente","mercado"]
    words = df.word.apply(lambda x: x.lower() ).values.tolist()
    df.loc[:,"prev_prefix_1"] = words[-1:] + words[:-1]
    df.loc[:,"prev_prefix_1"]  = df.prev_prefix_1.apply(lambda x:  int( x in prefixes ))
    
    df.loc[:,"prev_prefix_2"] = words[-2:] + words[:-2]
    df.loc[:,"prev_prefix_2"]  = df.prev_prefix_2.apply(lambda x:  int( x in prefixes ))
    
    
#     iobs = df.iob.values.tolist()
#     df.loc[:,"prevIOB_1"] = iobs[-1:] + iobs[:-1]
#     df.loc[:,"prevIOB_2"] = iobs[-2:] + iobs[:-2]
    #Next IOB would be cheating
#     df.loc[:,"nextIOB_1"] = iobs[1:] + iobs[:1]
#     df.loc[:,"nextIOB_2"] = iobs[2:] + iobs[:2]

#     tag_df = pd.get_dummies(df.tag,prefix="tag_")
#     for column in tag_df.columns:
#         df.loc[:,column] = tag_df[column]

    return df

## Execute

In [67]:
# converting strings to objects 
# articles_df.tagged_content =articles_df.tagged_content.apply(lambda x: ast.literal_eval(x))

content_vals = articles_df.tagged_content.values


words_df, entity_count, token_count = convert_df(content_vals)
features_df, le_iob, le_tag = get_ident_features(words_df)
features_df = extract_simple_features(features_df)

print features_df.shape
print le_iob.classes_
features_df.head(5)

- convert_df
- get features
(64991, 16)
[u'B-Loc' u'I-Loc' u'none']


Unnamed: 0,iob,tag,upper,pos,first,size,word,first_sent,cs_id,art_id,prev_1,prev_2,next_1,next_2,prev_prefix_1,prev_prefix_2
0,0,1,True,0,1,3,SAN,1,0,0,27,-1,21,21,0,0
1,1,21,True,1,0,5,PEDRO,1,0,0,1,-1,21,12,0,0
2,1,21,True,2,0,4,SULA,1,0,0,21,1,12,33,0,0
3,2,12,False,0,1,1,-,0,1,0,21,-1,33,4,0,0
4,2,33,True,1,0,5,Hasta,0,1,0,12,-1,4,21,0,0


## prepare data

In [47]:
print "average entity per sentence"
print sum(entity_count) * 1.0 / len(entity_count)
print "average tokens per sentence"
print sum(token_count) * 1.0 / len(token_count)

#split using sentence entity count
skf = StratifiedKFold(n_splits=5,shuffle=True, random_state= 233 )
for train_index, test_index in skf.split([0 for i in range(len(entity_count))], entity_count):
    X_train = features_df[features_df["cs_id"].isin( train_index )].drop(["word","iob", "cs_id","art_id"],1).values
    X_test = features_df[features_df["cs_id"].isin( test_index )].drop(["word","iob", "cs_id","art_id"],1).values

    y_train = features_df[features_df["cs_id"].isin( train_index )].iob.values
    y_test = features_df[features_df["cs_id"].isin( test_index )].iob.values
    
    entity_count_train = [entity_count[i] for i in train_index]
    entity_count_test = [entity_count[i] for i in test_index]
    
    break

    
print "avg entities per sentence in train set"  
print sum(entity_count_train) * 1.0 / len(entity_count_train)
print "number of sentences in train set: %i"  %len(train_index)
print "avg entities per sentence in test set"  
print sum(entity_count_test) * 1.0 / len(entity_count_test)
print "number of sentences in train set: %i"  %len(test_index)



average entity per sentence
0.448767833982
average tokens per sentence
0.772589710333
avg entities per sentence in train set
0.45027027027
number of sentences in train set: 1850
avg entities per sentence in test set
0.442764578834
number of sentences in train set: 463


In [11]:
# X = features_df.drop(["word","iob"],1).values
# y = features_df.iob.values

# # split to X_train and X_test
# skf = StratifiedKFold(n_splits=10,shuffle=True, random_state= 233 )
# for train_index, test_index in skf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]
#     break

## Train Classifier

In [48]:
from sklearn.metrics import f1_score, make_scorer
f_one_scorer = make_scorer(f1_score,average="weighted", labels=[0,1] )

print "- train classifier"
# parameters = {
#     "n_estimators": [200],
#     "max_depth": [ 3,4,10,16,20],
#     "min_samples_split" : [4,10,20]
# }
parameters = {
    "n_estimators": [200],
    "max_depth": [20],
    "min_samples_split" : [4]
}
clf = RandomForestClassifier(random_state= 233, n_jobs=4)
clf = GridSearchCV(clf, parameters, cv=5, scoring= f_one_scorer, verbose=1 )

clf.fit(X_train, y_train)

print clf.best_score_

print clf.best_params_

print clf.cv_results_['mean_train_score']
print clf.cv_results_['mean_test_score']

- train classifier
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.9s finished


0.607688319613
{'min_samples_split': 4, 'n_estimators': 200, 'max_depth': 20}
[ 0.91227813]
[ 0.60768832]


## Results

In [49]:
print "- Train Results -"
preds_train = clf.predict(X_train)
print(classification_report(y_train, preds_train,labels=[0,1]))
print "- Test Results -"
preds_test = clf.predict(X_test)
print(classification_report(y_test, preds_test,labels=[0,1]))

- Train Results -
             precision    recall  f1-score   support

          0       0.97      0.90      0.93       833
          1       0.97      0.78      0.87       596

avg / total       0.97      0.85      0.90      1429

- Test Results -
             precision    recall  f1-score   support

          0       0.87      0.71      0.78       205
          1       0.85      0.48      0.61       153

avg / total       0.86      0.61      0.71       358



0       0.82      0.65      0.73       104
1       0.93      0.51      0.66        75


0       0.82      0.64      0.72       104
1       0.90      0.48      0.63        75

# Stacking!

In [50]:
print X_train.shape
preds_train_rf = clf.predict(X_train)
preds_test_rf = clf.predict(X_test)

X_train2 = np.column_stack( (X_train, preds_train_rf) )
X_test2 = np.column_stack( (X_test, preds_test_rf) )
print len(X_train2[0])  #just to know whats the last one
print X_train2[:10,12]
print preds_train_rf[:10]

(51917, 12)
13
[2 2 2 2 2 2 2 2 2 2]
[2 2 2 2 2 2 2 2 2 2]


In [51]:
#weird uggly algorithm to get the prevNE stacked tag
def getStackedFeatures(X):
    prevNE = []
    nextNE = []
    for index, row in enumerate(X):
        if index == 0:
            prevNE.append( 2 )
        else:
            prevNE.append( X[index-1,-1] )

        if index == len(X)-1:
            nextNE.append( 2 )
        else:
            nextNE.append( X[index+1,-1] )

    X2 = np.column_stack( (X, prevNE) )
    X2 = np.column_stack( (X2, nextNE) )

    # #pos -2 and pos +2
    X2 = np.column_stack( (X2, prevNE[-2:] + prevNE[:-2]) )
    X2 = np.column_stack( (X2, nextNE[2:]  + nextNE[:2]) )

    print X2.shape
    return X2

X_train2 = getStackedFeatures(X_train2)
X_test2 = getStackedFeatures(X_test2)

(51917, 17)
(13074, 17)


In [18]:
# split to X_train and X_test
# skf = StratifiedKFold(n_splits=10,shuffle=True, random_state= 233 )
# for train_index, test_index in skf.split(X3, y):
# X2_train, X2_test = X3[train_index], X3[test_index]
# y2_train, y2_test = y[train_index], y[test_index]
#     break

In [52]:
from sklearn.metrics import f1_score, make_scorer
f_one_scorer = make_scorer(f1_score,average="weighted", labels=[0,1] )

print "- train classifier"
# parameters = {
#     "n_estimators": [200],
#     "max_depth": [ 3,4,10,16,20],
#     "min_samples_split" : [4,10,20]
# }
parameters = {
    "n_estimators": [200],
    "max_depth": [20],
    "min_samples_split" : [4]
}
clf2 = RandomForestClassifier(random_state= 233, n_jobs=4)
clf2 = GridSearchCV(clf2, parameters, cv=5, scoring= f_one_scorer, verbose=1 )

clf2.fit(X_train2, y_train)

print clf2.best_score_

print clf2.best_params_

print clf2.cv_results_['mean_train_score']
print clf2.cv_results_['mean_test_score']

- train classifier
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   20.7s finished


0.933270555232
{'min_samples_split': 4, 'n_estimators': 200, 'max_depth': 20}
[ 0.96009759]
[ 0.93327056]


In [20]:
print "- Train Results -"
preds_train2 = clf2.predict(X_train2)
print(classification_report(y_train, preds_train2,labels=[0,1]))
print "- Test Results -"
preds_test2 = clf2.predict(X_test2)
print(classification_report(y_test, preds_test2,labels=[0,1]))

- Train Results -
             precision    recall  f1-score   support

          0       0.98      0.94      0.96       833
          1       0.99      0.93      0.96       596

avg / total       0.98      0.94      0.96      1429

- Test Results -
             precision    recall  f1-score   support

          0       0.87      0.73      0.79       205
          1       0.86      0.64      0.73       153

avg / total       0.87      0.69      0.77       358



0       0.82      0.64      0.72       104
1       0.90      0.48      0.63        75

# Measure whole Named Entities recognition... somehow

Up until now we've just done token-based evaluation... now the idea is to make entity-based evaluation.
**Entity-Based evaluation** will be defined like this:

The algorithm must match exactly the full entity if it misses a token it should be considered an error.

So suppose that the true values are:

[ "San Marcos", "Los Dolores", **"El Carrizal"**, **"Navarro"** ]

and that the model identified:

[ **"Miguel"**, "San Marcos", "Los Dolores", **"Carrizal"**]  


then 


**precision** =   len(["San Marcos","Los Dolores" ]) / ( len(["San Marcos","Los Dolores" ]) + len(["Miguel", "Carrizal"]) )

and 

**recall**  = len(["San Marcos","Los Dolores" ]) / ( len(["San Marcos","Los Dolores" ]) + len(["El Carrizal", "Navarro"]) )

### Strategy

1) get all true values with artid, sentid, posid, entity  format

2) predict values and store them in artid, sentid, posid, entity  format

3) compare true values against predicted get recall

4) compare predicted against true values get precision










In [74]:
#1 get all true values with artid, sentid, posid, entity format

def get_true_values(df):
    art_id = []
    s_id = []
    s_word = []
    s_tag = []
    s_iob =[]
    s_pos = []
    
    corpus_sent_id = []
    csent_id = -1
    artid = -1
    for index, article in df.iterrows():
        sentences = article["tagged_content"]
        artid += 1
        for sent_num in range(len(sentences)):
            sent = sentences[sent_num]
            csent_id += 1
            for pos in range(len(sent)):
                word = sent[pos]
                
                if word[2][0] == "B":
                    corpus_sent_id.append(csent_id)
                    art_id.append(artid)
                    s_id.append(sent_num)
                    s_pos.append(pos)
                    s_word.append(word[0])
                    s_iob.append(word[2])
                elif word[2][0] == "I":
                    

                    s_word[-1] += " " + word[0]
                    
                
        
    df = pd.DataFrame({
            "art_id": art_id,
#             "sent_id": s_id,
            "pos": s_pos,
            "word": s_word,
            "cs_id": corpus_sent_id
        })
    
    return df


true_values = get_true_values(articles_df)

In [75]:
true_values.head(10)
# articles_df.head()

Unnamed: 0,art_id,cs_id,pos,word
0,0,0,0,SAN PEDRO SULA
1,0,2,13,San Pedro Sula
2,0,3,38,Lomas del Carmen
3,0,5,19,Monumento a la Madre
4,0,5,24,primera avenida
5,0,5,27,parque central
6,0,7,11,San Pedro Sula
7,1,10,0,TEGUCIGALPA
8,2,21,0,TEGUCIGALPA
9,2,29,9,Kennedy


In [96]:
words_train = features_df[features_df["cs_id"].isin( train_index )][["art_id","cs_id","pos", "word"]].values
words_test = features_df[features_df["cs_id"].isin( test_index )][["art_id","cs_id","pos", "word"]].values


preds_train2[:10]

# corpus_sent_train[:100]

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [97]:
def getIdentifiedEntities(words, preds):
    entities = []
    last_index = -1
    for index, pred in enumerate(preds):
        
        if pred == 0:  #B-Loc
            entities.append(words[index])
            last_index = index
        elif pred == 1 and last_index == index-1 : #I-Loc
            entities[-1][3] += " " + words[index][3]
            
            
    df = pd.DataFrame(entities,columns=["art_id","cs_id", "pos", "word"] )
            
    return df
            
train_identified = getIdentifiedEntities (words_train,preds_train2 ) 
test_identified = getIdentifiedEntities (words_test,preds_test2 ) 

In [98]:
train_true = true_values[true_values["cs_id"].isin( train_index )].sort_values(by=["art_id"])
test_true = true_values[true_values["cs_id"].isin( test_index )].sort_values(by=["art_id"])

# Get scores

In [100]:
def getPrecision(pred, true):
    true_positive = 0
    for index, entity in pred.iterrows():
        result = true[(true["cs_id"] == entity.cs_id)
                      & (true["pos"] == entity.pos)
                      & (true["word"] == entity.word)
                     ]
        if len(result) > 0:
            true_positive += 1
    
    print "true positives: %i" %true_positive
    print "predicted: %i" % pred.shape[0]
    precision = true_positive * 1.0 / pred.shape[0]
    print "precision: %0.4f" %precision
    
print "--- train ---"
getPrecision(train_identified, train_true)   
print "--- test ---"
getPrecision(test_identified, test_true)      

--- train ---
true positives: 632
predicted: 797
precision: 0.7930
--- test ---
true positives: 119
predicted: 171
precision: 0.6959


In [104]:
def getRecall(pred, true):
    true_positive = 0
    for index, entity in true.iterrows():
        result = pred[(pred["cs_id"] == entity.cs_id)
                      & (pred["pos"] == entity.pos)
                      & (pred["word"] == entity.word)
                     ]
        if len(result) > 0:
            true_positive += 1
    
    print "true positives: %i" %true_positive
    print "positives: %i" % true.shape[0]
    recall = true_positive * 1.0 / true.shape[0]
    print "recall: %0.4f" %recall
    
print "--- train ---"  
getRecall(train_identified, train_true)
print "--- test ---"
getRecall(test_identified, test_true)     

--- train ---
true positives: 632
positives: 834
recall: 0.7578
--- test ---
true positives: 119
positives: 206
recall: 0.5777
