# Introduction
This notebook is to test the classifier training phase of the NER Identifier.

In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, make_scorer, classification_report, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [19]:
# Load the pre-computed NER feature tables for the train and test splits.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../files/ner_features_train.csv",
        "../files/ner_features_test.csv",
    )
)

In [5]:
# Preview the first rows of the training feature table.
train_df.head()

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type,upper,...,first_sent,prev_1,prev_2,next_1,next_2,prev_prefix_1,prev_prefix_2,in_Country,in_State,in_City
0,3,0,0,0,SAN,1,B-City,0,City,1,...,1,22,-1,22,22,0,0,0,0,1
1,3,0,0,1,PEDRO,22,I-City,0,City,1,...,1,1,-1,22,13,0,0,0,0,1
2,3,0,0,2,SULA,22,I-City,0,City,1,...,1,22,1,13,34,0,0,0,0,1
3,3,1,1,0,-,13,none,1,none,0,...,0,22,-1,34,4,0,0,0,0,0
4,3,1,1,1,Hasta,34,none,1,none,1,...,0,13,-1,4,22,0,0,0,0,0


# Basic processing functions

In [None]:
def addNewFeatures():
    """Placeholder for future feature engineering; does nothing and returns None."""
    return None

In [80]:
# formats in X and y
def getXY(df, dropColumns, includeColumns=None, names=False):
    """Split a feature table into a feature matrix ``X`` and a label vector ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns plus an ``iob`` column used as label.
    dropColumns : list of str
        Columns left out of ``X``. Only used when ``includeColumns`` is None.
    includeColumns : list-like of str, optional
        Exact, ordered set of columns to use for ``X``. When given it takes
        precedence over ``dropColumns``.
    names : bool, default False
        When True, also return the column labels of ``X``.

    Returns
    -------
    tuple
        ``(X, y)`` or, when ``names`` is True, ``(X, y, features_names)`` where
        ``features_names`` is the column index matching the columns of ``X``.
    """
    # Identity test: `== None` on an array-like (e.g. a pandas Index) is evaluated
    # element-wise and cannot be used as an `if` condition.
    if includeColumns is None:
        # Explicit `axis=1`: the positional form is rejected by recent pandas.
        features = df.drop(dropColumns, axis=1)
    else:
        features = df[includeColumns]
    X = features.values
    y = df.iob.values
    if names:
        # Names are taken from the frame X was built from, so they always
        # describe X's columns, whichever selection mode was used.
        return X, y, features.columns
    return X, y

In [8]:
# add stacked features 
def add_stacked_feature(clf_ix, pos, df):
    """Add a column with classifier ``clf_ix``'s prediction for a neighbouring row.

    The predictions are read from column ``"pred_<clf_ix>"`` and shifted by
    ``pos`` rows. The new column is named ``"pred<clf_ix>_prev_<k>"`` for
    ``pos == -k`` and ``"pred<clf_ix>_next_<k>"`` for ``pos == k``.

    Parameters
    ----------
    clf_ix : int
        Index of the first-level classifier whose predictions are shifted.
    pos : int
        Non-zero row offset: negative looks at previous rows, positive at
        following rows.
    df : pandas.DataFrame
        Frame containing the ``"pred_<clf_ix>"`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame or None
        The same ``df`` object with the new column, or None when ``pos`` is 0
        (an error message is printed in that case).
    """
    if pos == 0:
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print("- ERROR: 0 value passed")
        return None

    preds = df["pred_" + str(clf_ix)].values.tolist()

    # decide column name
    if pos < 0:
        column = "pred" + str(clf_ix) + "_prev_" + str(-pos)
    else:
        column = "pred" + str(clf_ix) + "_next_" + str(pos)

    # Rotate the prediction list by `pos`.
    # NOTE: the rotation wraps around, so the first |pos| rows (pos < 0) or the
    # last pos rows (pos > 0) receive predictions from the opposite end of the
    # frame; no boundary correction is applied.
    df[column] = preds[pos:] + preds[:pos]

    return df

# produces features from the predictions
def get_stacked_features(df, lvl_1_classifiers):
    """Add the neighbour-prediction columns for every first-level classifier.

    For each classifier index, ``add_stacked_feature`` is applied with offsets
    -1, -2, 1 and 2 (in that order), and the resulting frame is returned.
    Only the number of entries in ``lvl_1_classifiers`` is used.
    """
    offsets = (-1, -2, 1, 2)
    for clf_ix, _ in enumerate(lvl_1_classifiers):
        for offset in offsets:
            df = add_stacked_feature(clf_ix, offset, df)
    return df

In [23]:
#trains a model and scores it
def train_score(model, params, X_train, y_train, X_test, y_test, cv=5, n_jobs=4):
    """Grid-search ``model`` over ``params`` and print train/test reports.

    The search is scored with F1 restricted to label 0 (``labels=[0]``,
    ``average="weighted"``), so model selection is driven by that class only.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``GridSearchCV``.
    params : dict
        Parameter grid passed to ``GridSearchCV``.
    X_train, y_train : array-like
        Data the search is fitted on.
    X_test, y_test : array-like
        Held-out data used only for the final classification report.
    cv : int, default 5
        Number of cross-validation folds.
    n_jobs : int, default 4
        Number of parallel jobs used by the search.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    f_one_scorer = make_scorer(f1_score, labels=[0], average="weighted")
    clf = GridSearchCV(model, params, cv=cv, scoring=f_one_scorer, verbose=1, n_jobs=n_jobs)
    clf.fit(X_train, y_train)

    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    print(clf.best_score_)
    print(clf.best_params_)
    # Train scores are not always recorded (it depends on the scikit-learn
    # version / `return_train_score`), so only print them when present.
    if 'mean_train_score' in clf.cv_results_:
        print(clf.cv_results_['mean_train_score'])
    print(clf.cv_results_['mean_test_score'])

    print("- Train Results -")
    preds_train = clf.predict(X_train)
    print(classification_report(y_train, preds_train))

    print("- Test Results -")
    preds_test = clf.predict(X_test)
    print(classification_report(y_test, preds_test))

    return clf

## TODO LIST



- Dummy pos tags (even for pre and next)

- Sentence size
- previous word is uppercase
- feature selection L1
- Try KNN  (using minmax)
- Try SVC  (using minmax)
- Proba results?

- Try HMM

# Quickly test the Random Forest classifier

In [38]:
# Columns excluded from the feature matrix (identifiers, raw text, labels).
# Left commented out in the original list, presumably further exclusion candidates:
#   "first", "first_sent", "next_2", "pos_tag", "next_1", "prev_2"
desc_columns = [
    "art_id",
    "sent_id",
    "cs_id",
    "word",
    "iob_tag",
    "iob",
    "geo_type",
]
X_train, y_train, features_names = getXY(train_df, desc_columns, names=True)
X_test, y_test = getXY(test_df, desc_columns)

In [126]:
# Single-point grid for the first-level random forest.
parameters = dict(
    n_estimators=[500],
    max_depth=[20],  # [3,4,5,8,10,20]
    min_samples_split=[4],
)
rfc = RandomForestClassifier(n_jobs=4, random_state=233)
clf = train_score(rfc, parameters, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   45.6s finished


0.793258312296
{'min_samples_split': 4, 'n_estimators': 500, 'max_depth': 20}
[ 0.91491597]
[ 0.79325831]
- Train Results -
             precision    recall  f1-score   support

          0       0.98      0.84      0.91      1640
          1       0.99      1.00      1.00     51007

avg / total       0.99      0.99      0.99     52647

- Test Results -
             precision    recall  f1-score   support

          0       0.87      0.74      0.80       368
          1       0.99      1.00      1.00     16233

avg / total       0.99      0.99      0.99     16601



In [37]:
# Rank the features of the best first-level forest by importance.
# `DataFrame.sort(columns=...)` no longer exists in pandas; `sort_values(by=...)`
# is its replacement.
pd.DataFrame({
    "names": features_names,
    "importance": clf.best_estimator_.feature_importances_
}).sort_values(by="importance", ascending=False)




Unnamed: 0,importance,names
8,0.303603,in_City
1,0.22321,upper
6,0.099729,in_Country
0,0.089092,pos
7,0.085685,in_State
4,0.055829,prev_prefix_1
3,0.052094,prev_1
5,0.048783,prev_prefix_2
2,0.041975,size


# Test the Correction Algorithm
This algorithm assumes that all of the positives from the first model are correct because of its high precision (which should be around 0.9). Therefore, the only words that need correction are the false negatives that cause the low recall. So, by removing the predicted positives (assumed correct), we train a second model to distinguish which of the remaining words are false negatives and should be corrected, using the first model's predictions as features. For instance, intuitively, a word that has its first letter capitalized and is next to another word that is an entity would most probably also be an entity.

In [85]:
#1. get preds and add them to the original features
train_stacked = train_df.copy()
test_stacked = test_df.copy()
preds_train = clf.predict(X_train)
preds_test = clf.predict(X_test)

train_stacked.loc[:,"pred_0"] = preds_train
test_stacked.loc[:,"pred_0"] = preds_test

#1.5 add dummy variables for prev_1
train_dummies = pd.get_dummies(train_stacked["prev_1"], prefix="prev_1")
test_dummies = pd.get_dummies(test_stacked["prev_1"], prefix="prev_1")
train_stacked = pd.concat([train_stacked, train_dummies], axis=1)
test_stacked = pd.concat([test_stacked, test_dummies], axis=1)

# A prev_1 value that occurs in only one split produces a dummy column in that
# split only. Add the missing columns as all-zero columns, computed from the
# data instead of hard-coded names, so both splits expose the same features.
for column in test_dummies.columns.difference(train_dummies.columns):
    train_stacked.loc[:, column] = np.zeros(train_stacked.shape[0])
for column in train_dummies.columns.difference(test_dummies.columns):
    test_stacked.loc[:, column] = np.zeros(test_stacked.shape[0])

#2. Generate features with prev and next
# (done before filtering so that neighbours come from the complete table)
train_stacked = get_stacked_features(train_stacked, [0])
test_stacked = get_stacked_features(test_stacked, [0])

#3. Filter only the ones that aren't positive i.e. only pred_0 == 1
train_stacked = train_stacked[train_stacked["pred_0"]==1]
test_stacked = test_stacked[test_stacked["pred_0"]==1]

#4. Remove some of the features
desc_columns2 = desc_columns + [
    "in_City", "in_Country", "pred_0","in_State", "prev_1"
]
X_train2, y_train2, features_names2 = getXY(train_stacked, desc_columns2, names=True)
# The test matrix is built from the training column list so the column order matches.
X_test2, y_test2 = getXY(test_stacked, desc_columns2, includeColumns= features_names2.tolist())

#5. Train the new classifier
parameters = {
    "n_estimators": [200],
    "max_depth": [40 ]  ,#,4,5,8,10,20],
    "min_samples_split" : [4]
}
rfc2 = RandomForestClassifier(random_state= 233, n_jobs=4)
clf2 = train_score(rfc2, parameters, X_train2, y_train2, X_test2, y_test2)


# Scores noted from earlier runs (presumably precision / recall / f1 of class 0 on test):
#  0.81      0.18      0.29   original
# 0.81      0.18      0.29    with restrictions
# 0.63      0.26      0.37    with max_depth 20
# 0.63      0.28      0.39    with max_depth 40

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   27.6s finished


0.388010165388
{'min_samples_split': 4, 'n_estimators': 200, 'max_depth': 40}
[ 0.87050959]
[ 0.38801017]
- Train Results -
             precision    recall  f1-score   support

          0       0.99      0.79      0.88       568
          1       1.00      1.00      1.00     50936

avg / total       1.00      1.00      1.00     51504

- Test Results -
             precision    recall  f1-score   support

          0       0.59      0.25      0.35       117
          1       0.99      1.00      1.00     16199

avg / total       0.99      0.99      0.99     16316



In [87]:
# Top 20 features of the second-level forest by importance.
# `DataFrame.sort(columns=...)` no longer exists in pandas; `sort_values(by=...)`
# is its replacement.
pd.DataFrame({
    "names": features_names2,
    "importance": clf2.best_estimator_.feature_importances_
}).sort_values(by="importance", ascending=False).head(20)



Unnamed: 0,importance,names
0,0.181539,pos
2,0.134722,upper
4,0.11486,size
7,0.103601,next_1
8,0.10203,next_2
6,0.087242,prev_2
1,0.04834,pos_tag
63,0.032789,pred0_prev_2
44,0.030236,prev_1_34
65,0.020145,pred0_next_2


In [77]:
# Scratch check: compare the prev_1 dummy columns generated for each split.
train_stacked = train_df.copy()
test_stacked = test_df.copy()

prev_dummies = pd.get_dummies(train_stacked["prev_1"], prefix="prev_1")
train_stacked = pd.concat([train_stacked, prev_dummies], axis=1)
# Parenthesised single-argument prints behave the same on Python 2 and 3.
print(prev_dummies.columns)
prev_dummies = pd.get_dummies(test_stacked["prev_1"], prefix="prev_1")
test_stacked = pd.concat([test_stacked, prev_dummies], axis=1)
# NOTE: this column is added after the concat above, so it exists only in
# `prev_dummies` and is not part of `test_stacked`.
prev_dummies.loc[:,"prev_1_10"] = np.zeros(test_stacked.shape[0])
print(prev_dummies.columns)

test_stacked.shape

Index([u'prev_1_0', u'prev_1_1', u'prev_1_2', u'prev_1_3', u'prev_1_4',
       u'prev_1_5', u'prev_1_6', u'prev_1_7', u'prev_1_8', u'prev_1_9',
       u'prev_1_10', u'prev_1_11', u'prev_1_12', u'prev_1_13', u'prev_1_15',
       u'prev_1_16', u'prev_1_17', u'prev_1_18', u'prev_1_19', u'prev_1_20',
       u'prev_1_21', u'prev_1_22', u'prev_1_23', u'prev_1_24', u'prev_1_25',
       u'prev_1_26', u'prev_1_27', u'prev_1_28', u'prev_1_29', u'prev_1_30',
       u'prev_1_31', u'prev_1_32', u'prev_1_33', u'prev_1_34', u'prev_1_35',
       u'prev_1_36', u'prev_1_37', u'prev_1_38', u'prev_1_39', u'prev_1_40',
       u'prev_1_41', u'prev_1_42', u'prev_1_43', u'prev_1_44', u'prev_1_45',
       u'prev_1_46', u'prev_1_47', u'prev_1_48', u'prev_1_49', u'prev_1_50'],
      dtype='object')
Index([u'prev_1_0', u'prev_1_1', u'prev_1_2', u'prev_1_3', u'prev_1_4',
       u'prev_1_5', u'prev_1_6', u'prev_1_7', u'prev_1_8', u'prev_1_9',
       u'prev_1_11', u'prev_1_12', u'prev_1_13', u'prev_1_14', u'prev_1_1

(16601, 71)

# Test the rule based correction Algorithm

In [119]:
# 1. First-level predictions for both splits
preds_train = clf.predict(X_train)
preds_test = clf.predict(X_test)

# 2. Per split: attach the predictions to a copy of the raw features, derive the
#    prev/next prediction columns, then
# 3. keep only the rows that aren't positive, i.e. pred_0 == 1
train_stacked = train_df.copy()
train_stacked["pred_0"] = preds_train
train_stacked = get_stacked_features(train_stacked, [0])
train_stacked = train_stacked[train_stacked["pred_0"] == 1]

test_stacked = test_df.copy()
test_stacked["pred_0"] = preds_test
test_stacked = get_stacked_features(test_stacked, [0])
test_stacked = test_stacked[test_stacked["pred_0"] == 1]

#4. Simple rule based classifier
def rbc_predict(df):
    """Rule-based second-level prediction, one label per row of ``df``.

    A row is labelled 0 when both neighbouring first-level predictions
    (``pred0_prev_1`` and ``pred0_next_1``) are 0 and the word is "de";
    every other row is labelled 1. Returns a plain list of ints.
    """
    # An earlier rule (upper == 1, pred0_prev_1 == 0, pos > 0 -> 0) was left
    # disabled in the original and is not applied here either.
    neighbours = zip(df["pred0_prev_1"], df["pred0_next_1"], df["word"])
    return [
        0 if (prev_pred == 0 and next_pred == 0 and word == "de") else 1
        for prev_pred, next_pred, word in neighbours
    ]

preds_train2 = rbc_predict(train_stacked)
preds_test2 = rbc_predict(test_stacked)

# Score against the labels of the very frames that were just predicted on,
# instead of y_train2 / y_test2 left over from another cell: the labels then
# always line up row-for-row with the predictions.
# Parenthesised single-argument prints behave the same on Python 2 and 3.
print(classification_report(train_stacked["iob"].values, preds_train2))
print(classification_report(test_stacked["iob"].values, preds_test2))


             precision    recall  f1-score   support

          0       0.18      0.01      0.02       568
          1       0.99      1.00      0.99     50936

avg / total       0.98      0.99      0.98     51504

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       117
          1       0.99      1.00      1.00     16199

avg / total       0.99      0.99      0.99     16316



In [120]:
# Attach the rule-based predictions and list the rows the rule labelled 0.
train_stacked.loc[:, "pred"] = preds_train2
train_stacked.loc[
    train_stacked["pred"] == 0,
    ["art_id", "sent_id", "pos", "upper", "pred0_prev_1", "pred0_next_1", "word", "iob", "pred"],
]

Unnamed: 0,art_id,sent_id,pos,upper,pred0_prev_1,pred0_next_1,word,iob,pred
3780,32,9,15,0,0,0,de,0,0
4762,46,4,35,0,0,0,de,1,0
5286,55,1,21,0,0,0,de,0,0
7010,78,8,18,0,0,0,de,0,0
12607,390,0,58,0,0,0,de,1,0
16242,495,4,26,0,0,0,de,0,0
20294,643,6,16,0,0,0,de,0,0
21171,671,0,43,0,0,0,de,1,0
21423,685,0,36,0,0,0,de,1,0
21957,707,0,48,0,0,0,de,1,0


In [123]:
# Rebuild the unfiltered stacked tables to inspect a row together with its neighbours.
preds_train = clf.predict(X_train)
preds_test = clf.predict(X_test)

train_stacked = train_df.copy()
train_stacked["pred_0"] = preds_train
test_stacked = test_df.copy()
test_stacked["pred_0"] = preds_test

#2. Generate features with prev and next
train_stacked = get_stacked_features(train_stacked, [0])
test_stacked = get_stacked_features(test_stacked, [0])

train_stacked.head(10)  # not rendered: only the cell's last expression is displayed
train_stacked.loc[21170:21172]

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type,upper,...,prev_prefix_1,prev_prefix_2,in_Country,in_State,in_City,pred_0,pred0_prev_1,pred0_prev_2,pred0_next_1,pred0_next_2
21170,671,0,754,42,Suyapa,22,I-City,0,City,1,...,0,1,0,0,0,0,0,1,1,0
21171,671,0,754,43,de,34,none,1,none,0,...,0,0,0,0,0,1,0,0,0,1
21172,671,0,754,44,Tegucigalpa,22,B-City,0,City,1,...,0,0,0,0,1,0,1,0,1,1


# Try HMM