# Introduction
This notebook is to test the classifier training phase of the NER Identifier.

### Specifically
- Specialized classifiers by geo-type

In [58]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, make_scorer, classification_report, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
train_df.head()

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type,upper,...,next_1,next_2,prev_prefix_1,prev_prefix_2,trigger_1,trigger_2,sent_size,in_Country,in_State,in_City
0,3,0,0,0,SAN,1,B-City,0,City,1,...,22,22,0,0,0,0,3,0,0,1
1,3,0,0,1,PEDRO,22,I-City,0,City,1,...,22,13,0,0,0,0,3,0,0,1
2,3,0,0,2,SULA,22,I-City,0,City,1,...,13,34,0,0,0,0,3,0,0,1
3,3,1,1,0,-,13,none,1,none,0,...,34,4,0,0,0,0,32,0,0,0
4,3,1,1,1,Hasta,34,none,1,none,1,...,4,22,0,0,0,0,32,0,0,0


# Basic processing functions

In [4]:
iob_map = {
    "B":0,
    "I": 1,
    "n":2
}

def changeIob(df,geo_type):
    df.loc[:,"iob"] = df.iob_tag.apply(lambda x: iob_map[x[0]] if x[2:] == geo_type else 2)
    return df

In [5]:
def addNewFeatures(df):
    df.loc[:,"in_gazette"] = ((df["in_Country"] | df["in_State"] | df["in_City"]) & df["upper"])\
            .apply(lambda x: int(not x)).tolist()
    return df

In [6]:
# formats in X and y
def getXY(df, dropColumns, includeColumns=None, names=False):
    df = addNewFeatures(df)
    if includeColumns == None:
        X = df.drop(dropColumns, 1).values
    else:
        X = df[includeColumns].values
    y = df.iob.values
    # print "-- iob counts --"
    # print df.iob.value_counts()
    if names:
        features_names = df.drop(dropColumns, 1).columns
        return X, y, features_names
    return X, y

In [7]:
# add stacked features 
def add_stacked_feature(clf_ix, pos, df):
    preds = df["pred_"+str(clf_ix)].values.tolist()
    # decide column name
    if pos < 0:
        column = "pred" + str(clf_ix) + "_prev_" + str(-pos)
    elif pos > 0:
        column = "pred" + str(clf_ix) + "_next_" + str(pos)
    else:
        print "- ERROR: 0 value passed"
        return None
    # shift list
    df.loc[:,column ] = preds[pos:] + preds[:pos]
    # correct the values for the first words
    # if pos < 0:
    #     df.at[df[column] < -pos,column] = 1
   
    return df

# produces features from the predictions
def get_stacked_features(df, lvl_1_classifiers):
    for i in range(len(lvl_1_classifiers)):
        df = add_stacked_feature(i, -1, df)
        df = add_stacked_feature(i, -2, df)
        df = add_stacked_feature(i, 1, df)
        df = add_stacked_feature(i, 2, df)

    return df

In [50]:
#trains a model and scores it
def train_score(model, params, X_train, y_train, X_test, y_test, scores=False  ):
    f_one_scorer = make_scorer(f1_score,labels=[0,1], average="weighted")
    clf = GridSearchCV(model, params, cv=5, scoring= f_one_scorer, verbose=1, n_jobs=4 )
    clf.fit(X_train, y_train)

    if scores:
        print clf.best_score_
        print clf.best_params_
        print clf.cv_results_['mean_train_score']
        print clf.cv_results_['mean_test_score']

        print "- Train Results -"
        preds_train = clf.predict(X_train)
        print classification_report(y_train, preds_train,labels=[0,1] )

        print "- Test Results -"
        preds_test = clf.predict(X_test)
        print classification_report(y_test, preds_test,labels=[0,1])    
        
    
    return clf

In [23]:
def extract_entities(words, column):
    data = []
    for index, row in words.iterrows():
        if row[column][0] == "B":
            data.append([
                row.art_id,
                row.sent_id,
                row.cs_id,
                row.pos,
                row.word,
                row.geo_type
            ])
        elif row[column][0] == "I":
            if len(data) > 0:
                data[-1][4] += " " + row.word

    df = pd.DataFrame(data, columns=["art_id","sent_id","cs_id","pos","entity","geo_type"])

    return df

In [80]:
def score_entities(true, pred):
    geo_types =["Country","State","City","Zone","Col","Bar"]
    
    data = []
    for geo_type in geo_types:
        # Find the true positives
        true_positive = 0
        for index, entity in pred[pred["geo_type"]==geo_type].iterrows():
            result = true[ (true["geo_type"] == geo_type)
                          & (true["cs_id"] == entity.cs_id)
                          & (true["pos"] == entity.pos)
                          & (true["entity"] == entity.entity)
                         ]
            if len(result) > 0:
                true_positive += 1

        support = true[true["geo_type"]==geo_type].shape[0]
#         print "true positives: %i" %true_positive
#         print "predicted positives: %i" % pred[pred["geo_type"]==geo_type].shape[0]
#         print "real positives: %i" % support
        if true_positive == 0:
            precision =0.0
            recall = 0.0
            fscore = 0.0
        else:
            precision = true_positive * 1.0 / pred[pred["geo_type"]==geo_type].shape[0]
#         print "precision: %0.4f" %precision
            recall = true_positive * 1.0 / support
#         print "recall: %0.4f" %recall

        
            fscore = (precision * recall)*2.0 / (precision + recall)
#         print "fscores: %0.4f" %fscore
        
        data.append([geo_type,precision,recall,fscore,support ])
        
    scores_df = pd.DataFrame(data,columns=["geo_type","precision","recall","fscore","support"])
    
   
    f1_result = (scores_df["fscore"] * scores_df["support"]).sum() / scores_df["support"].sum()
    
 

    return scores_df, f1_result

In [11]:
desc_columns = ["art_id","sent_id", "cs_id","word", "iob_tag","iob" ,"geo_type",
#                 "first", "first_sent", "next_2", "pos_tag", "next_1", "prev_2"
               ]

# Initial Preds

In [13]:
def combine_preds(orig_preds, new_pred, geo_type):
    for idx, pred in enumerate(new_pred):
        if pred == 0:
            orig_preds[idx] = "B_"+ geo_type
        elif pred == 1:
            orig_preds[idx] = "I_"+ geo_type
    return orig_preds

In [51]:
def train_test_predict_rfc(train_df, test_df, scores=False, importance=False):
    X_train, y_train, features_names = getXY(train_df, desc_columns, names=True)
    X_test, y_test = getXY(test_df, desc_columns)
    
    parameters = {
        "n_estimators": [200],
        "max_depth": [20], #[3,4,5,8,10,20],
        "min_samples_split" : [4],
        "max_features": [.3] #[.8,.5,.3,.1]
    }
    rfc = RandomForestClassifier(random_state= 233, n_jobs=4)
    clf = train_score(rfc, parameters, X_train, y_train, X_test, y_test, scores=scores )
    #predict 
    tr_preds = clf.predict(X_train) 
    te_preds = clf.predict(X_test) 
    
    if importance:
        print pd.DataFrame({
            "names": features_names,
            "importance": clf.best_estimator_.feature_importances_
        }).sort_values(by="importance", ascending=False)
    
    return tr_preds, te_preds

# Classifiers

In [15]:
def country_clf(train_df,test_df, preds_train, preds_test):
    train_df = changeIob(train_df,"Country")
    test_df = changeIob(test_df,"Country")
    tr_preds,te_preds = train_test_predict_rfc(train_df, test_df)
    
    #combine
    preds_train = combine_preds(preds_train, tr_preds, "Country")
    preds_test = combine_preds(preds_test, te_preds, "Country")
    
    return preds_train, preds_test

In [31]:
def state_clf(train_df,test_df, preds_train, preds_test):
    train_df = changeIob(train_df,"State")
    test_df = changeIob(test_df,"State")
    tr_preds,te_preds = train_test_predict_rfc(train_df, test_df)
    
    #combine
    preds_train = combine_preds(preds_train, tr_preds, "State")
    preds_test = combine_preds(preds_test, te_preds, "State")
    
    return preds_train, preds_test

In [32]:
def city_clf(train_df,test_df, preds_train, preds_test):
    train_df = changeIob(train_df,"City")
    test_df = changeIob(test_df,"City")
    tr_preds,te_preds = train_test_predict_rfc(train_df, test_df)
    
    #combine
    preds_train = combine_preds(preds_train, tr_preds, "City")
    preds_test = combine_preds(preds_test, te_preds, "City")
    
    return preds_train, preds_test

In [52]:
def zone_clf(train_df,test_df, preds_train, preds_test, scores=False, importance=False):
    train_df = changeIob(train_df,"Zone")
    test_df = changeIob(test_df,"Zone")
    tr_preds,te_preds = train_test_predict_rfc(train_df, test_df, scores=scores, importance=importance)
    
    #combine
    preds_train = combine_preds(preds_train, tr_preds, "Zone")
    preds_test = combine_preds(preds_test, te_preds, "Zone")
    
    return preds_train, preds_test

In [55]:
def col_clf(train_df,test_df, preds_train, preds_test, scores=False, importance=False):
    train_df = changeIob(train_df,"Col")
    test_df = changeIob(test_df,"Col")
    tr_preds,te_preds = train_test_predict_rfc(train_df, test_df, scores=scores, importance=importance)
    
    #combine
    preds_train = combine_preds(preds_train, tr_preds, "Col")
    preds_test = combine_preds(preds_test, te_preds, "Col")
    
    return preds_train, preds_test

In [56]:
def bar_clf(train_df,test_df, preds_train, preds_test, scores=False, importance=False):
    train_df = changeIob(train_df,"Bar")
    test_df = changeIob(test_df,"Bar")
    tr_preds,te_preds = train_test_predict_rfc(train_df, test_df, scores=scores, importance=importance)
    
    #combine
    preds_train = combine_preds(preds_train, tr_preds, "Bar")
    preds_test = combine_preds(preds_test, te_preds, "Bar")
    
    return preds_train, preds_test

# Validate

In [68]:
train_df = pd.read_csv("../files/ner_features_train.csv")
test_df = pd.read_csv("../files/ner_features_test.csv")

In [69]:
preds_train = ["O" for i in range(train_df.shape[0])]
preds_test = ["O" for i in range(test_df.shape[0])]

In [70]:
preds_train, preds_test = country_clf(train_df,test_df, preds_train, preds_test)
preds_train, preds_test = state_clf(train_df,test_df, preds_train, preds_test)
preds_train, preds_test = city_clf(train_df,test_df, preds_train, preds_test)
preds_train, preds_test = zone_clf(train_df,test_df, preds_train, preds_test)
preds_train, preds_test = col_clf(train_df,test_df, preds_train, preds_test)
preds_train, preds_test = bar_clf(train_df,test_df, preds_train, preds_test)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   21.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   38.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   39.8s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   40.0s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   42.6s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   15.1s finished


In [66]:
preds_train, preds_test = zone_clf(train_df,test_df, preds_train, preds_test,scores=True,importance=True)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   26.0s finished


0.131773353424
{'max_features': 0.3, 'min_samples_split': 4, 'n_estimators': 200, 'max_depth': 20}
[ 0.75976917]
[ 0.13177335]
- Train Results -
             precision    recall  f1-score   support

          0       1.00      0.59      0.75       111
          1       1.00      0.62      0.77       129

avg / total       1.00      0.61      0.76       240

- Test Results -
             precision    recall  f1-score   support

          0       0.75      0.10      0.18        30
          1       0.60      0.08      0.15        36

avg / total       0.67      0.09      0.16        66

    importance                names
0     0.154271                  pos
26    0.149759            sent_size
6     0.102140                 size
11    0.098900               next_2
10    0.072937               next_1
9     0.065161               prev_2
8     0.060169               prev_1
16    0.039615         triggerzone1
1     0.036789              pos_tag
17    0.035547         triggerzone2
2     0.0346

# Entity Scores

In [81]:
final_train = train_df.copy()
final_test = test_df.copy()

final_train.loc[:,"pred"] = preds_train
final_test.loc[:,"pred"] = preds_test

#Train
print "--Train"
true_entities = extract_entities(final_train, "iob_tag")
pred_entities = extract_entities(final_train, "pred")
print score_entities(true_entities, pred_entities)

#Train
print "--Test"
true_entities = extract_entities(final_test, "iob_tag")
pred_entities = extract_entities(final_test, "pred")
print score_entities(true_entities, pred_entities)


--Train
(  geo_type  precision    recall    fscore  support
0  Country   0.979452  0.953333  0.966216      150
1    State   0.977941  0.869281  0.920415      153
2     City   0.941799  0.870416  0.904701      409
3     Zone   0.720588  0.441441  0.547486      111
4      Col   0.717949  0.617647  0.664032      136
5      Bar   0.952381  0.909091  0.930233       22, 0.8433467888927045)
--Test
(  geo_type  precision    recall    fscore  support
0  Country   0.971429  0.894737  0.931507       38
1    State   0.878788  0.743590  0.805556       39
2     City   0.806818  0.689320  0.743455      103
3     Zone   0.600000  0.100000  0.171429       30
4      Col   0.652174  0.416667  0.508475       36
5      Bar   1.000000  0.666667  0.800000        6, 0.681102321665181)


In [None]:
# --Train
# (  geo_type  precision    recall    fscore  support
# 0  Country   0.979452  0.953333  0.966216      150
# 1    State   0.977941  0.869281  0.920415      153
# 2     City   0.941799  0.870416  0.904701      409
# 3     Zone   0.720588  0.441441  0.547486      111
# 4      Col   0.717949  0.617647  0.664032      136
# 5      Bar   0.952381  0.909091  0.930233       22, 0.8433467888927045)
# --Test
# (  geo_type  precision    recall    fscore  support
# 0  Country   0.971429  0.894737  0.931507       38
# 1    State   0.878788  0.743590  0.805556       39
# 2     City   0.806818  0.689320  0.743455      103
# 3     Zone   0.600000  0.100000  0.171429       30
# 4      Col   0.652174  0.416667  0.508475       36
# 5      Bar   1.000000  0.666667  0.800000        6, 0.681102321665181)

# NEXT STEPS:
- GEt better scores for ZONE COL and BAR

In [79]:
results = [
    [0.931507,38],
    [0.794521,39],
    [0.715789,103],
    [0.062500,30],
    [0.275862,36],
    [0.000001,6],
]

num = sum([x[1] for x in results])
sum([x[0]*(x[1]*1.0/num) for x in results])


0.6028408333333334

981