# Introduction
This notebook is to test the classifier training phase of the NER Identifier.

### Specifically
- Specialized classifiers by geo-type

In [58]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, make_scorer, classification_report, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Basic processing functions

In [4]:
iob_map = {
    "B":0,
    "I": 1,
    "n":2
}

def changeIob(df,geo_type):
    df.loc[:,"iob"] = df.iob_tag.apply(lambda x: iob_map[x[0]] if x[2:] == geo_type else 2)
    return df

In [5]:
def addNewFeatures(df):
    df.loc[:,"in_gazette"] = ((df["in_Country"] | df["in_State"] | df["in_City"]) & df["upper"])\
            .apply(lambda x: int(not x)).tolist()
    return df

In [6]:
# formats in X and y
def getXY(df, dropColumns, includeColumns=None, names=False):
    df = addNewFeatures(df)
    if includeColumns == None:
        X = df.drop(dropColumns, 1).values
    else:
        X = df[includeColumns].values
    y = df.iob.values
    # print "-- iob counts --"
    # print df.iob.value_counts()
    if names:
        features_names = df.drop(dropColumns, 1).columns
        return X, y, features_names
    return X, y

In [7]:
# add stacked features 
def add_stacked_feature(clf_ix, pos, df):
    preds = df["pred_"+str(clf_ix)].values.tolist()
    # decide column name
    if pos < 0:
        column = "pred" + str(clf_ix) + "_prev_" + str(-pos)
    elif pos > 0:
        column = "pred" + str(clf_ix) + "_next_" + str(pos)
    else:
        print "- ERROR: 0 value passed"
        return None
    # shift list
    df.loc[:,column ] = preds[pos:] + preds[:pos]
    # correct the values for the first words
    # if pos < 0:
    #     df.at[df[column] < -pos,column] = 1
   
    return df

# produces features from the predictions
def get_stacked_features(df, lvl_1_classifiers):
    for i in range(len(lvl_1_classifiers)):
        df = add_stacked_feature(i, -1, df)
        df = add_stacked_feature(i, -2, df)
        df = add_stacked_feature(i, 1, df)
        df = add_stacked_feature(i, 2, df)

    return df

In [50]:
#trains a model and scores it
def train_score(model, params, X_train, y_train, X_test, y_test, scores=False  ):
    f_one_scorer = make_scorer(f1_score,labels=[0,1], average="weighted")
    clf = GridSearchCV(model, params, cv=5, scoring= f_one_scorer, verbose=1, n_jobs=4 )
    clf.fit(X_train, y_train)

    if scores:
        print clf.best_score_
        print clf.best_params_
        print clf.cv_results_['mean_train_score']
        print clf.cv_results_['mean_test_score']

        print "- Train Results -"
        preds_train = clf.predict(X_train)
        print classification_report(y_train, preds_train,labels=[0,1] )

        print "- Test Results -"
        preds_test = clf.predict(X_test)
        print classification_report(y_test, preds_test,labels=[0,1])    
        
    
    return clf

In [23]:
def extract_entities(words, column):
    data = []
    for index, row in words.iterrows():
        if row[column][0] == "B":
            data.append([
                row.art_id,
                row.sent_id,
                row.cs_id,
                row.pos,
                row.word,
                row.geo_type
            ])
        elif row[column][0] == "I":
            if len(data) > 0:
                data[-1][4] += " " + row.word

    df = pd.DataFrame(data, columns=["art_id","sent_id","cs_id","pos","entity","geo_type"])

    return df

In [80]:
def score_entities(true, pred):
    geo_types =["Country","State","City","Zone","Col","Bar"]
    
    data = []
    for geo_type in geo_types:
        # Find the true positives
        true_positive = 0
        for index, entity in pred[pred["geo_type"]==geo_type].iterrows():
            result = true[ (true["geo_type"] == geo_type)
                          & (true["cs_id"] == entity.cs_id)
                          & (true["pos"] == entity.pos)
                          & (true["entity"] == entity.entity)
                         ]
            if len(result) > 0:
                true_positive += 1

        support = true[true["geo_type"]==geo_type].shape[0]
#         print "true positives: %i" %true_positive
#         print "predicted positives: %i" % pred[pred["geo_type"]==geo_type].shape[0]
#         print "real positives: %i" % support
        if true_positive == 0:
            precision =0.0
            recall = 0.0
            fscore = 0.0
        else:
            precision = true_positive * 1.0 / pred[pred["geo_type"]==geo_type].shape[0]
#         print "precision: %0.4f" %precision
            recall = true_positive * 1.0 / support
#         print "recall: %0.4f" %recall

        
            fscore = (precision * recall)*2.0 / (precision + recall)
#         print "fscores: %0.4f" %fscore
        
        data.append([geo_type,precision,recall,fscore,support ])
        
    scores_df = pd.DataFrame(data,columns=["geo_type","precision","recall","fscore","support"])
    
   
    f1_result = (scores_df["fscore"] * scores_df["support"]).sum() / scores_df["support"].sum()
    
 

    return scores_df, f1_result

In [11]:
desc_columns = ["art_id","sent_id", "cs_id","word", "iob_tag","iob" ,"geo_type",
#                 "first", "first_sent", "next_2", "pos_tag", "next_1", "prev_2"
               ]

# Initial Preds

In [13]:
def combine_preds(orig_preds, new_pred, geo_type):
    for idx, pred in enumerate(new_pred):
        if pred == 0:
            orig_preds[idx] = "B_"+ geo_type
        elif pred == 1:
            orig_preds[idx] = "I_"+ geo_type
    return orig_preds

In [83]:
def train_test_predict_rfc(train_df, test_df, parameters =None, scores=False, importance=False):
    X_train, y_train, features_names = getXY(train_df, desc_columns, names=True)
    X_test, y_test = getXY(test_df, desc_columns)
    
    if parameters == None:
        parameters = {
            "n_estimators": [200],
            "max_depth": [20], #[3,4,5,8,10,20],
            "min_samples_split" : [4],
            "max_features": [.3] #[.8,.5,.3,.1]
        }
    rfc = RandomForestClassifier(random_state= 233, n_jobs=4)
    clf = train_score(rfc, parameters, X_train, y_train, X_test, y_test, scores=scores )
    #predict 
    tr_preds = clf.predict(X_train) 
    te_preds = clf.predict(X_test) 
    
    if importance:
        print pd.DataFrame({
            "names": features_names,
            "importance": clf.best_estimator_.feature_importances_
        }).sort_values(by="importance", ascending=False)
    
    return tr_preds, te_preds

# Classifiers

In [15]:
def country_clf(train_df,test_df, preds_train, preds_test):
    train_df = changeIob(train_df,"Country")
    test_df = changeIob(test_df,"Country")
    tr_preds,te_preds = train_test_predict_rfc(train_df, test_df)
    
    #combine
    preds_train = combine_preds(preds_train, tr_preds, "Country")
    preds_test = combine_preds(preds_test, te_preds, "Country")
    
    return preds_train, preds_test

In [31]:
def state_clf(train_df,test_df, preds_train, preds_test):
    train_df = changeIob(train_df,"State")
    test_df = changeIob(test_df,"State")
    tr_preds,te_preds = train_test_predict_rfc(train_df, test_df)
    
    #combine
    preds_train = combine_preds(preds_train, tr_preds, "State")
    preds_test = combine_preds(preds_test, te_preds, "State")
    
    return preds_train, preds_test

In [32]:
def city_clf(train_df,test_df, preds_train, preds_test):
    train_df = changeIob(train_df,"City")
    test_df = changeIob(test_df,"City")
    tr_preds,te_preds = train_test_predict_rfc(train_df, test_df)
    
    #combine
    preds_train = combine_preds(preds_train, tr_preds, "City")
    preds_test = combine_preds(preds_test, te_preds, "City")
    
    return preds_train, preds_test

In [89]:
def zone_clf(train_df,test_df, preds_train, preds_test, scores=False, importance=False):
    train_df = changeIob(train_df,"Zone")
    test_df = changeIob(test_df,"Zone")
    
    parameters = {
        "n_estimators": [200],
        "max_depth": [20],#[3,4,5,8,10,20],
        "min_samples_split" : [4], #[4,8,20],
        "max_features": [.8] # [.8,.5,.3,.1]
    }
    
    
    tr_preds,te_preds = train_test_predict_rfc(train_df, test_df, \
                                               parameters=parameters, scores=scores, importance=importance)
    
    #combine
    preds_train = combine_preds(preds_train, tr_preds, "Zone")
    preds_test = combine_preds(preds_test, te_preds, "Zone")
    
    return preds_train, preds_test

In [55]:
def col_clf(train_df,test_df, preds_train, preds_test, scores=False, importance=False):
    train_df = changeIob(train_df,"Col")
    test_df = changeIob(test_df,"Col")
    tr_preds,te_preds = train_test_predict_rfc(train_df, test_df, scores=scores, importance=importance)
    
    #combine
    preds_train = combine_preds(preds_train, tr_preds, "Col")
    preds_test = combine_preds(preds_test, te_preds, "Col")
    
    return preds_train, preds_test

In [56]:
def bar_clf(train_df,test_df, preds_train, preds_test, scores=False, importance=False):
    train_df = changeIob(train_df,"Bar")
    test_df = changeIob(test_df,"Bar")
    tr_preds,te_preds = train_test_predict_rfc(train_df, test_df, scores=scores, importance=importance)
    
    #combine
    preds_train = combine_preds(preds_train, tr_preds, "Bar")
    preds_test = combine_preds(preds_test, te_preds, "Bar")
    
    return preds_train, preds_test

# Validate

In [114]:
train_df = pd.read_csv("../files/ner_features_train.csv")
test_df = pd.read_csv("../files/ner_features_test.csv")

In [115]:
print train_df.cs_id.unique()[:20]
print test_df.cs_id.unique()[:20]

[ 0  1  3  5  6  8 10 11 13 14 15 16 17 18 19 21 22 23 25 28]
[ 2  4  7 12 24 26 27 31 48 49 52 53 54 55 56 57 63 66 68 72]


In [116]:
preds_train = ["O" for i in range(train_df.shape[0])]
preds_test = ["O" for i in range(test_df.shape[0])]

In [117]:
preds_train, preds_test = zone_clf(train_df,test_df, preds_train, preds_test)
preds_train, preds_test = country_clf(train_df,test_df, preds_train, preds_test)
preds_train, preds_test = state_clf(train_df,test_df, preds_train, preds_test)
preds_train, preds_test = city_clf(train_df,test_df, preds_train, preds_test)
preds_train, preds_test = col_clf(train_df,test_df, preds_train, preds_test)
preds_train, preds_test = bar_clf(train_df,test_df, preds_train, preds_test)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   50.4s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   17.0s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   20.7s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   25.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   27.3s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   16.7s finished


In [97]:
preds_train, preds_test = zone_clf(train_df,test_df, preds_train, preds_test,scores=True,importance=True)

# 0       0.75      0.10      0.18        30
# 1       0.60      0.08      0.15        36
# 22 24

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   57.1s finished


0.284803728144
{'max_features': 0.8, 'min_samples_split': 4, 'n_estimators': 200, 'max_depth': 20}
[ 0.88321402]
[ 0.28480373]
- Train Results -
             precision    recall  f1-score   support

          0       1.00      0.81      0.90       111
          1       1.00      0.72      0.84       129

avg / total       1.00      0.76      0.86       240

- Test Results -
             precision    recall  f1-score   support

          0       0.69      0.37      0.48        30
          1       0.58      0.19      0.29        36

avg / total       0.63      0.27      0.38        66

    importance                names
0     0.192099                  pos
26    0.136070            sent_size
6     0.105731                 size
9     0.095907               prev_2
11    0.082397               next_2
8     0.074025               prev_1
10    0.058814               next_1
16    0.038785         triggerzone1
17    0.035464         triggerzone2
2     0.034386                upper
7     0.0240

# Entity Scores

In [118]:
final_train = train_df.copy()
final_test = test_df.copy()

final_train.loc[:,"pred"] = preds_train
final_test.loc[:,"pred"] = preds_test

#Train
print "--Train"
true_entities = extract_entities(final_train, "iob_tag")
pred_entities = extract_entities(final_train, "pred")
print score_entities(true_entities, pred_entities)

#Train
print "--Test"
true_entities = extract_entities(final_test, "iob_tag")
pred_entities = extract_entities(final_test, "pred")
print score_entities(true_entities, pred_entities)


--Train
(  geo_type  precision    recall    fscore  support
0  Country   0.950704  0.912162  0.931034      148
1    State   0.964789  0.867089  0.913333      158
2     City   0.949602  0.875306  0.910941      409
3     Zone   0.797872  0.641026  0.710900      117
4      Col   0.735537  0.635714  0.681992      140
5      Bar   0.960000  0.923077  0.941176       26, 0.8595187421009722)
--Test
(  geo_type  precision    recall    fscore  support
0  Country   0.950000  0.950000  0.950000       40
1    State   1.000000  0.794118  0.885246       34
2     City   0.793103  0.669903  0.726316      103
3     Zone   0.666667  0.250000  0.363636       24
4      Col   0.758621  0.687500  0.721311       32
5      Bar   1.000000  1.000000  1.000000        2, 0.7519920294124027)


In [None]:
# --Train
# (  geo_type  precision    recall    fscore  support
# 0  Country   0.979452  0.953333  0.966216      150
# 1    State   0.977941  0.869281  0.920415      153
# 2     City   0.941799  0.870416  0.904701      409
# 3     Zone   0.720588  0.441441  0.547486      111
# 4      Col   0.717949  0.617647  0.664032      136
# 5      Bar   0.952381  0.909091  0.930233       22, 0.8433467888927045)
# --Test
# (  geo_type  precision    recall    fscore  support
# 0  Country   0.971429  0.894737  0.931507       38
# 1    State   0.878788  0.743590  0.805556       39
# 2     City   0.806818  0.689320  0.743455      103
# 3     Zone   0.600000  0.100000  0.171429       30
# 4      Col   0.652174  0.416667  0.508475       36
# 5      Bar   1.000000  0.666667  0.800000        6, 0.681102321665181)

# NEXT STEPS:
- GEt better scores for ZONE COL and BAR

In [79]:
results = [
    [0.931507,38],
    [0.794521,39],
    [0.715789,103],
    [0.062500,30],
    [0.275862,36],
    [0.000001,6],
]

num = sum([x[1] for x in results])
sum([x[0]*(x[1]*1.0/num) for x in results])


0.6028408333333334

981

In [93]:
test_df[test_df["iob_tag"].isin(["B-Zone","I-Zone"])]

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type,upper,...,triggerbarrio2,triggerresidencial1,triggerresidencial2,trigger_1,trigger_2,sent_size,in_Country,in_State,in_City,in_gazette
27,3,5,5,19,Monumento,22,B-Zone,2,Zone,1,...,0,0,0,0,0,44,0,0,0,1
28,3,5,5,20,a,34,I-Zone,2,Zone,0,...,0,0,0,0,0,44,0,0,0,1
29,3,5,5,21,la,4,I-Zone,2,Zone,0,...,0,0,0,0,0,44,0,0,0,1
30,3,5,5,22,Madre,22,I-Zone,2,Zone,1,...,0,0,0,1,0,44,0,0,0,1
1704,78,4,297,29,La,4,B-Zone,2,Zone,1,...,0,0,0,1,0,48,0,0,0,1
1705,78,4,297,30,Cumbre,22,I-Zone,2,Zone,1,...,0,0,0,1,1,48,0,0,0,1
1706,78,4,297,31,de,34,I-Zone,2,Zone,0,...,0,0,0,0,1,48,0,0,0,1
1707,78,4,297,32,Trojes,22,I-Zone,2,Zone,1,...,0,0,0,1,0,48,0,0,1,0
2360,192,1,416,28,6,50,B-Zone,2,Zone,0,...,0,0,0,1,1,57,0,0,0,1
2361,192,1,416,29,calle,22,I-Zone,2,Zone,0,...,0,0,0,0,1,57,0,0,0,1
