In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.metrics import f1_score, make_scorer, classification_report, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import unicodedata
from sklearn.preprocessing import LabelEncoder

In [52]:
# Load the pre-computed per-token NER feature tables (train and test split).
train_df, test_df = map(
    pd.read_csv,
    ("../files/ner_features_train.csv", "../files/ner_features_test.csv"),
)

In [66]:
def _rule_layer_preds(df):
    """Return the first-layer 0/1 prediction for every row of `df` as a list.

    A row gets 0 when it is flagged by any gazetteer column (`in_Country`,
    `in_State`, `in_City`) and its `upper` flag is set; every other row gets 1.
    """
    in_gazetteer = df["in_Country"] | df["in_State"] | df["in_City"]
    flagged = in_gazetteer & df["upper"]
    return [int(not hit) for hit in flagged]


preds_train = _rule_layer_preds(train_df)
preds_test = _rule_layer_preds(test_df)

In [67]:
# Evaluate the rule-based first layer against the gold `iob` labels of the
# full tables. The labels are read straight from the data frames because
# `y_train` / `y_test` are only assigned further down, and there they hold the
# filtered second-layer subset, whose length differs from these predictions.
print("- Train Results -")
print(classification_report(train_df["iob"], preds_train))

print("- Test Results -")
print(classification_report(test_df["iob"], preds_test))

# Figures noted by hand (presumably class-0 precision / recall / f1 of an
# earlier run -- confirm):
#  0.84      0.52      0.65 
#  0.84      0.51      0.63

- Train Results -
             precision    recall  f1-score   support

          0       0.89      0.52      0.65      1595
          1       0.99      1.00      0.99     53792

avg / total       0.98      0.98      0.98     55387

- Test Results -
             precision    recall  f1-score   support

          0       0.88      0.50      0.64       413
          1       0.98      1.00      0.99     13448

avg / total       0.98      0.98      0.98     13861



# ...

In [71]:
# train_df2[(train_df2["pred"]==0)&(train_df2["iob"]==1)][["art_id","sent_id","pos",  "word","in_Country","in_State","in_City"]]

In [38]:
# Gazetteers: country names, departments/municipalities and world cities.
# NOTE: `to_ascii` is defined in a later cell of this notebook; that cell has
# to be executed before this one.
countries_df = pd.read_csv("../files/countries.csv", encoding="utf-8")
dep_mun_df = pd.read_csv("../files/DepartamentosMunicipios.csv", encoding="utf-8")
world_cities = pd.read_csv("../files/ciudades_mundo.csv", encoding="utf-8")


def _ascii_lower(text):
    """Fold `text` to ASCII with `to_ascii` and lower-case the result."""
    return to_ascii(text).lower()


# Normalise every name column in place so lookups ignore accents and case.
countries_df["value"] = countries_df["value"].apply(_ascii_lower)
dep_mun_df["Departamento"] = dep_mun_df["Departamento"].apply(_ascii_lower)
dep_mun_df["Municipio"] = dep_mun_df["Municipio"].apply(_ascii_lower)
world_cities["city"] = world_cities["city"].apply(_ascii_lower)

# Layer 2

In [53]:
# formats in X and y
def getXY(df, dropColumns, includeColumns=None, names=False):
    """Split a feature table into a feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns and the target column ``iob``.
    dropColumns : list
        Columns removed from ``df`` to obtain the features. Only used when
        ``includeColumns`` is None.
    includeColumns : list, optional
        Explicit list of feature columns. When given, it takes precedence
        over ``dropColumns``.
    names : bool
        When True, also return the column labels of ``X``.

    Returns
    -------
    (X, y) or (X, y, features_names)
        ``X`` is ``.values`` of the selected feature columns, ``y`` is
        ``df.iob.values`` and ``features_names`` is the column index that
        matches the columns of ``X``.
    """
    # `is None` instead of `== None`: an array-like `includeColumns` would
    # otherwise be compared element-wise.
    if includeColumns is None:
        # `axis` is passed by keyword; recent pandas rejects it positionally.
        features = df.drop(dropColumns, axis=1)
    else:
        features = df[includeColumns]
    X = features.values
    y = df.iob.values
    if names:
        # Names come from the frame that actually produced X, so they stay
        # aligned with its columns even when `includeColumns` is used.
        return X, y, features.columns
    return X, y

In [68]:
def to_ascii(s):
    """Return the unicode string `s` with every non-ASCII character removed.

    The text is NFKD-normalised first, so an accented letter is split into
    its base letter plus combining marks; the marks are then dropped by the
    ASCII encode with ``'ignore'`` and the base letter is kept.

    The encoded bytes are decoded back to text so the result is a text string
    on both Python 2 and Python 3 and can be compared with other text.
    """
    ascii_bytes = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

In [82]:
# Work on copies of the feature tables that carry the first-layer
# prediction in an extra `pred` column.
train_df2 = train_df.assign(pred=preds_train)
test_df2 = test_df.assign(pred=preds_test)

# Keep only the rows the first layer labelled 1 (i.e. the rows its rule did
# not flag); these are the rows handed to the second layer.
train_df2 = train_df2.loc[train_df2["pred"] == 1]
test_df2 = test_df2.loc[test_df2["pred"] == 1]

In [83]:
# Descriptive / label columns that are removed before building the feature
# matrix. NOTE: the `pred` column is not listed here, so it stays in X; after
# the filtering above it is 1 for every row.
desc_columns = [
    "art_id", "sent_id", "cs_id", "word", "iob_tag", "iob", "geo_type",
    # "first", "first_sent", "next_2", "pos_tag", "next_1", "prev_2",
]
X_train, y_train, features_names = getXY(train_df2, desc_columns, names=True)
X_test, y_test = getXY(test_df2, desc_columns)

In [84]:
def train_score(model, params, X_train, y_train, X_test, y_test):
    """Grid-search `model` over `params`, print diagnostics, return the search.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    X_train, y_train
        Data the 5-fold search is fitted on.
    X_test, y_test
        Held-out data, used only for the final classification report.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    # Model selection is driven by the F1 score computed for label 0 only.
    f_one_scorer = make_scorer(f1_score, labels=[0], average="weighted")
    clf = GridSearchCV(model, params, cv=5, scoring=f_one_scorer, verbose=1, n_jobs=4)
    clf.fit(X_train, y_train)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(clf.best_score_)
    print(clf.best_params_)
    # Train scores are not always recorded by the search (it depends on the
    # scikit-learn version / `return_train_score`), so only print them when
    # the key exists instead of failing with a KeyError.
    if 'mean_train_score' in clf.cv_results_:
        print(clf.cv_results_['mean_train_score'])
    print(clf.cv_results_['mean_test_score'])

    print("- Train Results -")
    fitted_train_preds = clf.predict(X_train)
    print(classification_report(y_train, fitted_train_preds))

    print("- Test Results -")
    held_out_preds = clf.predict(X_test)
    print(classification_report(y_test, held_out_preds))

    return clf

In [85]:
# Hyper-parameter grid for the second-layer random forest; only
# `max_features` is actually varied.
parameters = dict(
    n_estimators=[200],
    max_depth=[20],  # alternative grid: [3,4,5,8,10,20]
    min_samples_split=[4],
    max_features=[.8, .5, .3, .1],
)
rfc = RandomForestClassifier(n_jobs=4, random_state=233)
clf = train_score(rfc, parameters, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:  1.8min finished


0.590388566832
{'max_features': 0.5, 'min_samples_split': 4, 'n_estimators': 200, 'max_depth': 20}
[ 0.97400444  0.96685014  0.94604985  0.86302794]
[ 0.58224071  0.59038857  0.57906091  0.42799107]
- Train Results -
             precision    recall  f1-score   support

          0       0.99      0.94      0.96       771
          1       1.00      1.00      1.00     53689

avg / total       1.00      1.00      1.00     54460

- Test Results -
             precision    recall  f1-score   support

          0       0.81      0.50      0.62       206
          1       0.99      1.00      1.00     13421

avg / total       0.99      0.99      0.99     13627



In [89]:
# Combine both layers on the full test set.
final_test = test_df.copy()
final_test["pred"] = preds_test
# Only the rows that reached the second layer (the index of test_df2) receive
# a second-layer prediction; every other row keeps NaN in `pred2`.
final_test.loc[test_df2.index, "pred2"] = clf.predict(X_test)
# A row is labelled 1 only when both layers predicted 1. Comparing each column
# with 1 makes the NaN rows evaluate to False explicitly, instead of relying
# on how pandas coerces a bitwise `&` between an int and a float/NaN column.
both_layers_say_one = (final_test["pred"] == 1) & (final_test["pred2"] == 1)
final_test["pred_final"] = both_layers_say_one.astype(int)

final_test.head()

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type,upper,...,next_2,prev_prefix_1,prev_prefix_2,sent_size,in_Country,in_State,in_City,pred,pred2,pred_final
0,3,1,1,3,momento,22,none,1,none,0,...,22,0,0,32,0,0,0,1,1.0,1
1,3,1,1,7,sido,47,none,1,none,0,...,3,0,0,32,0,0,0,1,1.0,1
2,3,1,1,31,Rivera,22,none,1,none,1,...,1,0,0,32,0,0,0,1,1.0,1
3,3,2,2,0,Los,4,none,1,none,1,...,34,0,0,16,0,0,0,1,1.0,1
4,3,2,2,1,operativos,1,none,1,none,0,...,4,0,0,16,0,0,0,1,1.0,1


In [98]:
# Sanity check: rows the first layer labelled 1 that have no second-layer
# prediction and ended up as 0 in the final label.
final_test[
    (final_test["pred_final"] == 0)
    & final_test["pred2"].isnull()
    & (final_test["pred"] == 1)
].head(30)

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type,upper,...,next_2,prev_prefix_1,prev_prefix_2,sent_size,in_Country,in_State,in_City,pred,pred2,pred_final


In [93]:
# Score the combined two-layer prediction against the `iob` labels of the
# full test set. Single-argument print(...) runs on Python 2 and Python 3.
print("- Test Results -")
print(classification_report(final_test.iob.values, final_test.pred_final.values))

- Test Results -
             precision    recall  f1-score   support

          0       0.86      0.75      0.80       413
          1       0.99      1.00      0.99     13448

avg / total       0.99      0.99      0.99     13861

