# Introduction
This notebook is part of a series of tests for the classifier training phase of the NER Identifier.

### Specifically....
This notebook studies the ussage of IOB instead of  OB 

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, make_scorer, classification_report, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [2]:
train_df = pd.read_csv("../files/ner_features_train.csv")
test_df = pd.read_csv("../files/ner_features_test.csv")

In [3]:
train_df.head()

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type,upper,...,triggerbarrio1,triggerbarrio2,triggerresidencial1,triggerresidencial2,trigger_1,trigger_2,sent_size,in_Country,in_State,in_City
0,3,0,0,0,SAN,1,B-City,0,City,1,...,0,0,0,0,0,0,3,0,0,1
1,3,0,0,1,PEDRO,22,I-City,0,City,1,...,0,0,0,0,0,0,3,0,0,1
2,3,0,0,2,SULA,22,I-City,0,City,1,...,0,0,0,0,0,0,3,0,0,1
3,3,1,1,0,-,13,none,1,none,0,...,0,0,0,0,0,0,32,0,0,0
4,3,1,1,1,Hasta,34,none,1,none,1,...,0,0,0,0,0,0,32,0,0,0


In [4]:
test_df.head()

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type,upper,...,triggerbarrio1,triggerbarrio2,triggerresidencial1,triggerresidencial2,trigger_1,trigger_2,sent_size,in_Country,in_State,in_City
0,3,4,4,0,Indicó,39,none,1,none,1,...,0,0,0,0,0,0,35,0,0,0
1,3,4,4,1,que,3,none,1,none,0,...,0,0,0,0,0,0,35,0,0,0
2,3,4,4,2,tienen,39,none,1,none,0,...,0,0,0,0,0,0,35,0,0,0
3,3,4,4,3,once,7,none,1,none,0,...,0,0,0,0,0,0,35,0,0,0
4,3,4,4,4,personas,22,none,1,none,0,...,0,0,0,0,0,0,35,0,0,0


## Correcting the iob

In [4]:
iob_map = {
    "B": 0,
    "I": 1,
    "n": 2
}

train_df.loc[:,"iob"] = train_df.iob_tag.apply(lambda x: iob_map[x[0]])
test_df.loc[:,"iob"] = test_df.iob_tag.apply(lambda x: iob_map[x[0]])
train_df.head()

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type,upper,...,next_1,next_2,prev_prefix_1,prev_prefix_2,trigger_1,trigger_2,sent_size,in_Country,in_State,in_City
0,3,0,0,0,SAN,1,B-City,0,City,1,...,22,22,0,0,0,0,3,0,0,1
1,3,0,0,1,PEDRO,22,I-City,1,City,1,...,22,13,0,0,0,0,3,0,0,1
2,3,0,0,2,SULA,22,I-City,1,City,1,...,13,34,0,0,0,0,3,0,0,1
3,3,1,1,0,-,13,none,2,none,0,...,34,4,0,0,0,0,32,0,0,0
4,3,1,1,1,Hasta,34,none,2,none,1,...,4,22,0,0,0,0,32,0,0,0


# Basic processing functions

In [5]:
def addNewFeatures(df):
    df.loc[:,"in_gazette"] = ((df["in_Country"] | df["in_State"] | df["in_City"]) & df["upper"])\
            .apply(lambda x: int(not x)).tolist()
    return df

In [6]:
# formats in X and y
def getXY(df, dropColumns, includeColumns=None, names=False):
    df = addNewFeatures(df)
    if includeColumns == None:
        X = df.drop(dropColumns, 1).values
    else:
        X = df[includeColumns].values
    y = df.iob.values
    # print "-- iob counts --"
    # print df.iob.value_counts()
    if names:
        features_names = df.drop(dropColumns, 1).columns
        return X, y, features_names
    return X, y

In [7]:
# add stacked features 
def add_stacked_feature(clf_ix, pos, df):
    preds = df["pred_"+str(clf_ix)].values.tolist()
    # decide column name
    if pos < 0:
        column = "pred" + str(clf_ix) + "_prev_" + str(-pos)
    elif pos > 0:
        column = "pred" + str(clf_ix) + "_next_" + str(pos)
    else:
        print "- ERROR: 0 value passed"
        return None
    # shift list
    df.loc[:,column ] = preds[pos:] + preds[:pos]
    # correct the values for the first words
    # if pos < 0:
    #     df.at[df[column] < -pos,column] = 1
   
    return df

# produces features from the predictions
def get_stacked_features(df, lvl_1_classifiers):
    for i in range(len(lvl_1_classifiers)):
        df = add_stacked_feature(i, -1, df)
        df = add_stacked_feature(i, -2, df)
        df = add_stacked_feature(i, 1, df)
        df = add_stacked_feature(i, 2, df)

    return df

In [74]:
#trains a model and scores it
def train_score(model, params, X_train, y_train, X_test, y_test):
    f_one_scorer = make_scorer(f1_score,labels=[0,1], average="weighted")
    clf = GridSearchCV(model, params, cv=5, scoring= f_one_scorer, verbose=0, n_jobs=4 )
    clf.fit(X_train, y_train)

    print clf.best_score_
    print clf.best_params_
    print clf.cv_results_['mean_train_score']
    print clf.cv_results_['mean_test_score']
        
    print "- Train Results -"
    preds_train = clf.predict(X_train)
    print classification_report(y_train, preds_train, labels=[0,1] )
    
    print "- Test Results -"
    preds_test = clf.predict(X_test)
    print classification_report(y_test, preds_test, labels=[0,1])
    
    return clf

## TODO LIST



- Dummy pos tags (even for pre and next)

- sentece Size
- previous word is uppercase
- feature selection L1
- Try KNN  (using minmax)
- Try SVC  (using minmax)
- Proba results?

- Try HMM

# Quickly test the Random Forest classifier

In [9]:
desc_columns = ["art_id","sent_id", "cs_id","word", "iob_tag","iob" ,"geo_type",
#                 "first", "first_sent", "next_2", "pos_tag", "next_1", "prev_2"
               ]
X_train, y_train, features_names = getXY(train_df, desc_columns, names=True)
X_test, y_test = getXY(test_df, desc_columns)

In [12]:
parameters = {
    "n_estimators": [200],
    "max_depth": [20], #[3,4,5,8,10,20],
    "min_samples_split" : [4],
    "max_features": [.3] #[.8,.5,.3,.1]
}
rfc = RandomForestClassifier(random_state= 233, n_jobs=4)
clf = train_score(rfc, parameters, X_train, y_train, X_test, y_test)
# 0.87      0.74      0.80       368  max depth 20 original features
# 0.91      0.75      0.82       413  new features and distribution
# 0.87      0.79      0.83       252

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   32.5s finished


0.766038245021
{'max_features': 0.3, 'min_samples_split': 4, 'n_estimators': 200, 'max_depth': 20}
[ 0.96432568]
[ 0.76603825]
- Train Results -
             precision    recall  f1-score   support

          0       0.98      0.96      0.97       981
          1       1.00      0.91      0.95       614

avg / total       0.99      0.94      0.96      1595

- Test Results -
             precision    recall  f1-score   support

          0       0.87      0.79      0.83       252
          1       0.90      0.58      0.71       161

avg / total       0.88      0.71      0.78       413



In [None]:
pd.DataFrame({
    "names": features_names,
    "importance": clf.best_estimator_.feature_importances_
}).sort_values(by="importance", ascending=False)


## test svc

In [42]:
from sklearn.svm import SVC
parameters = {
    "C":[100.0], # [100,10,1,0.1,0.01],
    "kernel":["linear"] #,[ "poly", "rbf", "sigmoid"],
}

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svc_clf = SVC(random_state = 452)
clf2 = train_score(svc_clf, parameters, X_train_scaled, y_train, X_test_scaled, y_test)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] kernel=linear, C=100 ............................................
[CV] kernel=linear, C=100 ............................................
[CV] kernel=linear, C=100 ............................................
[CV] kernel=linear, C=100 ............................................
[CV] ............................. kernel=linear, C=100, total=  43.7s
[CV] kernel=linear, C=100 ............................................
[CV] ............................. kernel=linear, C=100, total=  47.0s
[CV] kernel=linear, C=10 .............................................
[CV] ............................. kernel=linear, C=100, total=  56.6s
[CV] kernel=linear, C=10 .............................................
[CV] .............................. kernel=linear, C=10, total=   7.2s
[CV] kernel=linear, C=10 .............................................
[CV] ............................. kernel=linear, C=100, total= 1.0min
[CV] kernel=linea

[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:  2.1min finished


0.709363615632
{'kernel': 'linear', 'C': 100}
[ 0.71076641  0.71076641  0.70581867  0.69309714  0.64437155]
[ 0.70936362  0.70936362  0.69856155  0.68820102  0.63996254]
- Train Results -
             precision    recall  f1-score   support

          0       0.87      0.74      0.80       981
          1       0.80      0.52      0.63       614

avg / total       0.84      0.66      0.73      1595

- Test Results -
             precision    recall  f1-score   support

          0       0.89      0.75      0.82       252
          1       0.85      0.50      0.63       161

avg / total       0.88      0.66      0.75       413



# simple classifier

In [48]:
from sklearn import tree
parameters = {
    "max_depth":[8],
    "min_samples_split" : [10,20,40]
}
dtc = tree.DecisionTreeClassifier(random_state=343)

clf3 = train_score(dtc, parameters, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] min_samples_split=10, max_depth=8 ...............................
[CV] min_samples_split=10, max_depth=8 ...............................
[CV] min_samples_split=10, max_depth=8 ...............................
[CV] min_samples_split=10, max_depth=8 ...............................
[CV] ................ min_samples_split=10, max_depth=8, total=   0.4s
[CV] min_samples_split=10, max_depth=8 ...............................
[CV] ................ min_samples_split=10, max_depth=8, total=   0.6s
[CV] min_samples_split=20, max_depth=8 ...............................
[CV] ................ min_samples_split=10, max_depth=8, total=   0.6s
[CV] min_samples_split=20, max_depth=8 ...............................
[CV] ................ min_samples_split=10, max_depth=8, total=   0.7s
[CV] min_samples_split=20, max_depth=8 ...............................
[CV] ................ min_samples_split=10, max_depth=8, total=   0.7s
[CV] min_samples_

[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    2.7s finished


0.746315814162
{'min_samples_split': 20, 'max_depth': 8}
[ 0.78652714  0.77773677  0.76196798]
[ 0.73915316  0.74631581  0.73475652]
- Train Results -
             precision    recall  f1-score   support

          0       0.91      0.77      0.83       981
          1       0.91      0.55      0.68       614

avg / total       0.91      0.68      0.78      1595

- Test Results -
             precision    recall  f1-score   support

          0       0.90      0.77      0.83       252
          1       0.87      0.52      0.65       161

avg / total       0.89      0.67      0.76       413



# Test Second level (stacked) Algorithm
This algorithm uses the results of the first one as features

In [37]:
train_stacked[(train_stacked["pred_0"]==2)&(train_stacked["pred_1"]==1)][["word","iob"]]

Unnamed: 0,word,iob
580,Hadas,2
1260,Francisco,2
1839,Sevilla,2
2884,Pino,2
4338,Isidro,2
4499,Colón,2
4849,de,2
6173,Cabañas,2
6398,Cabañas,2
7804,MUERTELos,2


In [49]:
#1. get preds and add them to the original features
train_stacked = train_df.copy()
test_stacked = test_df.copy()
preds_train = clf.predict(X_train)
preds_test = clf.predict(X_test)

train_stacked.loc[:,"pred_0"] = preds_train
test_stacked.loc[:,"pred_0"] = preds_test

train_stacked.loc[:,"pred_1"] = clf2.predict(X_train_scaled)
test_stacked.loc[:,"pred_1"] = clf2.predict(X_test_scaled)


train_stacked.loc[:,"pred_2"] = clf3.predict(X_train)
test_stacked.loc[:,"pred_2"] = clf3.predict(X_test)



#2. Generate features with prev and next
# train_stacked = get_stacked_features(train_stacked, [0,1])
# test_stacked = get_stacked_features(test_stacked, [0,1])



# X_train2, y_train2, features_names2 = getXY(train_stacked, desc_columns, names=True)
features_names2 = ["pred_0","pred_1","pred_2"]
X_train2, y_train2 = getXY(train_stacked,desc_columns, includeColumns= features_names2)
X_test2, y_test2 = getXY(test_stacked, desc_columns, includeColumns= features_names2)

#5. Train the new classifier ... muhahaha!
# parameters = {
#     "n_estimators": [200],
#     "max_depth": [4], #[4,5,8,10,20],
#     "min_samples_split" : [4], # [2,4,8,10],
#     "max_features": [0.5] #[.8,.5,.3,.1]
# }
# rfc2 = RandomForestClassifier(random_state= 233, n_jobs=4)
# clf2 = train_score(rfc2, parameters, X_train2, y_train2, X_test2, y_test2)


from sklearn.linear_model import LogisticRegression
parameters = {
    "C":[0.1] #[10, 1.0, 0.1, 0.01]
}
lrc = LogisticRegression(penalty="l2", random_state=238, n_jobs=4, solver="lbfgs", multi_class="multinomial")
clf_lvl2 = train_score(lrc, parameters, X_train2, y_train2, X_test2, y_test2)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] C=0.1 ...........................................................
[CV] C=0.1 ...........................................................


  **self._backend_args)


[CV] C=0.1 ...........................................................


  **self._backend_args)


[CV] C=0.1 ...........................................................


  **self._backend_args)
  **self._backend_args)


[CV] ............................................ C=0.1, total=   0.8s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=   0.8s
[CV] ............................................ C=0.1, total=   0.9s
[CV] ............................................ C=0.1, total=   1.0s
[CV] ............................................ C=0.1, total=   0.6s


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    1.5s finished


0.962174274929
{'C': 0.1}
[ 0.96224019]
[ 0.96217427]
- Train Results -
             precision    recall  f1-score   support

          0       0.98      0.96      0.97       981
          1       1.00      0.91      0.95       614

avg / total       0.99      0.94      0.96      1595

- Test Results -
             precision    recall  f1-score   support

          0       0.87      0.79      0.83       252
          1       0.90      0.58      0.71       161

avg / total       0.88      0.71      0.78       413



In [51]:
train_stacked["s_preds"] = clf_lvl2.predict(X_train2)
test_stacked["s_preds"] = clf_lvl2.predict(X_test2)
# ["iob_tag"].value_counts()
test_stacked[(test_stacked["iob"]==1)&(test_stacked["s_preds"]==2)]\
[["art_id","word","iob_tag","in_gazette","iob","s_preds"]]

Unnamed: 0,art_id,word,iob_tag,in_gazette,iob,s_preds
28,3,a,I-Zone,1,1,2
29,3,la,I-Zone,1,1,2
30,3,Madre,I-Zone,1,1,2
921,32,de,I-City,1,1,2
1562,66,California,I-State,1,1,2
1610,66,de,I-City,1,1,2
1614,66,UU,I-Country,1,1,2
1636,66,UU,I-Country,1,1,2
1706,78,de,I-Zone,1,1,2
2227,188,Lindo,I-Col,1,1,2


In [None]:
test_stacked["in_gazette"].value_counts()

In [25]:
def extract_entities(words, column):
    data = []
    for index, row in words.iterrows():
        if row[column] == 0:
            data.append([
                row.art_id,
                row.sent_id,
                row.cs_id,
                row.pos,
                row.word
            ])
        elif row[column] == 1:
            if len(data) > 0:
                data[-1][4] += " " + row.word

    df = pd.DataFrame(data, columns=["art_id","sent_id","cs_id","pos","entity"])

    return df

In [26]:
def score_entities(true, pred):

    # Find the true positives
    true_positive = 0
    for index, entity in pred.iterrows():
        result = true[(true["cs_id"] == entity.cs_id)
                      & (true["pos"] == entity.pos)
                      & (true["entity"] == entity.entity)
                     ]
        if len(result) > 0:
            true_positive += 1


    print "true positives: %i" %true_positive
    print "predicted positives: %i" % pred.shape[0]
    print "real positives: %i" % true.shape[0]
    
    precision = true_positive * 1.0 / pred.shape[0]
    print "precision: %0.4f" %precision
    recall = true_positive * 1.0 / true.shape[0]
    print "recall: %0.4f" %recall

    fscore = (precision + recall) / 2.0
    print "fscores: %0.4f" %fscore

    return {
        "precision": precision,
        "recall": recall,
        "fscore": fscore
    }

In [52]:
true_entities = extract_entities(test_stacked, "iob")
pred_entities = extract_entities(test_stacked, "s_preds")
score_entities(true_entities, pred_entities)



true positives: 172
predicted positives: 229
real positives: 252
precision: 0.7511
recall: 0.6825
fscores: 0.7168


{'fscore': 0.7168156927982255,
 'precision': 0.7510917030567685,
 'recall': 0.6825396825396826}

Best Score: {'fscore': 0.7168156927982255,
 'precision': 0.7510917030567685,
 'recall': 0.6825396825396826}

# dummy with NB

In [53]:
#1.5 add dummy variables
train_copy = train_df.copy()
test_copy = test_df.copy()

prev_dummies = pd.get_dummies(train_copy["prev_1"], prefix="prev_1")
train_copy = pd.concat([train_copy, prev_dummies], axis=1)
train_copy.loc[:,"prev_1_14"] = np.zeros(train_copy.shape[0])

prev_dummies = pd.get_dummies(test_copy["prev_1"], prefix="prev_1")
test_copy = pd.concat([test_copy, prev_dummies], axis=1)
test_copy.loc[:,"prev_1_10"] = np.zeros(test_copy.shape[0])
test_copy.loc[:,"prev_1_20"] = np.zeros(test_copy.shape[0])

In [78]:
from sklearn.naive_bayes import MultinomialNB
parameters = {}

X_train3, y_train3, features_names = getXY(train_copy, desc_columns, names=True)
X_test3, y_test3 = getXY(test_copy, desc_columns)

mnbc = MultinomialNB()

clf3 = train_score(mnbc, parameters, X_train3, y_train3, X_test3, y_test3)

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 3839: ordinal not in range(128)

In [68]:
train_copy.isnull().any().any()

False