# Introduction
This notebook is to test the classifier training phase of the NER Identifier.

### Notes to self...
- re-introduce I O B ...  it seems that the classifier has problem to distinguish "de" 
- Specialized classifiers by geo-type

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, make_scorer, classification_report, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [65]:
train_df = pd.read_csv("../files/ner_features_train.csv")
test_df = pd.read_csv("../files/ner_features_test.csv")

In [66]:
train_df.head()

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type,upper,...,triggerbarrio1,triggerbarrio2,triggerresidencial1,triggerresidencial2,trigger_1,trigger_2,sent_size,in_Country,in_State,in_City
0,3,0,0,0,SAN,1,B-City,0,City,1,...,0,0,0,0,0,0,3,0,0,1
1,3,0,0,1,PEDRO,22,I-City,0,City,1,...,0,0,0,0,0,0,3,0,0,1
2,3,0,0,2,SULA,22,I-City,0,City,1,...,0,0,0,0,0,0,3,0,0,1
3,3,1,1,0,-,13,none,1,none,0,...,0,0,0,0,0,0,32,0,0,0
4,3,1,1,1,Hasta,34,none,1,none,1,...,0,0,0,0,0,0,32,0,0,0


In [4]:
train_df.trigger_1.value_counts()

0    47376
1     7997
Name: trigger_1, dtype: int64

# Basic processing functions

In [5]:
def addNewFeatures(df):
    df.loc[:,"in_gazette"] = ((df["in_Country"] | df["in_State"] | df["in_City"]) & df["upper"])\
            .apply(lambda x: int(not x)).tolist()
    return df

In [6]:
# formats in X and y
def getXY(df, dropColumns, includeColumns=None, names=False):
#     df = addNewFeatures(df)
    if includeColumns == None:
        X = df.drop(dropColumns, 1).values
    else:
        X = df[includeColumns].values
    y = df.iob.values
    # print "-- iob counts --"
    # print df.iob.value_counts()
    if names:
        features_names = df.drop(dropColumns, 1).columns
        return X, y, features_names
    return X, y

In [7]:
# add stacked features 
def add_stacked_feature(clf_ix, pos, df):
    preds = df["pred_"+str(clf_ix)].values.tolist()
    # decide column name
    if pos < 0:
        column = "pred" + str(clf_ix) + "_prev_" + str(-pos)
    elif pos > 0:
        column = "pred" + str(clf_ix) + "_next_" + str(pos)
    else:
        print "- ERROR: 0 value passed"
        return None
    # shift list
    df.loc[:,column ] = preds[pos:] + preds[:pos]
    # correct the values for the first words
    # if pos < 0:
    #     df.at[df[column] < -pos,column] = 1
   
    return df

# produces features from the predictions
def get_stacked_features(df, lvl_1_classifiers):
    for i in range(len(lvl_1_classifiers)):
        df = add_stacked_feature(i, -1, df)
        df = add_stacked_feature(i, -2, df)
        df = add_stacked_feature(i, 1, df)
        df = add_stacked_feature(i, 2, df)

    return df

In [8]:
#trains a model and scores it
def train_score(model, params, X_train, y_train, X_test, y_test,labels=[0]):
    f_one_scorer = make_scorer(f1_score,labels=labels, average="weighted")
    clf = GridSearchCV(model, params, cv=5, scoring= f_one_scorer, verbose=1, n_jobs=4 )
    clf.fit(X_train, y_train)

    print clf.best_score_
    print clf.best_params_
    print clf.cv_results_['mean_train_score']
    print clf.cv_results_['mean_test_score']
        
    print "- Train Results -"
    preds_train = clf.predict(X_train)
    print classification_report(y_train, preds_train )
    
    print "- Test Results -"
    preds_test = clf.predict(X_test)
    print classification_report(y_test, preds_test)
    
    return clf

## TODO LIST



- Dummy pos tags (even for pre and next)

- sentece Size
- previous word is uppercase
- feature selection L1
- Try KNN  (using minmax)
- Try SVC  (using minmax)
- Proba results?

- Try HMM

# Quickly test the Random Forest classifier

In [10]:
desc_columns = ["art_id","sent_id", "cs_id","word", "iob_tag","iob" ,"geo_type",
#                 "first", "first_sent", "next_2", "pos_tag", "next_1", "prev_2"
               ]
X_train, y_train, features_names = getXY(train_df, desc_columns, names=True)
X_test, y_test = getXY(test_df, desc_columns)

In [13]:
parameters = {
    "n_estimators": [200],
    "max_depth": [20], #[3,4,5,8,10,20],
    "min_samples_split" : [4],
    "max_features": [.3] #[.8,.5,.3,.1]
}
rfc = RandomForestClassifier(random_state= 233, n_jobs=4)
clf = train_score(rfc, parameters, X_train, y_train, X_test, y_test)
# 0.87      0.74      0.80       368  max depth 20 original features
# 0.91      0.75      0.82       413  new features and distribution

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   38.7s finished


0.789797319676
{'max_features': 0.3, 'min_samples_split': 4, 'n_estimators': 200, 'max_depth': 20}
[ 0.92690897]
[ 0.78979732]
- Train Results -
             precision    recall  f1-score   support

          0       0.98      0.87      0.92      1646
          1       1.00      1.00      1.00     53727

avg / total       1.00      1.00      1.00     55373

- Test Results -
             precision    recall  f1-score   support

          0       0.90      0.81      0.85       356
          1       1.00      1.00      1.00     13514

avg / total       0.99      0.99      0.99     13870



In [14]:
pd.DataFrame({
    "names": features_names,
    "importance": clf.best_estimator_.feature_importances_
}).sort_values(by="importance", ascending=False)

Unnamed: 0,importance,names
29,0.274955,in_City
2,0.125079,upper
27,0.077017,in_Country
0,0.071036,pos
28,0.061691,in_State
26,0.046094,sent_size
7,0.039983,size
8,0.035229,prev_1
18,0.033102,triggercolonia1
9,0.031774,prev_2


# Test Second level (stacked) Algorithm
This algorithm uses the results of the first one as features

In [15]:
#1. get preds and add them to the original features
train_stacked = train_df.copy()
test_stacked = test_df.copy()
preds_train = clf.predict(X_train)
preds_test = clf.predict(X_test)

train_stacked.loc[:,"pred_0"] = preds_train
test_stacked.loc[:,"pred_0"] = preds_test


#2. Generate features with prev and next
train_stacked = get_stacked_features(train_stacked, [0])
test_stacked = get_stacked_features(test_stacked, [0])



X_train2, y_train2, features_names2 = getXY(train_stacked, desc_columns, names=True)
X_test2, y_test2 = getXY(test_stacked, desc_columns, includeColumns= features_names2.tolist())

#5. Train the new classifier ... muhahaha!
parameters = {
    "n_estimators": [200],
    "max_depth": [4], #[4,5,8,10,20],
    "min_samples_split" : [4], # [2,4,8,10],
    "max_features": [0.5] #[.8,.5,.3,.1]
}
rfc2 = RandomForestClassifier(random_state= 233, n_jobs=4)
clf2 = train_score(rfc2, parameters, X_train2, y_train2, X_test2, y_test2)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   31.6s finished


0.920223658742
{'max_features': 0.5, 'min_samples_split': 4, 'n_estimators': 200, 'max_depth': 4}
[ 0.92056204]
[ 0.92022366]
- Train Results -
             precision    recall  f1-score   support

          0       0.98      0.87      0.92      1646
          1       1.00      1.00      1.00     53727

avg / total       1.00      1.00      1.00     55373

- Test Results -
             precision    recall  f1-score   support

          0       0.90      0.81      0.85       356
          1       1.00      1.00      1.00     13514

avg / total       0.99      0.99      0.99     13870



In [16]:
train_stacked["s_preds"] = clf2.predict(X_train2)
test_stacked["s_preds"] = clf2.predict(X_test2)

#just looking which ones are the errors
test_stacked[(test_stacked["iob"]==0)&(test_stacked["s_preds"]==1)]\
["iob_tag"].value_counts()
# [["art_id","word","iob_tag","in_gazette","iob","s_preds"]]

I-Col      14
B-Col      14
I-City     11
B-Zone     10
I-Zone      9
B-City      7
B-State     1
I-State     1
Name: iob_tag, dtype: int64

In [54]:
def extract_entities(words, io_column, geo_column):
    data = []
    
    last_artid = -1
    last_io = 1
    
    for index, row in words.iterrows():
        if (row[io_column]==0 and last_io == 1) or row.art_id != last_artid:
            data.append([
                row.art_id,
                row.sent_id,
                row.cs_id,
                row.pos,
                row.word,
                row[geo_column]
            ])
        elif row[io_column]==0 and last_io == 0:
#             if len(data) > 0:
            data[-1][4] += " " + row.word
        last_artid = row.art_id
        last_io = row[io_column]
        

    df = pd.DataFrame(data, columns=["art_id","sent_id","cs_id","pos","entity","geo_type"])

    return df

In [62]:
# true_entities = extract_entities(train_stacked, "iob", "geo_type")
# pred_entities = extract_entities(final_train, "s_preds", "pred")
# print true_entities.head()
# print pred_entities.head()
pred_entities
final_train

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type,upper,...,in_Country,in_State,in_City,pred_0,pred0_prev_1,pred0_prev_2,pred0_next_1,pred0_next_2,s_preds,pred
0,3,0,0,0,SAN,1,B-City,0,City,1,...,0,0,1,0,1,1,0,0,0,City
29,3,1,1,26,San,1,B-City,0,City,1,...,0,0,1,0,1,1,0,0,0,City
48,3,2,2,13,San,1,B-City,0,City,1,...,0,0,1,0,1,1,0,0,0,City
89,3,3,3,38,Lomas,22,B-Col,0,Col,1,...,0,0,0,0,1,1,0,1,0,Col
202,5,0,10,0,TEGUCIGALPA,22,B-City,0,City,1,...,0,0,1,0,1,1,1,1,0,City
407,9,0,21,0,TEGUCIGALPA,22,B-City,0,City,1,...,0,0,1,0,1,1,1,1,0,City
591,9,8,29,9,Kennedy,22,B-Col,0,Col,1,...,0,0,0,0,1,1,1,1,0,Col
623,9,8,29,41,Distrito,22,B-City,0,City,1,...,0,0,1,0,1,1,0,1,0,City
917,11,0,40,0,PUERTO,22,B-City,0,City,1,...,0,0,1,0,1,1,0,1,0,City
920,11,0,40,3,Cortés,1,B-State,0,State,1,...,0,1,0,0,1,0,1,1,0,State


In [51]:
def score_entities(true, pred):
    geo_types =["Country","State","City","Zone","Col","Bar"]
    
    data = []
    for geo_type in geo_types:
        # Find the true positives
        true_positive = 0
        for index, entity in pred[pred["geo_type"]==geo_type].iterrows():
            result = true[ (true["geo_type"] == geo_type)
                          & (true["cs_id"] == entity.cs_id)
                          & (true["pos"] == entity.pos)
                          & (true["entity"] == entity.entity)
                         ]
            if len(result) > 0:
                true_positive += 1

        support = true[true["geo_type"]==geo_type].shape[0]
#         print "true positives: %i" %true_positive
#         print "predicted positives: %i" % pred[pred["geo_type"]==geo_type].shape[0]
#         print "real positives: %i" % support
        if true_positive == 0:
            precision =0.0
            recall = 0.0
            fscore = 0.0
        else:
            precision = true_positive * 1.0 / pred[pred["geo_type"]==geo_type].shape[0]
#         print "precision: %0.4f" %precision
            recall = true_positive * 1.0 / support
#         print "recall: %0.4f" %recall

        
            fscore = (precision * recall)*2.0 / (precision + recall)
#         print "fscores: %0.4f" %fscore
        
        data.append([geo_type,precision,recall,fscore,support ])
        
    scores_df = pd.DataFrame(data,columns=["geo_type","precision","recall","fscore","support"])
    
   
    f1_result = (scores_df["fscore"] * scores_df["support"]).sum() / scores_df["support"].sum()
    
 

    return scores_df, f1_result

## 3rd Level ....

Before, the 3rd level classified the identified NE parts into <I|O|B>-<category> ... this caused problems since the classifier sometimes confuses the B with an I and viceversa, generating some entities without any B :( 

For that reason, I'm generating a new preprocessing function that follows the IOB rule and the classifier will only classify the geo-type and nothing else, since the other 2 levels are devoted to identify the entity! :)


In [20]:
train_stacked.head()

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type,upper,...,sent_size,in_Country,in_State,in_City,pred_0,pred0_prev_1,pred0_prev_2,pred0_next_1,pred0_next_2,s_preds
0,3,0,0,0,SAN,1,B-City,0,City,1,...,3,0,0,1,0,1,1,0,0,0
1,3,0,0,1,PEDRO,22,I-City,0,City,1,...,3,0,0,1,0,0,1,0,1,0
2,3,0,0,2,SULA,22,I-City,0,City,1,...,3,0,0,1,0,0,0,1,1,0
3,3,1,1,0,-,13,none,1,none,0,...,32,0,0,0,1,0,0,1,1,1
4,3,1,1,1,Hasta,34,none,1,none,1,...,32,0,0,0,1,1,0,1,1,1


In [46]:
def getOnlyB(df):
    temp_df = df.copy()
    temp_df.loc[:,"B"] = np.zeros(temp_df.shape[0])
    last = 1
    last_artid = -1
    for idx, word in temp_df.iterrows():        
        if ( last == 1 and word.s_preds == 0 ) or last_artid != word.art_id :
            temp_df.loc[idx,"B"] = 1
        last = word.s_preds
        last_artid = word.art_id
    
    return temp_df[temp_df["B"]==1].drop("B",1)
            
getOnlyB(train_stacked).head()

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type,upper,...,sent_size,in_Country,in_State,in_City,pred_0,pred0_prev_1,pred0_prev_2,pred0_next_1,pred0_next_2,s_preds
0,3,0,0,0,SAN,1,B-City,0,City,1,...,3,0,0,1,0,1,1,0,0,0
29,3,1,1,26,San,1,B-City,0,City,1,...,32,0,0,1,0,1,1,0,0,0
48,3,2,2,13,San,1,B-City,0,City,1,...,16,0,0,1,0,1,1,0,0,0
89,3,3,3,38,Lomas,22,B-Col,0,Col,1,...,42,0,0,0,0,1,1,0,1,0
202,5,0,10,0,TEGUCIGALPA,22,B-City,0,City,1,...,1,0,0,1,0,1,1,1,1,0


In [48]:
# Try to classify the preds 
# in this case 0 means it's part of an entity
# cat_train = train_stacked[train_stacked["s_preds"]==0]
# cat_test = test_stacked[test_stacked["s_preds"]==0]

cat_train = getOnlyB(train_stacked)
cat_test = getOnlyB(test_stacked)

X_train3, _, features_names3 = getXY(cat_train, desc_columns, names=True)
X_test3, _ = getXY(cat_test, desc_columns, includeColumns= features_names3.tolist())

y_train3 = cat_train.iob_tag.apply(lambda x: x[2:]).values
y_test3 = cat_test.iob_tag.apply(lambda x: x[2:]).values

#5. Train the new classifier ... muhahaha!
parameters = {
    "n_estimators": [200],
    "max_depth": [20], #[4,5,8,10,20],
    "min_samples_split" : [4], # [2,4,8,10],
    "max_features": [0.5] #[.8,.5,.3,.1]
}
rfc3 = RandomForestClassifier(random_state= 233, n_jobs=4)
clf3 = train_score(rfc3, parameters, X_train3, y_train3, X_test3, y_test3, labels=None)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    4.5s finished


0.858973139649
{'max_features': 0.5, 'min_samples_split': 4, 'n_estimators': 200, 'max_depth': 20}
[ 0.98493593]
[ 0.85897314]
- Train Results -
             precision    recall  f1-score   support

        Bar       1.00      1.00      1.00        24
       City       0.96      1.00      0.98       407
        Col       1.00      0.98      0.99       117
    Country       0.99      1.00      1.00       143
      State       0.99      0.97      0.98       151
       Zone       0.99      0.99      0.99       100
         ne       1.00      0.95      0.98       188

avg / total       0.98      0.98      0.98      1130

- Test Results -
             precision    recall  f1-score   support

        Bar       1.00      1.00      1.00         3
       City       0.76      0.97      0.85        93
        Col       0.93      0.82      0.87        34
    Country       0.97      1.00      0.99        39
      State       0.78      0.88      0.82        32
       Zone       0.69      0.90      0

In [52]:
preds_train = clf3.predict(X_train3)
preds_test = clf3.predict(X_test3)

In [64]:


geo_preds_train = pd.Series(preds_train, index = cat_train.index )
geo_preds_test = pd.Series(preds_test, index = cat_test.index )

final_train = train_stacked.copy()
final_test = test_stacked.copy()

final_train.loc[:,"pred_geo"] = geo_preds_train
final_test.loc[:,"pred_geo"] = geo_preds_test

# final_train.loc[:,"pred_geo"] = final_train.pred_geo.apply(lambda x: x if type(x)==str else "none" )



#Train
print "--Train"
true_entities = extract_entities(train_stacked, "iob", "geo_type")
pred_entities = extract_entities(final_train, "s_preds", "pred_geo")
print score_entities(true_entities, pred_entities)

#Train
print "--Test"
true_entities = extract_entities(test_stacked, "iob", "geo_type")
pred_entities = extract_entities(final_test, "s_preds", "pred_geo")
print score_entities(true_entities, pred_entities)

--Train
(  geo_type  precision    recall    fscore  support
0  Country   0.993056  0.986207  0.989619      145
1    State   0.952703  0.886792  0.918567      159
2     City   0.930952  0.937650  0.934289      417
3     Zone   0.610000  0.504132  0.552036      121
4      Col   0.782609  0.692308  0.734694      130
5      Bar   1.000000  0.960000  0.979592       25, 0.8685475158988629)
--Test
(  geo_type  precision    recall    fscore  support
0  Country   0.975000  0.951220  0.962963       41
1    State   0.750000  0.818182  0.782609       33
2     City   0.686441  0.852632  0.760563       95
3     Zone   0.615385  0.421053  0.500000       19
4      Col   0.766667  0.547619  0.638889       42
5      Bar   1.000000  1.000000  1.000000        3, 0.7592035317514898)


In [43]:
df_test,_ = score_entities(true_entities, pred_entities)
df_test

Unnamed: 0,geo_type,precision,recall,fscore,support
0,Country,1.0,0.975,0.987342,40
1,State,1.0,0.911765,0.953846,34
2,City,0.819149,0.747573,0.781726,103
3,Zone,0.888889,0.333333,0.484848,24
4,Col,0.833333,0.78125,0.806452,32
5,Bar,1.0,1.0,1.0,2


In [42]:
df_test.to_dict(orient='records')

[{'fscore': 0.9873417721518987,
  'geo_type': 'Country',
  'precision': 1.0,
  'recall': 0.975,
  'support': 40},
 {'fscore': 0.9538461538461539,
  'geo_type': 'State',
  'precision': 1.0,
  'recall': 0.9117647058823529,
  'support': 34},
 {'fscore': 0.781725888324873,
  'geo_type': 'City',
  'precision': 0.8191489361702128,
  'recall': 0.7475728155339806,
  'support': 103},
 {'fscore': 0.48484848484848486,
  'geo_type': 'Zone',
  'precision': 0.8888888888888888,
  'recall': 0.3333333333333333,
  'support': 24},
 {'fscore': 0.8064516129032259,
  'geo_type': 'Col',
  'precision': 0.8333333333333334,
  'recall': 0.78125,
  'support': 32},
 {'fscore': 1.0,
  'geo_type': 'Bar',
  'precision': 1.0,
  'recall': 1.0,
  'support': 2}]

# Test the Correction Algorithm
This algorithm assumes that all of the Positives from the first model are correct because of its high precision ( which should be around .9 )... Therefore, the only words that need correction are the False Negatives that cause the low recall. So by removing the correct answers we're gonna train a model to distinguish which words are false positives and should be corrected by using the previous algorithm results.  For instance, intuitively, a word that has it first letter capitalized and is next to another word that is an entity would most probably be also an entity.

In [17]:
#1. get preds and add them to the original features
train_stacked2 = train_df.copy()
test_stacked2 = test_df.copy()
preds_train = clf.predict(X_train)
preds_test = clf.predict(X_test)

train_stacked.loc[:,"pred_0"] = preds_train
test_stacked.loc[:,"pred_0"] = preds_test

#1.5 add dummy variables
# prev_dummies = pd.get_dummies(train_stacked["prev_1"], prefix="prev_1")
# train_stacked = pd.concat([train_stacked, prev_dummies], axis=1)
# train_stacked.loc[:,"prev_1_14"] = np.zeros(train_stacked.shape[0])

# prev_dummies = pd.get_dummies(test_stacked["prev_1"], prefix="prev_1")
# test_stacked = pd.concat([test_stacked, prev_dummies], axis=1)
# test_stacked.loc[:,"prev_1_10"] = np.zeros(test_stacked.shape[0])
# test_stacked.loc[:,"prev_1_20"] = np.zeros(test_stacked.shape[0])




#2. Generate features with prev and next
train_stacked = get_stacked_features(train_stacked, [0])
test_stacked = get_stacked_features(test_stacked, [0])

#3. Filter only the ones that aren't positive i.e. only pred_0 == 1
train_stacked = train_stacked[train_stacked["pred_0"]==1]
test_stacked = test_stacked[test_stacked["pred_0"]==1]
# train_stacked[train_stacked["iob"]==0]


#4.Remove some of the features
desc_columns2 = desc_columns + [ 
#     "in_City", "in_Country", "pred_0","in_State", #"prev_1"
]
X_train2, y_train2, features_names2 = getXY(train_stacked, desc_columns2, names=True)
X_test2, y_test2 = getXY(test_stacked, desc_columns2, includeColumns= features_names2.tolist())

#5. Train the new classifier ... muhahaha!
parameters = {
    "n_estimators": [200],
    "max_depth": [10],#,4,5,8,10,20],
    "min_samples_split" : [4]
}
rfc2 = RandomForestClassifier(random_state= 233, n_jobs=4)
clf2 = train_score(rfc2, parameters, X_train2, y_train2, X_test2, y_test2)


#  0.81      0.18      0.29   original
# 0.81      0.18      0.29    with restrictions
# 0.63      0.26      0.37    with max_depth 20
# 0.63      0.28      0.39    with max_depth 40

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   20.6s finished


0.2862342677
{'min_samples_split': 4, 'n_estimators': 200, 'max_depth': 10}
[ 0.73958]
[ 0.28623427]
- Train Results -
             precision    recall  f1-score   support

          0       1.00      0.55      0.71        95
          1       1.00      1.00      1.00     53773

avg / total       1.00      1.00      1.00     53868

- Test Results -
             precision    recall  f1-score   support

          0       0.80      0.04      0.07       104
          1       0.99      1.00      1.00     13417

avg / total       0.99      0.99      0.99     13521



In [87]:
pd.DataFrame({
    "names": features_names2,
    "importance": clf2.best_estimator_.feature_importances_
}).sort(columns="importance", ascending=False)[:20]



Unnamed: 0,importance,names
0,0.181539,pos
2,0.134722,upper
4,0.11486,size
7,0.103601,next_1
8,0.10203,next_2
6,0.087242,prev_2
1,0.04834,pos_tag
63,0.032789,pred0_prev_2
44,0.030236,prev_1_34
65,0.020145,pred0_next_2


In [77]:
train_stacked = train_df.copy()
test_stacked = test_df.copy()

prev_dummies = pd.get_dummies(train_stacked["prev_1"], prefix="prev_1")
train_stacked = pd.concat([train_stacked, prev_dummies], axis=1)
print prev_dummies.columns
prev_dummies = pd.get_dummies(test_stacked["prev_1"], prefix="prev_1")
test_stacked = pd.concat([test_stacked, prev_dummies], axis=1)
prev_dummies.loc[:,"prev_1_10"] = np.zeros(test_stacked.shape[0])
print prev_dummies.columns

test_stacked.shape

Index([u'prev_1_0', u'prev_1_1', u'prev_1_2', u'prev_1_3', u'prev_1_4',
       u'prev_1_5', u'prev_1_6', u'prev_1_7', u'prev_1_8', u'prev_1_9',
       u'prev_1_10', u'prev_1_11', u'prev_1_12', u'prev_1_13', u'prev_1_15',
       u'prev_1_16', u'prev_1_17', u'prev_1_18', u'prev_1_19', u'prev_1_20',
       u'prev_1_21', u'prev_1_22', u'prev_1_23', u'prev_1_24', u'prev_1_25',
       u'prev_1_26', u'prev_1_27', u'prev_1_28', u'prev_1_29', u'prev_1_30',
       u'prev_1_31', u'prev_1_32', u'prev_1_33', u'prev_1_34', u'prev_1_35',
       u'prev_1_36', u'prev_1_37', u'prev_1_38', u'prev_1_39', u'prev_1_40',
       u'prev_1_41', u'prev_1_42', u'prev_1_43', u'prev_1_44', u'prev_1_45',
       u'prev_1_46', u'prev_1_47', u'prev_1_48', u'prev_1_49', u'prev_1_50'],
      dtype='object')
Index([u'prev_1_0', u'prev_1_1', u'prev_1_2', u'prev_1_3', u'prev_1_4',
       u'prev_1_5', u'prev_1_6', u'prev_1_7', u'prev_1_8', u'prev_1_9',
       u'prev_1_11', u'prev_1_12', u'prev_1_13', u'prev_1_14', u'prev_1_1

(16601, 71)

# Test the rule based correction Algorithm

In [23]:
#1. get preds and add them to the original features
train_stacked = train_df.copy()
test_stacked = test_df.copy()
preds_train = clf.predict(X_train)
preds_test = clf.predict(X_test)

train_stacked.loc[:,"pred_0"] = preds_train
test_stacked.loc[:,"pred_0"] = preds_test

#2. Generate features with prev and next
train_stacked = get_stacked_features(train_stacked, [0])
test_stacked = get_stacked_features(test_stacked, [0])

#3. Filter only the ones that aren't positive i.e. only pred_0 == 1
train_stacked = train_stacked[train_stacked["pred_0"]==1]
test_stacked = test_stacked[test_stacked["pred_0"]==1]
# train_stacked[train_stacked["iob"]==0]

#4. Simple rule based classifier
def rbc_predict(df):
    preds = []
    for _, row in df.iterrows():
#         if row.upper == 1 and row.pred0_prev_1 == 0 and row.pos>0:
#             preds.append(0)
        if row.pred0_prev_1 == 0 and row.pred0_next_1 == 0 and row.word=="de":
            preds.append(0)
        else:
            preds.append(1)
    return preds

preds_train2 = rbc_predict(train_stacked)
preds_test2 = rbc_predict(test_stacked)
            
print len(y_train2)
print len(preds_train2)
print classification_report(train_stacked["iob"].values, preds_train2  )
print classification_report(test_stacked["iob"].values, preds_test2 )


55387
53868
             precision    recall  f1-score   support

          0       0.15      0.04      0.07        95
          1       1.00      1.00      1.00     53773

avg / total       1.00      1.00      1.00     53868

             precision    recall  f1-score   support

          0       0.50      0.02      0.04       104
          1       0.99      1.00      1.00     13417

avg / total       0.99      0.99      0.99     13521



In [24]:
train_stacked.loc[:,"pred"] = preds_train2
train_stacked[train_stacked["pred"]==0][["art_id","sent_id","pos","upper","pred0_prev_1","pred0_next_1","word","iob","pred"]]

Unnamed: 0,art_id,sent_id,pos,upper,pred0_prev_1,pred0_next_1,word,iob,pred
4474,35,9,14,0,0,0,de,1,0
13818,434,1,14,0,0,0,de,1,0
15921,495,4,26,0,0,0,de,0,0
19299,643,6,16,0,0,0,de,0,0
22977,844,0,23,0,0,0,de,1,0
23111,844,6,33,0,0,0,de,1,0
23540,865,17,20,0,0,0,de,1,0
24241,915,17,41,0,0,0,de,1,0
26331,1008,0,31,0,0,0,de,1,0
28460,1112,0,1,0,0,0,de,1,0


In [123]:
train_stacked = train_df.copy()
test_stacked = test_df.copy()
preds_train = clf.predict(X_train)
preds_test = clf.predict(X_test)

train_stacked.loc[:,"pred_0"] = preds_train
test_stacked.loc[:,"pred_0"] = preds_test

#2. Generate features with prev and next
train_stacked = get_stacked_features(train_stacked, [0])
test_stacked = get_stacked_features(test_stacked, [0])

train_stacked.head(10)
train_stacked.loc[21170:21172]

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type,upper,...,prev_prefix_1,prev_prefix_2,in_Country,in_State,in_City,pred_0,pred0_prev_1,pred0_prev_2,pred0_next_1,pred0_next_2
21170,671,0,754,42,Suyapa,22,I-City,0,City,1,...,0,1,0,0,0,0,0,1,1,0
21171,671,0,754,43,de,34,none,1,none,0,...,0,0,0,0,0,1,0,0,0,1
21172,671,0,754,44,Tegucigalpa,22,B-City,0,City,1,...,0,0,0,0,1,0,1,0,1,1


# Try HMM

In [170]:
import ast 
def convertToEntities(articles):
    data = []
    cs_id = -1 #corpus sentence id
    for _, article in articles.iterrows():
        sentences = article.tagged_content
        for sent_ix, sent in enumerate(sentences):
            cs_id +=1
            for pos, word in enumerate(sent):
                if word[2][0] == "B" or word[2] == "none": 
                    data.append([[
                        article.article_id, 
                        sent_ix,
                        cs_id,
                        pos,
                        word[0],
                        word[1],
                        word[2]
                    ]])
                elif word[2][0] == "I":
                    data[-1].append([
                        article.article_id, 
                        sent_ix,
                        cs_id,
                        pos,
                        word[0],
                        word[1],
                        word[2]
                    ])
    entity_df = pd.DataFrame({ 
        "entity":data
    })

    entity_df.loc[:,"cat"] = entity_df.entity.apply(lambda x: x[0][6] + "_" + str(len(x)))
    
    return entity_df

In [207]:
def convertToWords(articles):
    print "- Convert to Word tokens"
    data = []
    cs_id = -1  #corpus sentence id
    for _, article in articles.iterrows():
        sentences = article.tagged_content
        for sent_ix, sent in enumerate(sentences):
            cs_id +=1
            for pos, word in enumerate(sent):
                data.append([
                    article.article_id, 
                    sent_ix,
                    cs_id,
                    pos,
                    word[0],
                    word[1],
                    word[2]
                ])        
    df = pd.DataFrame(data, columns=["art_id","sent_id","cs_id","pos", "word","pos_tag","iob_tag"])
    
    #clean the iob
    df.loc[:,"iob_tag"] = df.iob_tag.apply(lambda x: x if x not in ["B-Misc", "I-Misc"] else "none" )
    df.loc[:,"iob_tag"] = df.iob_tag.apply(lambda x: x if x not in ["B-Res", "I-Res"] else x[:2]+"Zone" )
    # separate the geo-entity classification from the IOB tag
    df.loc[:, "iob"] = df.iob_tag.apply(lambda x: "O" if x == "none"  else x[0] )
    df.loc[:, "geo_type"] = df.iob_tag.apply(lambda x: x[2:] if x != "none"  else x )
    
    return df

In [228]:
articles_df = pd.read_csv("../files/pos_articles.csv")
articles_df.tagged_content = articles_df.tagged_content.apply(lambda x: ast.literal_eval(x))
words_df = convertToWords(articles_df)
words_df.head(10)

- Convert to Word tokens


Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type
0,3,0,0,0,SAN,AQ,B-City,B,City
1,3,0,0,1,PEDRO,NC,I-City,I,City
2,3,0,0,2,SULA,NC,I-City,I,City
3,3,1,1,0,-,Fg,none,O,none
4,3,1,1,1,Hasta,SP,none,O,none
5,3,1,1,2,el,DA,none,O,none
6,3,1,1,3,momento,NC,none,O,none
7,3,1,1,4,ocho,DN,none,O,none
8,3,1,1,5,bodegas,NC,none,O,none
9,3,1,1,6,han,VAI,none,O,none


In [229]:
def WordsToEntities(df):
    data = []
    for idx, row in df.iterrows():
        if row.iob in ["B","O"]:
            data.append([row])
        else:
            data[-1].append(row)
    
    entity_df = pd.DataFrame({ 
        "entity":data
    })

    entity_df.loc[:,"cat"] = entity_df.entity.apply(lambda x: x[0].geo_type + "_" + str(len(x)))
    entity_df.loc[:,"category"] = entity_df.entity.apply(lambda x: x[0].geo_type)
    entity_df.loc[:,"size"] = entity_df.entity.apply(lambda x: len(x))
    
    return entity_df
    
entities_df = WordsToEntities(words_df)
entities_df.head()

Unnamed: 0,entity,cat,category,size
0,"[[3, 0, 0, 0, SAN, AQ, B-City, B, City], [3, 0...",City_3,City,3
1,"[[3, 1, 1, 0, -, Fg, none, O, none]]",none_1,none,1
2,"[[3, 1, 1, 1, Hasta, SP, none, O, none]]",none_1,none,1
3,"[[3, 1, 1, 2, el, DA, none, O, none]]",none_1,none,1
4,"[[3, 1, 1, 3, momento, NC, none, O, none]]",none_1,none,1


In [226]:
low_cats = entities_df.cat.value_counts()[ entities_df.cat.value_counts() < 8 ].index.values
safety = 10
while low_cats.shape[0] > 0 and safety > 0:
    for cat in low_cats:
        entities_df.at[entities_df["cat"]==cat,"cat"] = cat[:-1] + str( int(cat[-1]) - 1 )
    low_cats = entities_df.cat.value_counts()[ entities_df.cat.value_counts() < 8 ].index.values
    safety -= 1
print safety
entities_df.cat.value_counts()

8


none_1       67240
City_1         299
Country_1      173
State_1        152
City_2         105
City_3          88
Col_2           78
Zone_2          62
Col_1           46
Zone_1          44
State_2         40
Col_3           35
City_4          20
Bar_2           19
Country_2       15
Zone_3          14
Col_4           13
Zone_4          11
Zone_5          10
Bar_1            8
Name: cat, dtype: int64

In [210]:
def EntitiesToWords(df):
    data = []
    for _, row in df.iterrows():
        for entity in row.entity:
            data.append(entity)
            
    df = pd.DataFrame(data, columns=["art_id","sent_id","cs_id","pos", "word","pos_tag","iob_tag"])
    return df

EntitiesToWords(entities_df).head()

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag
0,3,0,0,0,SAN,AQ,B-City
1,3,0,0,1,PEDRO,NC,I-City
2,3,0,0,2,SULA,NC,I-City
3,3,1,1,0,-,Fg,none
4,3,1,1,1,Hasta,SP,none


In [233]:
print entities_df.cat.value_counts() #/entities_df.shape[0]

none_1       67240
City_1         299
Country_1      173
State_1        152
City_2         105
City_3          88
Col_2           78
Zone_2          62
Col_1           46
Zone_1          44
State_2         37
Col_3           35
Bar_2           18
City_4          18
Country_2       15
Zone_3          14
Zone_4          11
Col_4           11
Bar_1            9
Zone_5           9
State_3          3
City_5           2
Col_5            2
Bar_3            1
Zone_7           1
Name: cat, dtype: int64


In [202]:
entities_df[entities_df["cat"].isin(["Bar_1","Bar_2"])]\
.entity.apply(lambda x: [w[4] + " " for w in x ])

3385               [Alvarado ]
3490               [Alvarado ]
10158            [La , Ronda ]
11644           [Barandillas ]
11829           [Barandillas ]
15060         [El , Chaparro ]
18838          [Las , Palmas ]
26098               [Cabañas ]
26380          [Las , Palmas ]
27008                 [López ]
27949       [San , Francisco ]
33791           [La , Granja ]
40260         [El , Chaparro ]
40889               [Morazán ]
40892             [Torocagua ]
56717         [Villa , Adela ]
57376            [El , Chile ]
58222    [Perpetuo , Socorro ]
58583          [La , Reforma ]
58964         [Villa , Adela ]
60630    [Perpetuo , Socorro ]
62506           [El , Centro ]
63269    [Perpetuo , Socorro ]
63382        [Las , Crucitas ]
65803        [La , Esperanza ]
66500    [Perpetuo , Socorro ]
Name: entity, dtype: object

In [201]:
entities_df[entities_df["cat"].isin(["none_2"])].entity.values

array([ [art_id        1661
sent_id          6
cs_id         2076
pos             42
word        Barrio
pos_tag         NC
iob_tag       none
iob              O
geo_type      none
Name: 57990, dtype: object, art_id       1661
sent_id         6
cs_id        2076
pos            43
word        Abajo
pos_tag        NC
iob_tag     I-Bar
iob             I
geo_type      Bar
Name: 57991, dtype: object]], dtype=object)