# TODO Notes:

- How to identify IOB chunks? How to do the chunking? How to label the chunks correctly?

# Intro

In this notebook I'll try to classify identified named entities (NEs).

The first part will be a remake of the NE identifier but with a twist, since it will prepare the way for the second part.

The second part engineers features in order to identify the type of each NE.

The third part uses other text in order to test whether the approach works with external data.

In [251]:
import nltk

from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer
from nltk.corpus import  conll2002

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# 1.  NE Identifier

## 1.1 convert the sentences into DataFrame format

In [12]:
def convert_df(sentences):
    """Flatten IOB-tagged sentences into a one-row-per-token DataFrame.

    Parameters
    ----------
    sentences : iterable of sequences of (word, tag, iob) triples
        For example the value of ``conll2002.iob_sents('esp.train')``.
        Only iteration is required, so generators are accepted as well.

    Returns
    -------
    pandas.DataFrame
        One row per token with the columns ``sentence`` (0-based number of
        the sentence), ``word``, ``tag``, ``iob`` (elements 0, 1 and 2 of the
        triple) and ``pos`` (0-based position of the token in its sentence).
    """
    # A single-argument print() prints the same text on Python 2 and 3.
    print("- convert_df")
    s_id, s_word, s_tag, s_iob, s_pos = [], [], [], [], []

    for sent_num, sent in enumerate(sentences):
        for pos, token in enumerate(sent):
            s_id.append(sent_num)
            s_word.append(token[0])
            s_tag.append(token[1])
            s_iob.append(token[2])
            s_pos.append(pos)

    return pd.DataFrame({
        "sentence": s_id,
        "word": s_word,
        "tag": s_tag,
        "iob": s_iob,
        "pos": s_pos,
    })

## 1.2 Engineer features

In [21]:
def get_ident_features(base_df):
    """Build the numeric feature table used by the NE identifier.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Token table with the columns ``iob``, ``tag``, ``word`` and ``pos``
        (the layout produced by ``convert_df``).

    Returns
    -------
    (df, le_iob, le_tag)
        ``df`` has the columns ``iob``, ``tag``, ``upper``, ``pos``,
        ``first`` and ``size`` in that order and shares ``base_df``'s index;
        ``le_iob`` / ``le_tag`` are the fitted ``LabelEncoder`` objects for
        the ``iob`` and ``tag`` columns.
    """
    print("- get features")
    # Reuse base_df's index: the Series assigned below are aligned by label,
    # so a frame with its own 0..n-1 index would get NaN for any base_df
    # whose index is not the default one.
    df = pd.DataFrame(index=base_df.index)

    # iob: integer code per IOB label
    le_iob = LabelEncoder()
    df["iob"] = le_iob.fit_transform(base_df.iob)

    # tag: integer code per tag
    le_tag = LabelEncoder()
    df["tag"] = le_tag.fit_transform(base_df.tag)

    # upper: 1 when the token starts with an uppercase character.
    # word[:1] (instead of word[0]) keeps empty tokens from raising, and the
    # int() cast matches what get_features stores in the same column.
    df["upper"] = base_df.word.apply(lambda word: int(word[:1].isupper()))

    # pos: position of the token inside its sentence
    df["pos"] = base_df.pos

    # first: 1 for the first token of a sentence
    df["first"] = (base_df.pos == 0).astype(int)

    # size: number of characters in the token
    df["size"] = base_df.word.apply(len)

    return df, le_iob, le_tag

## 1.3 Convert into binary classes

In [14]:
def convert_binary_classes(df, outside_code=8):
    """Collapse the encoded IOB labels into a binary NE / not-NE target.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table whose ``iob`` column holds integer label codes.
    outside_code : int, optional
        Code that stands for the "O" (outside) label. The default 8 is the
        position of ``'O'`` in the nine encoder classes printed in this
        notebook.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is not modified) in which ``iob`` is 0 for
        ``outside_code`` and 1 for every other code. Rows are grouped by
        their original code in ascending order, keeping their original
        relative order inside each group.
    """
    print("- convert to binary classes")

    # A stable sort reproduces the "one slice per class, concatenated" row
    # order without assigning on filtered slices (the source of pandas'
    # SettingWithCopy warning); sort_values/assign both return new frames.
    grouped = df.sort_values("iob", kind="mergesort")
    newdf = grouped.assign(iob=(grouped["iob"] != outside_code).astype(int))

    print(newdf.iob.value_counts())

    return newdf

## 1.4 Simple function to divide data into X and Y

In [15]:
def getXY(df):
    """Split a feature table into the model inputs and the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an ``iob`` column plus the feature columns.

    Returns
    -------
    (X, y)
        ``X`` is the array of every column except ``iob`` (column order is
        kept); ``y`` is the array of the ``iob`` column.
    """
    print("- get X and Y")
    # axis must be passed by keyword: pandas 2.0 removed the positional form.
    X = df.drop("iob", axis=1).values
    y = df["iob"].values

    return X, y

## 1.5 Classifier

In [35]:
def train_classifier(X,y, cv=False ):
    """Fit the random-forest NE identifier, optionally with cross-validation.

    Parameters
    ----------
    X : array-like indexable by integer arrays
        Feature matrix.
    y : array-like indexable by integer arrays
        Target labels.
    cv : bool, optional
        ``False`` (default): fit once on all of ``X``/``y`` and print the
        training score. ``True``: run a shuffled 10-fold stratified split,
        refitting on every fold and printing its train and test scores.

    Returns
    -------
    RandomForestClassifier
        The fitted estimator. With ``cv=True`` this is the estimator as
        fitted on the training part of the last fold, not on the full data.
    """
    print("- train classifier")
    clf = RandomForestClassifier(n_estimators=100, random_state=233, n_jobs=2)

    # No cross-validation requested: a single fit on everything.
    if not cv:
        clf.fit(X, y)
        print("train score: %0.6f" % clf.score(X, y))
        return clf

    # StratifiedKFold comes from the notebook's import cell.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=233)

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        print("---------")
        print("train score: %0.6f" % clf.score(X_train, y_train))
        print("test score: %0.6f" % clf.score(X_test, y_test))

    return clf

## 1.6 Train the Identifier

In [20]:

# Build the binary NE / not-NE training set from the Spanish CoNLL-2002
# training split and cross-validate the identifier on it.
df = convert_df(conll2002.iob_sents('esp.train'))
df, le_iob, le_tag = get_ident_features(df)
df = convert_binary_classes(df)
X,y = getXY(df)
# The training function defined in section 1.5 is train_classifier; the
# previous name train_identifier is not defined anywhere in this notebook.
clf = train_classifier(X, y, cv=True)

- convert_df
- get features
[u'B-LOC', u'B-MISC', u'B-ORG', u'B-PER', u'I-LOC', u'I-MISC', u'I-ORG', u'I-PER', u'O']
- convert to binary classes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


0    231920
1     32795
Name: iob, dtype: int64
- get X and Y
- train identifier
---------
train score: 0.978740
test score: 0.975899
---------
train score: 0.978480
test score: 0.978166
---------
train score: 0.978677
test score: 0.976692
---------
train score: 0.978698
test score: 0.975975
---------
train score: 0.978576
test score: 0.977297
---------
train score: 0.978795
test score: 0.975709
---------
train score: 0.978673
test score: 0.976351
---------
train score: 0.978564
test score: 0.977674
---------
train score: 0.978635
test score: 0.977485
---------
train score: 0.978539
test score: 0.978014


# 2. Classifying Identified NE

Now that the NEs have been identified, it's time to classify them. The IOB tag will be the target.
Steps:
1. Converting sentences into DF 
2. adding a new feature NE: 1 = it is an NE, 0 = it is not an NE
3. for each NE, generate features (try the simple ones first)
    **IMPORTANT:** it should be a list of NEs (or not?) because it's a classification task
4. train a classifier
5. measure results



## 2.1 Converting sentences into DF

In [213]:
# Section 2 starts again from the token table of the Spanish training split.
raw_df = convert_df(sentences=conll2002.iob_sents(fileids='esp.train'))
raw_df.head()

- convert_df


Unnamed: 0,iob,pos,sentence,tag,word
0,B-LOC,0,0,NP,Melbourne
1,O,1,0,Fpa,(
2,B-LOC,2,0,NP,Australia
3,O,3,0,Fpt,)
4,O,4,0,Fc,","


## 2.2 add new feature NE

In [214]:
# NE is 1 for every token whose IOB label is anything other than "O", else 0.
raw_df["NE"] = raw_df["iob"].ne("O").astype(int)
raw_df.head()

Unnamed: 0,iob,pos,sentence,tag,word,NE
0,B-LOC,0,0,NP,Melbourne,1
1,O,1,0,Fpa,(,0
2,B-LOC,2,0,NP,Australia,1
3,O,3,0,Fpt,),0
4,O,4,0,Fc,",",0


## 2.3 generate features 

In [215]:
def get_features(base_df):
    """Build the base feature table for the NE type classifier.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Token table with the columns ``iob``, ``NE``, ``tag``, ``word`` and
        ``pos``.

    Returns
    -------
    (df, le_iob, le_tag)
        ``df`` has the columns ``iob``, ``NE``, ``tag``, ``upper``, ``pos``,
        ``first`` and ``size`` in that order and shares ``base_df``'s index;
        ``le_iob`` / ``le_tag`` are the fitted ``LabelEncoder`` objects for
        the ``iob`` and ``tag`` columns.
    """
    print("- get features")
    # Reuse base_df's index: the Series assigned below are aligned by label,
    # so a frame with its own 0..n-1 index would get NaN for any base_df
    # whose index is not the default one.
    df = pd.DataFrame(index=base_df.index)

    # iob: integer code per IOB label (the classification target)
    le_iob = LabelEncoder()
    df["iob"] = le_iob.fit_transform(base_df.iob)

    # NE: copied as-is from base_df
    df["NE"] = base_df.NE

    # tag: integer code per tag
    le_tag = LabelEncoder()
    df["tag"] = le_tag.fit_transform(base_df.tag)

    # upper: 1 when the token starts with an uppercase character;
    # word[:1] (instead of word[0]) keeps empty tokens from raising.
    df["upper"] = base_df.word.apply(lambda word: int(word[:1].isupper()))

    # pos: position of the token inside its sentence
    df["pos"] = base_df.pos

    # first: 1 for the first token of a sentence
    df["first"] = (base_df.pos == 0).astype(int)

    # size: number of characters in the token
    df["size"] = base_df.word.apply(len)

    # NOTE: the "sentence" and "word" columns were tried here before and are
    # currently not part of the feature table.

    return df, le_iob, le_tag

# Build the base feature table and show which IOB label each code stands for.
simple_df, le_iob, le_tag = get_features(raw_df)
# Single-argument print() works on both Python 2 and Python 3.
print(le_iob.classes_)
simple_df.head()

- get features
[u'B-LOC' u'B-MISC' u'B-ORG' u'B-PER' u'I-LOC' u'I-MISC' u'I-ORG' u'I-PER'
 u'O']


Unnamed: 0,iob,NE,tag,upper,pos,first,size
0,0,1,28,1,0,1,9
1,8,0,20,0,1,0,1
2,0,1,28,1,2,0,9
3,8,0,21,0,3,0,1
4,8,0,12,0,4,0,1


In [216]:
# Scratch check of the negative-slice idiom used by the shifts below.
# `a` is not defined in any visible cell, so this cell failed on a fresh
# kernel; an explicit 0..7 list reproduces the recorded output [6, 7].
list(range(8))[-2:]

[6, 7]

In [217]:
# A simpler way, just shifting 1
def get_simpler_features(df, wrap=True):
    """Add neighbouring-token context columns and one-hot tag columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with integer ``tag`` and ``NE`` columns, one row per
        token in corpus order. ``wrap=False`` additionally needs ``pos``.
    wrap : bool, optional
        ``True`` (default): neighbours are taken by rotating the whole
        column, so they cross sentence boundaries and the first/last rows
        wrap around to the other end of the table.
        ``False``: neighbours are looked up inside each sentence only (a
        sentence starts wherever ``pos`` is 0); a missing neighbour is -1
        for the tag columns and 0 for the NE columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) with these columns
        appended in order: ``prev_1``, ``prev_2``, ``next_1``, ``next_2``
        (tag of the token 1/2 rows before/after), ``prevNE_1``,
        ``prevNE_2``, ``nextNE_1``, ``nextNE_2`` (the same for ``NE``), and
        one 0/1 integer column ``tag__<code>`` per distinct tag code.
    """
    out = df.copy()  # leave the caller's frame untouched

    if wrap:
        groups = None
    else:
        # Running sentence number: increases each time pos resets to 0.
        groups = (out["pos"] == 0).cumsum().values

    def shifted(values, periods, fill):
        """Move values down (periods > 0) or up (periods < 0) by |periods| rows."""
        values = np.asarray(values)
        if groups is None:
            return np.roll(values, periods)
        moved = pd.Series(values).groupby(groups).shift(periods)
        return moved.fillna(fill).astype(values.dtype).values

    tags = out["tag"].values
    out["prev_1"] = shifted(tags, 1, -1)
    out["prev_2"] = shifted(tags, 2, -1)
    out["next_1"] = shifted(tags, -1, -1)
    out["next_2"] = shifted(tags, -2, -1)

    nes = out["NE"].values
    out["prevNE_1"] = shifted(nes, 1, 0)
    out["prevNE_2"] = shifted(nes, 2, 0)
    out["nextNE_1"] = shifted(nes, -1, 0)
    out["nextNE_2"] = shifted(nes, -2, 0)

    # prefix "tag_" plus pandas' default "_" separator gives "tag__<code>".
    # The int cast pins the dtype, which otherwise depends on the pandas
    # version.
    tag_df = pd.get_dummies(out["tag"], prefix="tag_").astype(int)
    for column in tag_df.columns:
        out[column] = tag_df[column]

    return out
    
# Extend the base table with the neighbour and one-hot tag columns.
features_df = get_simpler_features(df=simple_df)
features_df.head()

Unnamed: 0,iob,NE,tag,upper,pos,first,size,prev_1,prev_2,next_1,...,tag__49,tag__50,tag__51,tag__52,tag__53,tag__54,tag__55,tag__56,tag__57,tag__58
0,0,1,28,1,0,1,9,19,1,20,...,0,0,0,0,0,0,0,0,0,0
1,8,0,20,0,1,0,1,28,19,28,...,0,0,0,0,0,0,0,0,0,0
2,0,1,28,1,2,0,9,20,28,21,...,0,0,0,0,0,0,0,0,0,0
3,8,0,21,0,3,0,1,28,20,12,...,0,0,0,0,0,0,0,0,0,0
4,8,0,12,0,4,0,1,21,28,58,...,0,0,0,0,0,0,0,0,0,0


## 2.4 Train a classifier
### 2.4.1 Prepare data

In [218]:
# Keep only the rows with NE == 1: this stage classifies the type of each
# named entity, so the other tokens are left out.
ne_rows = features_df[features_df["NE"] == 1]

# axis must be passed by keyword: pandas 2.0 removed the positional form.
X = ne_rows.drop(["NE", "iob", "tag"], axis=1).values
y = ne_rows["iob"].values

In [254]:
def train_classifier2(X,y, cv=False, report = False, max_folds=1):
    """Fit the random-forest NE type classifier, optionally with CV.

    Parameters
    ----------
    X : array-like indexable by integer arrays
        Feature matrix.
    y : array-like indexable by integer arrays
        Target labels.
    cv : bool, optional
        ``False`` (default): fit once on all of ``X``/``y`` and print the
        training score. ``True``: evaluate on folds of a shuffled 10-fold
        stratified split, printing train and test scores per fold.
    report : bool, optional
        With ``cv=True``, also print the confusion matrix and the
        classification report for each evaluated fold.
    max_folds : int or None, optional
        Number of folds to evaluate when ``cv=True``. The default 1 keeps
        the previous behaviour (the loop stopped after its first fold);
        pass ``None`` to run all ten.

    Returns
    -------
    RandomForestClassifier
        The fitted estimator. With ``cv=True`` this is the estimator as
        fitted on the training part of the last evaluated fold, not on the
        full data.
    """
    print("- train classifier")
    clf = RandomForestClassifier(n_estimators=300, random_state=233, n_jobs=4,
                                 max_depth=16, min_samples_split=10)

    # No cross-validation requested: a single fit on everything.
    if not cv:
        clf.fit(X, y)
        print("train score: %0.6f" % clf.score(X, y))
        return clf

    # StratifiedKFold comes from the notebook's import cell.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=233)

    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        if max_folds is not None and fold >= max_folds:
            break

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        print("---------")
        print("train score: %0.6f" % clf.score(X_train, y_train))
        print("test score: %0.6f" % clf.score(X_test, y_test))
        if report:
            preds = clf.predict(X_test)
            print(confusion_matrix(y_test, preds))
            print(classification_report(y_test, preds))

    return clf

In [220]:
# Cross-validated fit of the NE type classifier on the NE-only rows.
# NOTE(review): the output recorded below shows ten folds, but
# train_classifier2 as currently defined stops after the first fold by default.
clf2 = train_classifier2(X=X, y=y, cv=True)

- train classifier
---------
train score: 0.757540
test score: 0.676712
---------
train score: 0.758065
test score: 0.680171
---------
train score: 0.755099
test score: 0.691253
---------
train score: 0.758038
test score: 0.672361
---------
train score: 0.756615
test score: 0.688225
---------
train score: 0.755700
test score: 0.684564
---------
train score: 0.757767
test score: 0.669921
---------
train score: 0.755463
test score: 0.675717
---------
train score: 0.755158
test score: 0.671446
---------
train score: 0.754819
test score: 0.689750


In [224]:
# Pair every feature column fed to clf2 with its importance. The column list
# is rebuilt with the same drop used to build X; axis is passed by keyword
# because pandas 2.0 removed the positional form.
pd.DataFrame({
        "name": features_df.drop(["NE", "iob", "tag"], axis=1).columns,
        "importance": clf2.feature_importances_
    }) #.sort_values("importance", ascending=False)

Unnamed: 0,importance,name
0,0.032703,upper
1,0.070054,pos
2,0.008150,first
3,0.100237,size
4,0.124559,prev_1
5,0.080534,prev_2
6,0.080575,next_1
7,0.068814,next_2
8,0.227243,prevNE_1
9,0.045507,prevNE_2


# Stacking

In [222]:
# Stacking: the first-level predictions become one extra feature column.
# y_feature has to be computed before its shape can be printed (the previous
# order raised a NameError on a fresh kernel).
y_feature = clf2.predict(X)

print(X.shape)
print(y_feature.shape)

X2 = np.column_stack((X, y_feature))

# The stacked prediction is the last column of X2, so address it with -1
# instead of a hard-coded column number.
print(len(X2[0]))
print(X2[:10, -1])
print(y_feature[:10])

(32795, 71)
(32795,)
72
[0 0 2 2 6 6 6 3 7 2]
[0 0 2 2 6 6 6 3 7 2]


In [249]:
# Stacked label of the neighbouring NE rows.
# For every row of X2: when its prevNE_1 / nextNE_1 flag is 1, take the
# stacked prediction (last column of X2) of the row just before / after it,
# otherwise use -1. Both lookups wrap around at the ends of the array, which
# also keeps the "next" lookup on the last row inside the array.
feature_names = features_df.drop(["NE", "iob", "tag"], axis=1).columns.tolist()
prev_flag_col = feature_names.index("prevNE_1")
next_flag_col = feature_names.index("nextNE_1")
stacked = X2[:, -1]

prevNE = np.where(X2[:, prev_flag_col] == 1, np.roll(stacked, 1), -1).tolist()
nextNE = np.where(X2[:, next_flag_col] == 1, np.roll(stacked, -1), -1).tolist()

# Append both lists plus the same lists rotated by two rows (prevNE moved
# two rows down, nextNE moved two rows up).
X3 = np.column_stack((X2, prevNE, nextNE,
                      np.roll(prevNE, 2), np.roll(nextNE, -2)))

print(X3.shape)
print(len(prevNE))
print(prevNE[:10])

(32795, 76)
32795
[-1, -1, -1, -1, 2, 6, 6, -1, 3, -1]


In [255]:
# Second-level stacking: train on X3 (X2 plus the neighbour columns) and
# print the confusion matrix and classification report.
clf3 = train_classifier2(X=X3, y=y, cv=True, report=True)

- train classifier
---------
train score: 0.799288
test score: 0.769863
[[402   2  74  11   0   0   0   3]
 [ 31  88  93   6   0   0   0   0]
 [ 85   5 618  31   0   0   0   0]
 [ 49   3  43 324   1   1   2  10]
 [  0   0   0   1  78  14  72  25]
 [  0   1   0   1   7 217  83  13]
 [  0   0   0   0   7  19 448  26]
 [  0   0   0   1   3   1  32 354]]
             precision    recall  f1-score   support

          0       0.71      0.82      0.76       492
          1       0.89      0.40      0.56       218
          2       0.75      0.84      0.79       739
          3       0.86      0.75      0.80       433
          4       0.81      0.41      0.55       190
          5       0.86      0.67      0.76       322
          6       0.70      0.90      0.79       500
          7       0.82      0.91      0.86       391

avg / total       0.78      0.77      0.76      3285

