# Assignment 4: Named entity recognition

Постройте модель для обнаружения и классификации именованных сущностей (named entities) на базе корпуса CoNLL 2002.  

Используйте в своем решении ансамбли над решающими деревьями: RandomForest, Gradient Boosting (xgboost, lightgbm, catboost) 
Tutorials:  
1. https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide
1. https://github.com/catboost/tutorials 


Чем больше baseline'ов вы превзойдёте, тем выше ваша оценка.  
Метрика качества: F1 с макроусреднением (`f1_macro`); чем выше, тем лучше.
 
baseline 1: 0.0604      random labels  
baseline 2: 0.3966      PoS features + logistic regression  
baseline 3: 0.8122      word2vec cbow embedding + baseline 2 + svm    

! Your results must be reproducible. Если ваша модель - стохастическая, то вы явно должны задавать все seed и random_state в параметрах моделей   

bonus, think about:  
1. How can you exploit the fact that words belong to sentences?
2. Why did we select the F1 score with macro averaging as our classification quality measure? What other metrics are suitable?   

In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
#import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


SEED=1337

In [19]:
# Load the token-level table; the first CSV column becomes the row index.
df = pd.read_csv('data/ner_short.csv', index_col=0)
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [20]:
# number of sentences
# (largest sentence id; presumably ids are contiguous and start at 1 — TODO confirm)
df.sentence_idx.max()

1500.0

In [21]:
# class distribution
# normalize=True reports each tag's share of all rows instead of raw counts.
df.tag.value_counts(normalize=True )

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [22]:
# sentence length
# Index the rows by sentence_idx so that the per-sentence counts computed below
# are aligned by index label: every token row receives the count of tag values
# in its own sentence.
tdf = df.set_index('sentence_idx')
tdf['length'] = df.groupby('sentence_idx').tag.count()
# Turn sentence_idx back into a regular column; the rows get a fresh default
# integer index in place of the one loaded from the CSV.
df = tdf.reset_index(drop=False)

In [23]:
# Encode the categorical POS columns as integer codes.
# The single encoder is re-fitted for each column in turn, so every column gets
# its own tag -> code mapping and `le` ends up fitted on the last column.

le = LabelEncoder()
for pos_column in ('pos', 'next-pos', 'next-next-pos', 'prev-pos', 'prev-prev-pos'):
    df[pos_column] = le.fit_transform(df[pos_column])

In [24]:
# Preview the frame after adding `length` and integer-encoding the POS columns.
df.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48


In [25]:
# Train/test split.
# Tags are first mapped to integer class ids; the split is then made per token
# row, stratified on those ids so both parts keep the same tag proportions.
tag_encoder = LabelEncoder()
y = tag_encoder.fit_transform(df.tag)

split_options = dict(stratify=y, test_size=0.25, random_state=SEED, shuffle=True)
df_train, df_test, y_train, y_test = model_selection.train_test_split(df, y, **split_options)

for part_name, part in (('train', df_train), ('test', df_test)):
    print(part_name, part.shape[0])

train 50155
test 16719


In [26]:
# some wrappers to work with word2vec
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from collections import defaultdict

   
class Word2VecWrapper(TransformerMixin):
    """Transformer that maps words to vectors of a gensim ``Word2Vec`` model.

    ``fit`` trains the model on whitespace-tokenised strings; ``transform``
    returns one vector per input word and substitutes an all-zero vector for
    words that are not in the trained vocabulary.
    """

    def __init__(self, window=5,negative=5, size=100, iter=100, is_cbow=False, random_state=SEED):
        """Store the hyper-parameters; no training happens here.

        window, negative, size, iter: forwarded unchanged to ``Word2Vec``.
        is_cbow: train a CBOW model when true, skip-gram otherwise.
        random_state: forwarded to ``Word2Vec`` as ``seed``.
        """
        self.window_ = window
        self.negative_ = negative
        self.size_ = size
        self.iter_ = iter
        self.is_cbow_ = is_cbow
        self.w2v = None  # set by fit()
        self.random_state = random_state

    def get_size(self):
        """Return the dimensionality of the produced vectors."""
        return self.size_

    def fit(self, X, y=None):
        """Train the word2vec model.

        X: list of strings; each string is split on whitespace into one
           training sentence.
        y: ignored, accepted for scikit-learn API compatibility.

        Returns self.
        """
        sentences_list = [x.split() for x in X]
        # NOTE(review): gensim documents that a fixed seed alone does not give
        # bit-identical runs unless training uses a single worker thread; the
        # worker count is left at gensim's default here — confirm this is
        # acceptable for the reproducibility requirement.
        self.w2v = Word2Vec(sentences_list,
                            window=self.window_,
                            negative=self.negative_,
                            size=self.size_,
                            iter=self.iter_,
                            sg=not self.is_cbow_, seed=self.random_state)

        return self

    def _keyed_vectors(self):
        """Return the trained word-vector store, or raise if fit() was not called."""
        if self.w2v is None:
            raise Exception('model not fitted')
        # Vocabulary lookups go through `.wv`; doing them on the model object
        # itself is deprecated in gensim and no longer available in gensim 4.
        return self.w2v.wv

    def has(self, word):
        """Return True when *word* is in the trained vocabulary.

        Raises Exception if the model has not been fitted.
        """
        return word in self._keyed_vectors()

    def transform(self, X):
        """Return an array with one row vector per word.

        X: a list of words

        Words missing from the vocabulary are mapped to a zero vector of
        length ``size``. Raises Exception if the model has not been fitted.
        """
        vectors = self._keyed_vectors()
        rows = [vectors[w] if w in vectors else np.zeros(self.size_) for w in X]
        if not rows:
            # Keep the result two-dimensional so it can still be stacked
            # column-wise with other feature blocks.
            return np.empty((0, self.size_))
        return np.array(rows)
    


In [27]:
%%time
# here we exploit that word2vec is an unsupervised learning algorithm
# so we can train it on the whole dataset (subject to discussion):
# df holds every row, including those that later end up in the test split.
# NOTE(review): "sentences" are built by joining all tokens with spaces and
# splitting on every '.' character, so a token that itself contains a period
# is cut apart too, and the '.' characters never reach the model — confirm
# this is intended; grouping df by sentence_idx would presumably give the
# exact sentences.

sentences_list = [x.strip() for x in ' '.join(df.word).split('.')]

# CBOW model with 300-dimensional vectors, iter=300 and a fixed seed.
w2v_cbow = Word2VecWrapper(window=5, negative=5, size=300, iter=300, is_cbow=True, random_state=SEED)
w2v_cbow.fit(sentences_list)

CPU times: user 29 s, sys: 401 ms, total: 29.4 s
Wall time: 12.5 s


In [28]:
%%time
# baseline 1 
# random labels
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyClassifier


# POS-tag context columns; the dummy estimator does not look at the features,
# they only keep the pipeline shape identical to the later baselines.
columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    # The strategy is spelled out because the library default differs between
    # scikit-learn versions; 'stratified' draws random labels following the
    # training class distribution, which is what this baseline is meant to be.
    ('est', DummyClassifier(strategy='stratified', random_state=SEED)),
])

model.fit(df_train[columns], y_train)

# Macro-averaged F1 on both splits.
print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))


train 0.05887736725599869
test 0.060439542712750365
CPU times: user 85.2 ms, sys: 19 ms, total: 104 ms
Wall time: 108 ms


In [29]:
%%time
# baseline 2 
# pos features + one hot encoding + logistic regression
from sklearn.preprocessing import OneHotEncoder


# POS tags of the token and of its two neighbours on each side.
columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

# One-hot encode the POS codes, then fit a multinomial logistic regression
# whose regularisation strength is chosen by 5-fold CV on macro F1.
pos_encoder = OneHotEncoder()
logreg_cv = LogisticRegressionCV(
    Cs=5,
    cv=5,
    n_jobs=-1,
    scoring='f1_macro',
    penalty='l2',
    solver='newton-cg',
    multi_class='multinomial',
    random_state=SEED,
)
model = Pipeline([('enc', pos_encoder), ('est', logreg_cv)])
model.fit(df_train[columns], y_train)

# Macro-averaged F1 on both splits.
for split_name, frame, target in (('train', df_train, y_train), ('test', df_test, y_test)):
    print(split_name, metrics.f1_score(target, model.predict(frame[columns]), average='macro'))

train 0.46639500282346874
test 0.39660981421559566
CPU times: user 2min 56s, sys: 9.55 s, total: 3min 5s
Wall time: 12min 31s


In [30]:
%%time
# baseline 3
# use word2vec cbow embedding + baseline 2 + svm
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
import scipy.sparse as sp

# Baseline 3 features: word2vec vectors of the token and its four context
# words, stacked with the one-hot encoded POS tags of the same window.
embeding = w2v_cbow
encoder_pos = OneHotEncoder()

word_columns = ['word', 'next-word', 'next-next-word', 'prev-word', 'prev-prev-word']
pos_columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']


def _stack_features(frame, encode_pos):
    """Return the horizontally stacked embedding and POS blocks for *frame*.

    encode_pos is the encoder method applied to the POS columns:
    fit_transform for the training frame, transform for the test frame.
    """
    blocks = [embeding.transform(frame[name]) for name in word_columns]
    blocks.append(encode_pos(frame[pos_columns]))
    return sp.hstack(blocks)


X_train = _stack_features(df_train, encoder_pos.fit_transform)
X_test = _stack_features(df_test, encoder_pos.transform)

# Linear SVM; C is picked from a log-spaced grid by 3-fold CV on macro F1.
svm = LinearSVC(penalty='l2', multi_class='ovr', random_state=SEED)
c_grid = {'C': np.logspace(-4, 0, 5)}
model = model_selection.GridSearchCV(svm, c_grid, cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

# Macro-averaged F1 on both splits.
for split_name, features, target in (('train', X_train, y_train), ('test', X_test, y_test)):
    print(split_name, metrics.f1_score(target, model.predict(features), average='macro'))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  6.8min finished


train 0.9564419272624621
test 0.8107188907410726
CPU times: user 2min 21s, sys: 16 s, total: 2min 37s
Wall time: 9min 1s


## HW

#### RandomForestClassifier

In [56]:
from sklearn.ensemble import RandomForestClassifier

In [57]:
# Same POS context features as the baselines, plus the sentence id.
# NOTE(review): the train/test split earlier in this notebook is made per
# token row, so tokens of one sentence can presumably land on both sides; with
# sentence_idx as a feature the trees may be keying on sentence identity —
# confirm the test score is not inflated by this.
columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos', 'sentence_idx']

In [70]:
# The forest size is pinned to the 10 trees this run was recorded with; the
# library default for n_estimators differs between scikit-learn versions, so
# leaving it implicit would make the reported score irreproducible.
model = RandomForestClassifier(n_estimators=10, random_state=SEED)

In [71]:
%%time
# Fit on the training split; the cell magic above reports the fit time.
model.fit(df_train[columns], y_train)

CPU times: user 311 ms, sys: 18.7 ms, total: 329 ms
Wall time: 329 ms


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1337, verbose=0,
            warm_start=False)

In [72]:
# Macro-averaged F1 of the fitted model on the train and test splits.
for split_name, frame, target in (('train', df_train, y_train), ('test', df_test, y_test)):
    predicted = model.predict(frame[columns])
    print(split_name, metrics.f1_score(target, predicted, average='macro'))

train 0.9840905177482637
test 0.7681646307933354


In [73]:
# Larger forest: 1000 trees, all other settings left at the library defaults.
model = RandomForestClassifier(n_estimators=1000, random_state=SEED)

In [74]:
%%time
# Fit on the training split; the cell magic above reports the fit time.
model.fit(df_train[columns], y_train)

CPU times: user 29.6 s, sys: 1.31 s, total: 30.9 s
Wall time: 31.1 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=1337, verbose=0,
            warm_start=False)

In [75]:
# Macro-averaged F1 of the fitted model on the train and test splits.
for split_name, frame, target in (('train', df_train, y_train), ('test', df_test, y_test)):
    predicted = model.predict(frame[columns])
    print(split_name, metrics.f1_score(target, predicted, average='macro'))

train 0.9998132730662732
test 0.8375171019038912


Baseline побит

#### CatBoostClassifier

In [87]:
from catboost import CatBoostClassifier

In [88]:
?CatBoostClassifier

In [108]:
# Hyper-parameters collected in one mapping, one setting per line.
catboost_params = {
    'learning_rate': 1,
    'iterations': 500,
    'random_state': SEED,
    'depth': 10,
    'loss_function': 'MultiClassOneVsAll',
    'custom_loss': 'F1',
}
model = CatBoostClassifier(**catboost_params)

In [109]:
%%time
# Fit on the training split; CatBoost logs the learn metric per iteration.
model.fit(df_train[columns], y_train)

0:	learn: -0.1520296	total: 230ms	remaining: 1m 54s
1:	learn: -0.0711681	total: 459ms	remaining: 1m 54s
2:	learn: -0.0425666	total: 693ms	remaining: 1m 54s
3:	learn: -0.0331851	total: 921ms	remaining: 1m 54s
4:	learn: -0.0283574	total: 1.19s	remaining: 1m 57s
5:	learn: -0.0261516	total: 1.42s	remaining: 1m 56s
6:	learn: -0.0249479	total: 1.64s	remaining: 1m 55s
7:	learn: -0.0237545	total: 1.87s	remaining: 1m 55s
8:	learn: -0.0229641	total: 2.1s	remaining: 1m 54s
9:	learn: -0.0224826	total: 2.32s	remaining: 1m 53s
10:	learn: -0.0216834	total: 2.55s	remaining: 1m 53s
11:	learn: -0.0209366	total: 2.79s	remaining: 1m 53s
12:	learn: -0.0204668	total: 3.02s	remaining: 1m 53s
13:	learn: -0.0199539	total: 3.25s	remaining: 1m 52s
14:	learn: -0.0195303	total: 3.47s	remaining: 1m 52s
15:	learn: -0.0191501	total: 3.7s	remaining: 1m 52s
16:	learn: -0.0188818	total: 3.93s	remaining: 1m 51s
17:	learn: -0.0185861	total: 4.15s	remaining: 1m 51s
18:	learn: -0.0182668	total: 4.38s	remaining: 1m 50s
19:	l

155:	learn: -0.0058288	total: 36.4s	remaining: 1m 20s
156:	learn: -0.0058119	total: 36.6s	remaining: 1m 20s
157:	learn: -0.0057891	total: 36.8s	remaining: 1m 19s
158:	learn: -0.0057500	total: 37.1s	remaining: 1m 19s
159:	learn: -0.0057181	total: 37.3s	remaining: 1m 19s
160:	learn: -0.0056843	total: 37.5s	remaining: 1m 18s
161:	learn: -0.0056526	total: 37.7s	remaining: 1m 18s
162:	learn: -0.0056183	total: 38s	remaining: 1m 18s
163:	learn: -0.0055753	total: 38.2s	remaining: 1m 18s
164:	learn: -0.0055562	total: 38.4s	remaining: 1m 17s
165:	learn: -0.0055375	total: 38.6s	remaining: 1m 17s
166:	learn: -0.0055095	total: 38.8s	remaining: 1m 17s
167:	learn: -0.0054661	total: 39.1s	remaining: 1m 17s
168:	learn: -0.0054496	total: 39.3s	remaining: 1m 16s
169:	learn: -0.0054290	total: 39.5s	remaining: 1m 16s
170:	learn: -0.0054118	total: 39.7s	remaining: 1m 16s
171:	learn: -0.0053891	total: 39.9s	remaining: 1m 16s
172:	learn: -0.0053634	total: 40.2s	remaining: 1m 15s
173:	learn: -0.0053304	total: 

310:	learn: -0.0031554	total: 1m 10s	remaining: 42.9s
311:	learn: -0.0031504	total: 1m 10s	remaining: 42.7s
312:	learn: -0.0031393	total: 1m 11s	remaining: 42.5s
313:	learn: -0.0031322	total: 1m 11s	remaining: 42.2s
314:	learn: -0.0031194	total: 1m 11s	remaining: 42s
315:	learn: -0.0031129	total: 1m 11s	remaining: 41.8s
316:	learn: -0.0031047	total: 1m 11s	remaining: 41.5s
317:	learn: -0.0030978	total: 1m 12s	remaining: 41.3s
318:	learn: -0.0030855	total: 1m 12s	remaining: 41.1s
319:	learn: -0.0030742	total: 1m 12s	remaining: 40.8s
320:	learn: -0.0030626	total: 1m 12s	remaining: 40.6s
321:	learn: -0.0030539	total: 1m 13s	remaining: 40.4s
322:	learn: -0.0030390	total: 1m 13s	remaining: 40.1s
323:	learn: -0.0030329	total: 1m 13s	remaining: 39.9s
324:	learn: -0.0030206	total: 1m 13s	remaining: 39.7s
325:	learn: -0.0030128	total: 1m 13s	remaining: 39.5s
326:	learn: -0.0030007	total: 1m 14s	remaining: 39.2s
327:	learn: -0.0029938	total: 1m 14s	remaining: 39s
328:	learn: -0.0029880	total: 1m

463:	learn: -0.0021347	total: 1m 45s	remaining: 8.22s
464:	learn: -0.0021325	total: 1m 46s	remaining: 7.99s
465:	learn: -0.0021269	total: 1m 46s	remaining: 7.76s
466:	learn: -0.0021238	total: 1m 46s	remaining: 7.53s
467:	learn: -0.0021207	total: 1m 46s	remaining: 7.3s
468:	learn: -0.0021172	total: 1m 47s	remaining: 7.07s
469:	learn: -0.0021140	total: 1m 47s	remaining: 6.84s
470:	learn: -0.0021105	total: 1m 47s	remaining: 6.61s
471:	learn: -0.0021086	total: 1m 47s	remaining: 6.39s
472:	learn: -0.0021035	total: 1m 47s	remaining: 6.16s
473:	learn: -0.0020975	total: 1m 48s	remaining: 5.93s
474:	learn: -0.0020936	total: 1m 48s	remaining: 5.7s
475:	learn: -0.0020893	total: 1m 48s	remaining: 5.47s
476:	learn: -0.0020839	total: 1m 48s	remaining: 5.24s
477:	learn: -0.0020788	total: 1m 48s	remaining: 5.01s
478:	learn: -0.0020714	total: 1m 49s	remaining: 4.79s
479:	learn: -0.0020670	total: 1m 49s	remaining: 4.56s
480:	learn: -0.0020615	total: 1m 49s	remaining: 4.33s
481:	learn: -0.0020561	total: 

<catboost.core.CatBoostClassifier at 0x11584b550>

In [110]:
# Macro-averaged F1 of the fitted model on the train and test splits.
for split_name, frame, target in (('train', df_train, y_train), ('test', df_test, y_test)):
    predicted = model.predict(frame[columns])
    print(split_name, metrics.f1_score(target, predicted, average='macro'))

train 0.9927836303142821
test 0.8391754656989064


Baseline побит

#### Why did we select the F1 score with macro averaging as our classification quality measure? What other metrics are suitable?


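Потому что наши классы несбалансированны: около 85% токенов имеют тег `O`, поэтому accuracy и micro-усреднение почти полностью определялись бы этим классом. При макроусреднении F1 считается отдельно для каждого класса и затем усредняется с равными весами, так что редкие теги влияют на оценку так же, как частые.
Можем также использовать метрику roc_auc_score с average='macro' (для многоклассовой задачи ей нужны вероятности классов и параметр `multi_class='ovr'` или `'ovo'`).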