# Assignment 4: Named entity recognition

Create a model for Named Entity Recognition for dataset CoNLL 2002.  
Your quality metric = f1_macro

In your solution you should use: RandomForest, Gradient Boosting (xgboost, lightgbm, catboost)   
Tutorials:  
1. https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide
1. https://github.com/catboost/tutorials 

More baselines you beat - better your score
 
baseline 1 [3 points]: 0.0604      random labels  
baseline 2 [5 points]: 0.3966      PoS features + logistic regression  
baseline 3 [8 points]: 0.8122      word2vec cbow embedding + baseline 2 + svm    

[1 point] using feature engineering (creating features not presented in the baselines)

! Your results must be reproducible. You should explicitly set all seeds random_states in yout model.  
! Remember to use proper training pipeline.  

bonus, think about:  
1. [1 point] Why did we select f1 score with macro averaging as our classification quality measure? What other metrics are suitable?   

'macro' finds unweighted mean of metrics for each label. This does not take label imbalance into account. 
We can also use weighted macro-average by using parameter value 'weighted'. 

In [14]:
from catboost import CatBoostClassifier, Pool
from catboost import cv
from collections import defaultdict
from gensim.models.word2vec import Word2Vec
from sklearn.base import TransformerMixin
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
import lightgbm as lgb
import numpy as np
import pickle
import scipy.sparse as sp
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

SEED=1337

In [2]:
df = pd.read_csv('data/ner_short.csv', index_col=0)
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [3]:
# number of sentences
df.sentence_idx.max()

1500.0

In [4]:
# class distribution
df.tag.value_counts(normalize=True )

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [5]:
# sentence length
tdf = df.set_index('sentence_idx')
tdf['length'] = df.groupby('sentence_idx').tag.count()
df = tdf.reset_index(drop=False)

## Features

Basic feature engineering:

In [6]:
df['contains-dash'] = df.word.apply(lambda x: '-' in x)
df['contains-dot'] = df.word.apply(lambda x: '.' in x)
df['all-caps'] = df.word.apply(lambda x: x == x.upper())
df['capitalized'] = df.word.apply(lambda x: x[0].isupper())
df['prev-capitalized'] = df['prev-word'].apply(lambda x: x[0].isupper())
df['next-capitalized'] = df['next-word'].apply(lambda x: x[0].isupper())
df['prev-all-caps'] = df['prev-word'].apply(lambda x: x == x.capitalize())
df['next-all-caps'] = df['next-word'].apply(lambda x: x == x.capitalize())

In [7]:
# encode categorial variables
le = LabelEncoder()
df['pos'] = le.fit_transform(df.pos)
df['capitalized'] = le.fit_transform(df['capitalized'])
df['prev-capitalized'] = le.fit_transform(df['prev-capitalized'])
df['next-capitalized'] = le.fit_transform(df['next-capitalized'])
df['prev-all-caps'] = le.fit_transform(df['prev-all-caps'])
df['next-all-caps'] = le.fit_transform(df['next-all-caps'])
df['contains-dash'] = le.fit_transform(df['contains-dash'])
df['contains-dot'] = le.fit_transform(df['contains-dot'])
df['all-caps'] = le.fit_transform(df['all-caps'])
df['next-pos'] = le.fit_transform(df['next-pos'])
df['next-next-pos'] = le.fit_transform(df['next-next-pos'])
df['prev-pos'] = le.fit_transform(df['prev-pos'])
df['prev-prev-pos'] = le.fit_transform(df['prev-prev-pos'])

In [8]:
df.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,...,tag,length,contains-dash,contains-dot,all-caps,capitalized,prev-capitalized,next-capitalized,prev-all-caps,next-all-caps
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,...,O,48,0,0,0,1,0,0,0,0
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,...,O,48,0,0,0,0,1,0,1,0
2,1.0,32,marched,33,have,18,9,18,Thousands,of,...,O,48,0,0,0,0,0,0,0,0
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,...,O,48,0,0,0,0,0,0,0,0
4,1.0,16,London,9,through,32,33,18,demonstrators,have,...,O,48,0,0,0,0,0,0,0,0


In [9]:
# splitting
y = LabelEncoder().fit_transform(df.tag)

df_train, df_test, y_train, y_test = model_selection.train_test_split(df, y, stratify=y, 
                                                                      test_size=0.25, random_state=SEED, shuffle=True)
print('train', df_train.shape[0])
print('test', df_test.shape[0])

train 50155
test 16719


### Training word embeddings

In [10]:
# some wrappers to work with word2vec
class Word2VecWrapper(TransformerMixin):
    def __init__(self, window=5,negative=5, size=100, iter=100, is_cbow=False, random_state=SEED):
        self.window_ = window
        self.negative_ = negative
        self.size_ = size
        self.iter_ = iter
        self.is_cbow_ = is_cbow
        self.w2v = None
        self.random_state = random_state
        
    def get_size(self):
        return self.size_

    def fit(self, X, y=None):
        """
        X: list of strings
        """
        sentences_list = [x.split() for x in X]
        self.w2v = Word2Vec(sentences_list, 
                            window=self.window_,
                            negative=self.negative_, 
                            size=self.size_, 
                            iter=self.iter_,
                            sg=not self.is_cbow_, seed=self.random_state)

        return self
    
    def has(self, word):
        return word in self.w2v

    def transform(self, X):
        """
        X: a list of words
        """
        if self.w2v is None:
            raise Exception('model not fitted')
        return np.array([self.w2v[w] if w in self.w2v else np.zeros(self.size_) for w in X ])

In [11]:
sentences_list = [' '.join(x).strip() for x in df.groupby('sentence_idx')['word'].apply(list).values]

w2v_cbow = Word2VecWrapper(window=5, negative=5, size=300, iter=300, is_cbow=True, random_state=SEED)
w2v_cbow.fit(sentences_list)

<__main__.Word2VecWrapper at 0x203bcb81a20>

## Baselines

In [13]:
# baseline 1 
# random labels
columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', DummyClassifier(random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.05887736725599869
test 0.060439542712750365


In [14]:
columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', LogisticRegressionCV(Cs=5, cv=5, n_jobs=-1, scoring='f1_macro', 
                             penalty='l2', solver='newton-cg', multi_class='multinomial', random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.46639500282346874
test 0.39660981421559566


In [13]:
columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos', 'contains-dash', 'contains-dot',
       'all-caps', 'capitalized', 'prev-capitalized', 'next-capitalized',
       'prev-all-caps', 'next-all-caps']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', LogisticRegressionCV(Cs=5, cv=5, n_jobs=-1, scoring='f1_macro', 
                             penalty='l2', solver='newton-cg', multi_class='multinomial', random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.5578845946120418
test 0.4565175934928435


In [16]:
embedding = w2v_cbow
encoder_pos = OneHotEncoder()
X_train = sp.hstack([
    embedding.transform(df_train.word),
    embedding.transform(df_train['next-word']),
    embedding.transform(df_train['next-next-word']),
    embedding.transform(df_train['prev-word']),
    embedding.transform(df_train['prev-prev-word']),
    encoder_pos.fit_transform(df_train[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])
X_test = sp.hstack([
    embedding.transform(df_test.word),
    embedding.transform(df_test['next-word']),
    embedding.transform(df_test['next-next-word']),
    embedding.transform(df_test['prev-word']),
    embedding.transform(df_test['prev-prev-word']),
    encoder_pos.transform(df_test[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])

In [None]:
%%time
model = model_selection.GridSearchCV(LinearSVC(penalty='l2', multi_class='ovr', random_state=SEED), 
                                    {'C': np.logspace(-4, 0, 5)}, 
                                    cv=3, scoring='f1_macro', n_jobs=3, verbose=1)
model.fit(X_train, y_train)

print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


In [None]:
# %%time
# model = model_selection.GridSearchCV(SVC(penalty='l2', multi_class='ovr', random_state=SEED), 
#                                     {'C': np.logspace(-4, 0, 5)}, 
#                                     cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
# model.fit(X_train, y_train)

# print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
# print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

# Random Forest

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
embedding = w2v_cbow
encoder_pos = OneHotEncoder()
X_train = sp.hstack([
    embedding.transform(df_train.word),
    embedding.transform(df_train['next-word']),
    embedding.transform(df_train['next-next-word']),
    embedding.transform(df_train['prev-word']),
    embedding.transform(df_train['prev-prev-word']),
    encoder_pos.fit_transform(df_train[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
]).toarray()
X_test = sp.hstack([
    embedding.transform(df_test.word),
    embedding.transform(df_test['next-word']),
    embedding.transform(df_test['next-next-word']),
    embedding.transform(df_test['prev-word']),
    embedding.transform(df_test['prev-prev-word']),
    encoder_pos.transform(df_test[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
]).toarray()

In [17]:
model = RandomForestClassifier(n_estimators=1000, max_depth=5,random_state=SEED,n_jobs=-1,class_weight='balanced', oob_score=True)
cv = StratifiedKFold(n_splits=3, random_state=SEED, shuffle=True)
results = []

for (train, test), i in zip(cv.split(X_train, y_train), range(3)):
    model.fit(X_train[train], y_train[train])
    pred = model.predict(X_train[test])
    results.append(metrics.f1_score(y_train[test], pred, average='macro'))
print(np.mean(results))

model.fit(X_train, y_train)
print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

0.44872708203108863
train 0.5019499283672822
test 0.4606488037075824


In [18]:
param_grid = {'n_estimators': [500, 1000, 1500],
                 'max_depth': [5, 10, 15] }
model = RandomForestClassifier(random_state=SEED,n_jobs=-1, class_weight='balanced')
grid_rf = GridSearchCV(model, param_grid, cv=cv)
grid_rf.fit(X_train, y_train)
grid_rf.grid_scores_

KeyboardInterrupt: 

# Gradient Boosting models

## catboost

https://catboost.ai/docs/concepts/python-usages-examples.html#multiclassification

In [0]:
embedding = w2v_cbow
encoder_pos = OneHotEncoder()
X_train = sp.hstack([
    embedding.transform(df_train.word),
    embedding.transform(df_train['next-word']),
    embedding.transform(df_train['next-next-word']),
    embedding.transform(df_train['prev-word']),
    embedding.transform(df_train['prev-prev-word']),
    encoder_pos.fit_transform(df_train[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])
X_test = sp.hstack([
    embedding.transform(df_test.word),
    embedding.transform(df_test['next-word']),
    embedding.transform(df_test['next-next-word']),
    embedding.transform(df_test['prev-word']),
    embedding.transform(df_test['prev-prev-word']),
    encoder_pos.transform(df_test[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])

In [0]:
embedding.transform(df_train.word[0]).shape

(9, 300)

Word embedding length = 300, and we have 5 word embedding features => the first 1500 variables are numeric, the rest is categorical.

In [0]:
X_train.toarray()[0, range(1500, X_train.shape[1])]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.])

In [0]:
X_train.toarray()[0, :1500]

array([-2.68607473,  1.88339102,  0.78943968, ..., -2.13685369,
        0.0206965 ,  0.5490101 ])

In [0]:
cat_features = range(1500, X_train.shape[1])
X_train_dense = pd.DataFrame(X_train.toarray())
for i in range(1500, X_train.shape[1]):
    X_train_dense[i] = X_train_dense[i].apply(int)
cat_features = range(1500, X_test.shape[1])
X_test_dense = pd.DataFrame(X_test.toarray())
for i in range(1500, X_test.shape[1]):
    X_test_dense[i] = X_test_dense[i].apply(int)

In [0]:
train_pool = Pool(X_train_dense, label=y_train, cat_features=cat_features)
test_pool = Pool(X_test_dense, label=y_test, cat_features=cat_features)

In [0]:
cv = StratifiedKFold(n_splits=3, random_state=SEED, shuffle=True)
results = []
for (train, test), i in zip(cv.split(X_train, y_train), range(3)):
    model = CatBoostClassifier(iterations=700,
                           learning_rate=0.3,
                           depth=7, random_seed=SEED, task_type="GPU",
                           loss_function='MultiClass', custom_metric='F1')
    cv_train_pool = Pool(X_train_dense.iloc[train, :], label=y_train[train], cat_features=cat_features)
    cv_test_pool = Pool(X_train_dense.iloc[test, :], label=y_train[test], cat_features=cat_features)
    model.fit(cv_train_pool)
    pred = model.predict(cv_test_pool)
    results.append(metrics.f1_score(y_train[test], pred, average='macro'))

0:	learn: 0.4939191	total: 424ms	remaining: 4m 56s
1:	learn: 0.3999060	total: 783ms	remaining: 4m 33s
2:	learn: 0.3494629	total: 1.12s	remaining: 4m 21s
3:	learn: 0.3199036	total: 1.45s	remaining: 4m 12s
4:	learn: 0.2978637	total: 1.73s	remaining: 4m
5:	learn: 0.2852235	total: 2.04s	remaining: 3m 56s
6:	learn: 0.2714483	total: 2.34s	remaining: 3m 52s
7:	learn: 0.2602653	total: 2.63s	remaining: 3m 47s
8:	learn: 0.2518238	total: 2.93s	remaining: 3m 44s
9:	learn: 0.2444319	total: 3.25s	remaining: 3m 44s
10:	learn: 0.2350723	total: 3.61s	remaining: 3m 46s
11:	learn: 0.2264581	total: 3.97s	remaining: 3m 47s
12:	learn: 0.2231187	total: 4.29s	remaining: 3m 46s
13:	learn: 0.2178996	total: 4.62s	remaining: 3m 46s
14:	learn: 0.2139548	total: 4.95s	remaining: 3m 46s
15:	learn: 0.2093347	total: 5.29s	remaining: 3m 46s
16:	learn: 0.2054936	total: 5.64s	remaining: 3m 46s
17:	learn: 0.2015199	total: 6.01s	remaining: 3m 47s
18:	learn: 0.1980902	total: 6.36s	remaining: 3m 47s
19:	learn: 0.1957162	total

159:	learn: 0.0738715	total: 49.3s	remaining: 2m 46s
160:	learn: 0.0736111	total: 49.6s	remaining: 2m 45s
161:	learn: 0.0733508	total: 49.9s	remaining: 2m 45s
162:	learn: 0.0731394	total: 50.1s	remaining: 2m 45s
163:	learn: 0.0728904	total: 50.4s	remaining: 2m 44s
164:	learn: 0.0727000	total: 50.7s	remaining: 2m 44s
165:	learn: 0.0723649	total: 51s	remaining: 2m 44s
166:	learn: 0.0721610	total: 51.3s	remaining: 2m 43s
167:	learn: 0.0720231	total: 51.6s	remaining: 2m 43s
168:	learn: 0.0718992	total: 51.8s	remaining: 2m 42s
169:	learn: 0.0716272	total: 52.1s	remaining: 2m 42s
170:	learn: 0.0712681	total: 52.4s	remaining: 2m 42s
171:	learn: 0.0710907	total: 52.7s	remaining: 2m 41s
172:	learn: 0.0707321	total: 53s	remaining: 2m 41s
173:	learn: 0.0703797	total: 53.3s	remaining: 2m 41s
174:	learn: 0.0701791	total: 53.6s	remaining: 2m 40s
175:	learn: 0.0700816	total: 53.9s	remaining: 2m 40s
176:	learn: 0.0696667	total: 54.2s	remaining: 2m 40s
177:	learn: 0.0693959	total: 54.5s	remaining: 2m 3

314:	learn: 0.0434022	total: 1m 35s	remaining: 1m 56s
315:	learn: 0.0432870	total: 1m 35s	remaining: 1m 56s
316:	learn: 0.0432127	total: 1m 35s	remaining: 1m 55s
317:	learn: 0.0431280	total: 1m 36s	remaining: 1m 55s
318:	learn: 0.0430295	total: 1m 36s	remaining: 1m 55s
319:	learn: 0.0429107	total: 1m 36s	remaining: 1m 54s
320:	learn: 0.0428381	total: 1m 36s	remaining: 1m 54s
321:	learn: 0.0426368	total: 1m 37s	remaining: 1m 54s
322:	learn: 0.0424639	total: 1m 37s	remaining: 1m 53s
323:	learn: 0.0424204	total: 1m 37s	remaining: 1m 53s
324:	learn: 0.0422846	total: 1m 38s	remaining: 1m 53s
325:	learn: 0.0420271	total: 1m 38s	remaining: 1m 52s
326:	learn: 0.0419123	total: 1m 38s	remaining: 1m 52s
327:	learn: 0.0418341	total: 1m 39s	remaining: 1m 52s
328:	learn: 0.0417319	total: 1m 39s	remaining: 1m 51s
329:	learn: 0.0416511	total: 1m 39s	remaining: 1m 51s
330:	learn: 0.0414296	total: 1m 39s	remaining: 1m 51s
331:	learn: 0.0412212	total: 1m 40s	remaining: 1m 51s
332:	learn: 0.0411603	total:

467:	learn: 0.0284682	total: 2m 21s	remaining: 1m 10s
468:	learn: 0.0284173	total: 2m 21s	remaining: 1m 9s
469:	learn: 0.0283636	total: 2m 22s	remaining: 1m 9s
470:	learn: 0.0282984	total: 2m 22s	remaining: 1m 9s
471:	learn: 0.0282223	total: 2m 22s	remaining: 1m 8s
472:	learn: 0.0281850	total: 2m 22s	remaining: 1m 8s
473:	learn: 0.0281357	total: 2m 23s	remaining: 1m 8s
474:	learn: 0.0280500	total: 2m 23s	remaining: 1m 7s
475:	learn: 0.0279902	total: 2m 23s	remaining: 1m 7s
476:	learn: 0.0279543	total: 2m 24s	remaining: 1m 7s
477:	learn: 0.0278417	total: 2m 24s	remaining: 1m 7s
478:	learn: 0.0277592	total: 2m 24s	remaining: 1m 6s
479:	learn: 0.0276774	total: 2m 24s	remaining: 1m 6s
480:	learn: 0.0274909	total: 2m 25s	remaining: 1m 6s
481:	learn: 0.0274547	total: 2m 25s	remaining: 1m 5s
482:	learn: 0.0273814	total: 2m 25s	remaining: 1m 5s
483:	learn: 0.0273255	total: 2m 26s	remaining: 1m 5s
484:	learn: 0.0272234	total: 2m 26s	remaining: 1m 4s
485:	learn: 0.0271281	total: 2m 26s	remaining

623:	learn: 0.0190421	total: 3m 9s	remaining: 23.1s
624:	learn: 0.0189573	total: 3m 10s	remaining: 22.8s
625:	learn: 0.0189038	total: 3m 10s	remaining: 22.5s
626:	learn: 0.0188562	total: 3m 10s	remaining: 22.2s
627:	learn: 0.0188060	total: 3m 11s	remaining: 21.9s
628:	learn: 0.0187511	total: 3m 11s	remaining: 21.6s
629:	learn: 0.0186758	total: 3m 11s	remaining: 21.3s
630:	learn: 0.0186503	total: 3m 12s	remaining: 21s
631:	learn: 0.0186325	total: 3m 12s	remaining: 20.7s
632:	learn: 0.0186146	total: 3m 12s	remaining: 20.4s
633:	learn: 0.0185422	total: 3m 13s	remaining: 20.1s
634:	learn: 0.0185057	total: 3m 13s	remaining: 19.8s
635:	learn: 0.0184706	total: 3m 13s	remaining: 19.5s
636:	learn: 0.0183649	total: 3m 14s	remaining: 19.2s
637:	learn: 0.0182977	total: 3m 14s	remaining: 18.9s
638:	learn: 0.0182277	total: 3m 14s	remaining: 18.6s
639:	learn: 0.0181937	total: 3m 15s	remaining: 18.3s
640:	learn: 0.0181385	total: 3m 15s	remaining: 18s
641:	learn: 0.0180887	total: 3m 15s	remaining: 17.7

80:	learn: 0.1137787	total: 25.8s	remaining: 3m 17s
81:	learn: 0.1129028	total: 26.2s	remaining: 3m 17s
82:	learn: 0.1126226	total: 26.4s	remaining: 3m 16s
83:	learn: 0.1118319	total: 26.8s	remaining: 3m 16s
84:	learn: 0.1113546	total: 27s	remaining: 3m 15s
85:	learn: 0.1108152	total: 27.3s	remaining: 3m 14s
86:	learn: 0.1104831	total: 27.6s	remaining: 3m 14s
87:	learn: 0.1096212	total: 27.9s	remaining: 3m 13s
88:	learn: 0.1089131	total: 28.2s	remaining: 3m 13s
89:	learn: 0.1081640	total: 28.5s	remaining: 3m 13s
90:	learn: 0.1073414	total: 28.8s	remaining: 3m 13s
91:	learn: 0.1070011	total: 29.1s	remaining: 3m 12s
92:	learn: 0.1067677	total: 29.4s	remaining: 3m 11s
93:	learn: 0.1061248	total: 29.7s	remaining: 3m 11s
94:	learn: 0.1055999	total: 30s	remaining: 3m 10s
95:	learn: 0.1050062	total: 30.3s	remaining: 3m 10s
96:	learn: 0.1047233	total: 30.6s	remaining: 3m 9s
97:	learn: 0.1042512	total: 30.9s	remaining: 3m 9s
98:	learn: 0.1038009	total: 31.2s	remaining: 3m 9s
99:	learn: 0.102885

237:	learn: 0.0589381	total: 1m 13s	remaining: 2m 21s
238:	learn: 0.0588078	total: 1m 13s	remaining: 2m 21s
239:	learn: 0.0586542	total: 1m 13s	remaining: 2m 21s
240:	learn: 0.0583373	total: 1m 14s	remaining: 2m 21s
241:	learn: 0.0580318	total: 1m 14s	remaining: 2m 20s
242:	learn: 0.0578636	total: 1m 14s	remaining: 2m 20s
243:	learn: 0.0577337	total: 1m 14s	remaining: 2m 20s
244:	learn: 0.0575400	total: 1m 15s	remaining: 2m 19s
245:	learn: 0.0574265	total: 1m 15s	remaining: 2m 19s
246:	learn: 0.0570555	total: 1m 15s	remaining: 2m 19s
247:	learn: 0.0569676	total: 1m 16s	remaining: 2m 18s
248:	learn: 0.0568721	total: 1m 16s	remaining: 2m 18s
249:	learn: 0.0567433	total: 1m 16s	remaining: 2m 18s
250:	learn: 0.0566316	total: 1m 16s	remaining: 2m 17s
251:	learn: 0.0562868	total: 1m 17s	remaining: 2m 17s
252:	learn: 0.0562199	total: 1m 17s	remaining: 2m 16s
253:	learn: 0.0559489	total: 1m 17s	remaining: 2m 16s
254:	learn: 0.0558392	total: 1m 18s	remaining: 2m 16s
255:	learn: 0.0555692	total:

390:	learn: 0.0372284	total: 1m 58s	remaining: 1m 34s
391:	learn: 0.0371522	total: 1m 59s	remaining: 1m 33s
392:	learn: 0.0369526	total: 1m 59s	remaining: 1m 33s
393:	learn: 0.0367925	total: 1m 59s	remaining: 1m 33s
394:	learn: 0.0367304	total: 2m	remaining: 1m 32s
395:	learn: 0.0366171	total: 2m	remaining: 1m 32s
396:	learn: 0.0365590	total: 2m	remaining: 1m 32s
397:	learn: 0.0364115	total: 2m 1s	remaining: 1m 31s
398:	learn: 0.0362805	total: 2m 1s	remaining: 1m 31s
399:	learn: 0.0362112	total: 2m 1s	remaining: 1m 31s
400:	learn: 0.0361067	total: 2m 2s	remaining: 1m 30s
401:	learn: 0.0359855	total: 2m 2s	remaining: 1m 30s
402:	learn: 0.0359309	total: 2m 2s	remaining: 1m 30s
403:	learn: 0.0358568	total: 2m 2s	remaining: 1m 30s
404:	learn: 0.0357415	total: 2m 3s	remaining: 1m 29s
405:	learn: 0.0356335	total: 2m 3s	remaining: 1m 29s
406:	learn: 0.0355792	total: 2m 3s	remaining: 1m 29s
407:	learn: 0.0354977	total: 2m 4s	remaining: 1m 28s
408:	learn: 0.0354318	total: 2m 4s	remaining: 1m 28

545:	learn: 0.0242822	total: 2m 46s	remaining: 46.9s
546:	learn: 0.0242350	total: 2m 46s	remaining: 46.6s
547:	learn: 0.0241153	total: 2m 46s	remaining: 46.3s
548:	learn: 0.0240209	total: 2m 47s	remaining: 46s
549:	learn: 0.0239734	total: 2m 47s	remaining: 45.7s
550:	learn: 0.0239246	total: 2m 47s	remaining: 45.4s
551:	learn: 0.0238499	total: 2m 48s	remaining: 45.1s
552:	learn: 0.0238242	total: 2m 48s	remaining: 44.8s
553:	learn: 0.0237582	total: 2m 48s	remaining: 44.5s
554:	learn: 0.0237399	total: 2m 49s	remaining: 44.2s
555:	learn: 0.0237101	total: 2m 49s	remaining: 43.8s
556:	learn: 0.0236101	total: 2m 49s	remaining: 43.6s
557:	learn: 0.0235637	total: 2m 49s	remaining: 43.3s
558:	learn: 0.0235191	total: 2m 50s	remaining: 42.9s
559:	learn: 0.0234018	total: 2m 50s	remaining: 42.7s
560:	learn: 0.0232981	total: 2m 50s	remaining: 42.4s
561:	learn: 0.0232346	total: 2m 51s	remaining: 42.1s
562:	learn: 0.0232013	total: 2m 51s	remaining: 41.7s
563:	learn: 0.0231404	total: 2m 51s	remaining: 4

2:	learn: 0.3495968	total: 1.04s	remaining: 4m 1s
3:	learn: 0.3204435	total: 1.44s	remaining: 4m 10s
4:	learn: 0.2944622	total: 1.84s	remaining: 4m 15s
5:	learn: 0.2768979	total: 2.14s	remaining: 4m 7s
6:	learn: 0.2604422	total: 2.5s	remaining: 4m 7s
7:	learn: 0.2500905	total: 2.79s	remaining: 4m 1s
8:	learn: 0.2405177	total: 3.14s	remaining: 4m 1s
9:	learn: 0.2325027	total: 3.49s	remaining: 4m
10:	learn: 0.2250177	total: 3.83s	remaining: 3m 59s
11:	learn: 0.2204126	total: 4.14s	remaining: 3m 57s
12:	learn: 0.2149782	total: 4.45s	remaining: 3m 54s
13:	learn: 0.2111917	total: 4.73s	remaining: 3m 51s
14:	learn: 0.2064251	total: 5.08s	remaining: 3m 51s
15:	learn: 0.2041665	total: 5.39s	remaining: 3m 50s
16:	learn: 0.2012424	total: 5.69s	remaining: 3m 48s
17:	learn: 0.1979456	total: 6.01s	remaining: 3m 47s
18:	learn: 0.1952539	total: 6.38s	remaining: 3m 48s
19:	learn: 0.1924715	total: 6.69s	remaining: 3m 47s
20:	learn: 0.1884910	total: 7.02s	remaining: 3m 46s
21:	learn: 0.1864047	total: 7.

161:	learn: 0.0760449	total: 50.3s	remaining: 2m 47s
162:	learn: 0.0755785	total: 50.6s	remaining: 2m 46s
163:	learn: 0.0753023	total: 50.9s	remaining: 2m 46s
164:	learn: 0.0749827	total: 51.2s	remaining: 2m 45s
165:	learn: 0.0747104	total: 51.4s	remaining: 2m 45s
166:	learn: 0.0745321	total: 51.7s	remaining: 2m 45s
167:	learn: 0.0740717	total: 52s	remaining: 2m 44s
168:	learn: 0.0737289	total: 52.3s	remaining: 2m 44s
169:	learn: 0.0736020	total: 52.6s	remaining: 2m 43s
170:	learn: 0.0734450	total: 52.9s	remaining: 2m 43s
171:	learn: 0.0733151	total: 53.1s	remaining: 2m 43s
172:	learn: 0.0727122	total: 53.5s	remaining: 2m 42s
173:	learn: 0.0723601	total: 53.8s	remaining: 2m 42s
174:	learn: 0.0721956	total: 54.1s	remaining: 2m 42s
175:	learn: 0.0720305	total: 54.3s	remaining: 2m 41s
176:	learn: 0.0714890	total: 54.7s	remaining: 2m 41s
177:	learn: 0.0713725	total: 55s	remaining: 2m 41s
178:	learn: 0.0711826	total: 55.3s	remaining: 2m 40s
179:	learn: 0.0706377	total: 55.6s	remaining: 2m 4

316:	learn: 0.0453881	total: 1m 36s	remaining: 1m 57s
317:	learn: 0.0452746	total: 1m 37s	remaining: 1m 56s
318:	learn: 0.0451886	total: 1m 37s	remaining: 1m 56s
319:	learn: 0.0451338	total: 1m 37s	remaining: 1m 56s
320:	learn: 0.0450261	total: 1m 38s	remaining: 1m 55s
321:	learn: 0.0446729	total: 1m 38s	remaining: 1m 55s
322:	learn: 0.0442911	total: 1m 38s	remaining: 1m 55s
323:	learn: 0.0441979	total: 1m 39s	remaining: 1m 54s
324:	learn: 0.0441118	total: 1m 39s	remaining: 1m 54s
325:	learn: 0.0440579	total: 1m 39s	remaining: 1m 54s
326:	learn: 0.0440108	total: 1m 39s	remaining: 1m 54s
327:	learn: 0.0439000	total: 1m 40s	remaining: 1m 53s
328:	learn: 0.0438097	total: 1m 40s	remaining: 1m 53s
329:	learn: 0.0437059	total: 1m 40s	remaining: 1m 53s
330:	learn: 0.0436370	total: 1m 41s	remaining: 1m 52s
331:	learn: 0.0435507	total: 1m 41s	remaining: 1m 52s
332:	learn: 0.0433622	total: 1m 41s	remaining: 1m 52s
333:	learn: 0.0432820	total: 1m 42s	remaining: 1m 51s
334:	learn: 0.0431686	total:

469:	learn: 0.0298512	total: 2m 23s	remaining: 1m 10s
470:	learn: 0.0297799	total: 2m 24s	remaining: 1m 10s
471:	learn: 0.0296661	total: 2m 24s	remaining: 1m 9s
472:	learn: 0.0295943	total: 2m 24s	remaining: 1m 9s
473:	learn: 0.0295456	total: 2m 25s	remaining: 1m 9s
474:	learn: 0.0294374	total: 2m 25s	remaining: 1m 8s
475:	learn: 0.0293317	total: 2m 25s	remaining: 1m 8s
476:	learn: 0.0292225	total: 2m 25s	remaining: 1m 8s
477:	learn: 0.0291704	total: 2m 26s	remaining: 1m 7s
478:	learn: 0.0291425	total: 2m 26s	remaining: 1m 7s
479:	learn: 0.0290828	total: 2m 26s	remaining: 1m 7s
480:	learn: 0.0290290	total: 2m 27s	remaining: 1m 6s
481:	learn: 0.0289741	total: 2m 27s	remaining: 1m 6s
482:	learn: 0.0288858	total: 2m 27s	remaining: 1m 6s
483:	learn: 0.0288058	total: 2m 28s	remaining: 1m 6s
484:	learn: 0.0287362	total: 2m 28s	remaining: 1m 5s
485:	learn: 0.0286517	total: 2m 28s	remaining: 1m 5s
486:	learn: 0.0285919	total: 2m 29s	remaining: 1m 5s
487:	learn: 0.0285309	total: 2m 29s	remainin

626:	learn: 0.0199396	total: 3m 12s	remaining: 22.4s
627:	learn: 0.0199079	total: 3m 12s	remaining: 22.1s
628:	learn: 0.0198937	total: 3m 12s	remaining: 21.8s
629:	learn: 0.0198707	total: 3m 13s	remaining: 21.5s
630:	learn: 0.0198113	total: 3m 13s	remaining: 21.2s
631:	learn: 0.0197540	total: 3m 13s	remaining: 20.9s
632:	learn: 0.0196685	total: 3m 14s	remaining: 20.6s
633:	learn: 0.0196577	total: 3m 14s	remaining: 20.3s
634:	learn: 0.0196153	total: 3m 14s	remaining: 19.9s
635:	learn: 0.0195950	total: 3m 15s	remaining: 19.6s
636:	learn: 0.0195713	total: 3m 15s	remaining: 19.3s
637:	learn: 0.0195510	total: 3m 15s	remaining: 19s
638:	learn: 0.0194854	total: 3m 16s	remaining: 18.7s
639:	learn: 0.0194174	total: 3m 16s	remaining: 18.4s
640:	learn: 0.0193585	total: 3m 16s	remaining: 18.1s
641:	learn: 0.0192900	total: 3m 16s	remaining: 17.8s
642:	learn: 0.0192462	total: 3m 17s	remaining: 17.5s
643:	learn: 0.0191971	total: 3m 17s	remaining: 17.2s
644:	learn: 0.0191556	total: 3m 17s	remaining: 1

Tried combinations:
* iterations = 700, depth = 7, learning_rate=0.3 = 0.8071248815000267
* iterations = 500, depth = 7, learning_rate=0.1 = 0.6809201963196093
* iterations = 500, depth = 7, learning_rate=0.3 = 0.8004303348578977
* iterations = 500, depth = 7, learning_rate=0.7 = 0.7442901356639421
* iterations = 500, depth = 7 = 0.76000677987306
* iterations = 100 = 0.5870651930750483

In [0]:
model = CatBoostClassifier(iterations=700,
                           learning_rate=0.3,
                           depth=7, random_seed=SEED, task_type="GPU",
                           loss_function='MultiClass', custom_metric='F1')
model.fit(train_pool)
print(metrics.f1_score(y_train, model.predict(train_pool), average='macro'))
print(metrics.f1_score(y_test, model.predict(test_pool), average='macro'))

0:	learn: 0.4500981	total: 340ms	remaining: 3m 57s
1:	learn: 0.3705955	total: 659ms	remaining: 3m 50s
2:	learn: 0.3296056	total: 1.09s	remaining: 4m 14s
3:	learn: 0.3010868	total: 1.44s	remaining: 4m 10s
4:	learn: 0.2773092	total: 1.75s	remaining: 4m 2s
5:	learn: 0.2626347	total: 2.04s	remaining: 3m 56s
6:	learn: 0.2506253	total: 2.36s	remaining: 3m 53s
7:	learn: 0.2414308	total: 2.65s	remaining: 3m 49s
8:	learn: 0.2323643	total: 2.99s	remaining: 3m 49s
9:	learn: 0.2247710	total: 3.29s	remaining: 3m 47s
10:	learn: 0.2177627	total: 3.65s	remaining: 3m 48s
11:	learn: 0.2137834	total: 3.92s	remaining: 3m 44s
12:	learn: 0.2102821	total: 4.23s	remaining: 3m 43s
13:	learn: 0.2052601	total: 4.59s	remaining: 3m 45s
14:	learn: 0.2009527	total: 4.93s	remaining: 3m 45s
15:	learn: 0.1969740	total: 5.28s	remaining: 3m 45s
16:	learn: 0.1930065	total: 5.63s	remaining: 3m 46s
17:	learn: 0.1894878	total: 5.96s	remaining: 3m 45s
18:	learn: 0.1861672	total: 6.3s	remaining: 3m 45s
19:	learn: 0.1822429	tot

158:	learn: 0.0693029	total: 50.7s	remaining: 2m 52s
159:	learn: 0.0689330	total: 51s	remaining: 2m 52s
160:	learn: 0.0684752	total: 51.4s	remaining: 2m 51s
161:	learn: 0.0681673	total: 51.7s	remaining: 2m 51s
162:	learn: 0.0679999	total: 52s	remaining: 2m 51s
163:	learn: 0.0677871	total: 52.3s	remaining: 2m 50s
164:	learn: 0.0676029	total: 52.5s	remaining: 2m 50s
165:	learn: 0.0674783	total: 52.8s	remaining: 2m 49s
166:	learn: 0.0668131	total: 53.2s	remaining: 2m 49s
167:	learn: 0.0666968	total: 53.5s	remaining: 2m 49s
168:	learn: 0.0663109	total: 53.8s	remaining: 2m 49s
169:	learn: 0.0659165	total: 54.2s	remaining: 2m 48s
170:	learn: 0.0655729	total: 54.5s	remaining: 2m 48s
171:	learn: 0.0652658	total: 54.8s	remaining: 2m 48s
172:	learn: 0.0651540	total: 55s	remaining: 2m 47s
173:	learn: 0.0650174	total: 55.3s	remaining: 2m 47s
174:	learn: 0.0647316	total: 55.7s	remaining: 2m 46s
175:	learn: 0.0643718	total: 56s	remaining: 2m 46s
176:	learn: 0.0640686	total: 56.3s	remaining: 2m 46s
1

312:	learn: 0.0405406	total: 1m 37s	remaining: 2m
313:	learn: 0.0404387	total: 1m 38s	remaining: 2m
314:	learn: 0.0403232	total: 1m 38s	remaining: 2m
315:	learn: 0.0402377	total: 1m 38s	remaining: 1m 59s
316:	learn: 0.0401385	total: 1m 39s	remaining: 1m 59s
317:	learn: 0.0400862	total: 1m 39s	remaining: 1m 59s
318:	learn: 0.0399613	total: 1m 39s	remaining: 1m 58s
319:	learn: 0.0398275	total: 1m 39s	remaining: 1m 58s
320:	learn: 0.0396903	total: 1m 40s	remaining: 1m 58s
321:	learn: 0.0396442	total: 1m 40s	remaining: 1m 58s
322:	learn: 0.0395513	total: 1m 40s	remaining: 1m 57s
323:	learn: 0.0392926	total: 1m 41s	remaining: 1m 57s
324:	learn: 0.0392017	total: 1m 41s	remaining: 1m 57s
325:	learn: 0.0390636	total: 1m 41s	remaining: 1m 56s
326:	learn: 0.0388765	total: 1m 42s	remaining: 1m 56s
327:	learn: 0.0388165	total: 1m 42s	remaining: 1m 56s
328:	learn: 0.0387104	total: 1m 42s	remaining: 1m 55s
329:	learn: 0.0385630	total: 1m 43s	remaining: 1m 55s
330:	learn: 0.0384455	total: 1m 43s	rema

465:	learn: 0.0268642	total: 2m 24s	remaining: 1m 12s
466:	learn: 0.0267745	total: 2m 25s	remaining: 1m 12s
467:	learn: 0.0267375	total: 2m 25s	remaining: 1m 12s
468:	learn: 0.0266319	total: 2m 25s	remaining: 1m 11s
469:	learn: 0.0265739	total: 2m 26s	remaining: 1m 11s
470:	learn: 0.0265289	total: 2m 26s	remaining: 1m 11s
471:	learn: 0.0264737	total: 2m 26s	remaining: 1m 10s
472:	learn: 0.0264297	total: 2m 26s	remaining: 1m 10s
473:	learn: 0.0263738	total: 2m 27s	remaining: 1m 10s
474:	learn: 0.0261823	total: 2m 27s	remaining: 1m 9s
475:	learn: 0.0261302	total: 2m 27s	remaining: 1m 9s
476:	learn: 0.0260402	total: 2m 28s	remaining: 1m 9s
477:	learn: 0.0259952	total: 2m 28s	remaining: 1m 9s
478:	learn: 0.0258857	total: 2m 28s	remaining: 1m 8s
479:	learn: 0.0258148	total: 2m 29s	remaining: 1m 8s
480:	learn: 0.0257675	total: 2m 29s	remaining: 1m 8s
481:	learn: 0.0257102	total: 2m 29s	remaining: 1m 7s
482:	learn: 0.0256707	total: 2m 30s	remaining: 1m 7s
483:	learn: 0.0256480	total: 2m 30s	r

621:	learn: 0.0187341	total: 3m 13s	remaining: 24.2s
622:	learn: 0.0187004	total: 3m 13s	remaining: 23.9s
623:	learn: 0.0186319	total: 3m 13s	remaining: 23.6s
624:	learn: 0.0185836	total: 3m 14s	remaining: 23.3s
625:	learn: 0.0185123	total: 3m 14s	remaining: 23s
626:	learn: 0.0185027	total: 3m 14s	remaining: 22.7s
627:	learn: 0.0184335	total: 3m 15s	remaining: 22.4s
628:	learn: 0.0183768	total: 3m 15s	remaining: 22.1s
629:	learn: 0.0183640	total: 3m 15s	remaining: 21.8s
630:	learn: 0.0183279	total: 3m 16s	remaining: 21.4s
631:	learn: 0.0182908	total: 3m 16s	remaining: 21.1s
632:	learn: 0.0182619	total: 3m 16s	remaining: 20.8s
633:	learn: 0.0182146	total: 3m 17s	remaining: 20.5s
634:	learn: 0.0181391	total: 3m 17s	remaining: 20.2s
635:	learn: 0.0180909	total: 3m 17s	remaining: 19.9s
636:	learn: 0.0180421	total: 3m 17s	remaining: 19.6s
637:	learn: 0.0180071	total: 3m 18s	remaining: 19.3s
638:	learn: 0.0179417	total: 3m 18s	remaining: 19s
639:	learn: 0.0178942	total: 3m 18s	remaining: 18.

In [0]:
print(metrics.f1_score(y_train, model.predict(train_pool), average='macro'))
print(metrics.f1_score(y_test, model.predict(test_pool), average='macro'))

0.9907213644696256
0.8654063816609154


In [0]:
pickle.dump(model, open('catboost_best.p', 'wb'))

## lightgbm

Cross-validation

In [0]:
params = {"objective" : "multiclass",
          "num_class" : len(set(y)),
          "learning_rate" : 0.05}

In [0]:
cv = StratifiedKFold(n_splits=3, random_state=SEED, shuffle=True)
results = []
    
for (train, test), i in zip(cv.split(X_train, y_train), range(3)):
    cv_train_pool = lgb.Dataset(X_train_dense.iloc[train, :], y_train[train])
    cv_test_pool = lgb.Dataset(X_train_dense.iloc[test, :], y_train[test], reference=cv_train_pool)
    gbm = lgb.train(params,
                    cv_train_pool,
                    num_boost_round=100,
                    valid_sets=cv_test_pool,
                    early_stopping_rounds=5)
    lgb_pred = gbm.predict(X_train_dense.iloc[test, :], num_iteration=gbm.best_iteration)
    pred = []
    for x in lgb_pred:
        pred.append(np.argmax(x))
    results.append(metrics.f1_score(y_train[test], pred, average='macro'))

[1]	valid_0's multi_logloss: 0.599915
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 0.546661
[3]	valid_0's multi_logloss: 0.501008
[4]	valid_0's multi_logloss: 0.468844
[5]	valid_0's multi_logloss: 0.442685
[6]	valid_0's multi_logloss: 0.419831
[7]	valid_0's multi_logloss: 0.399646
[8]	valid_0's multi_logloss: 0.381652
[9]	valid_0's multi_logloss: 0.365684
[10]	valid_0's multi_logloss: 0.351159
[11]	valid_0's multi_logloss: 0.337737
[12]	valid_0's multi_logloss: 0.325476
[13]	valid_0's multi_logloss: 0.313977
[14]	valid_0's multi_logloss: 0.30321
[15]	valid_0's multi_logloss: 0.293177
[16]	valid_0's multi_logloss: 0.283878
[17]	valid_0's multi_logloss: 0.275193
[18]	valid_0's multi_logloss: 0.267042
[19]	valid_0's multi_logloss: 0.259353
[20]	valid_0's multi_logloss: 0.252255
[21]	valid_0's multi_logloss: 0.245413
[22]	valid_0's multi_logloss: 0.239012
[23]	valid_0's multi_logloss: 0.233108
[24]	valid_0's multi_logloss: 0.227378
[25]	valid_0'

[3]	valid_0's multi_logloss: 0.50444
[4]	valid_0's multi_logloss: 0.47222
[5]	valid_0's multi_logloss: 0.445509
[6]	valid_0's multi_logloss: 0.422203
[7]	valid_0's multi_logloss: 0.401537
[8]	valid_0's multi_logloss: 0.382883
[9]	valid_0's multi_logloss: 0.366364
[10]	valid_0's multi_logloss: 0.351417
[11]	valid_0's multi_logloss: 0.337815
[12]	valid_0's multi_logloss: 0.325205
[13]	valid_0's multi_logloss: 0.313675
[14]	valid_0's multi_logloss: 0.30296
[15]	valid_0's multi_logloss: 0.292795
[16]	valid_0's multi_logloss: 0.28328
[17]	valid_0's multi_logloss: 0.274465
[18]	valid_0's multi_logloss: 0.266154
[19]	valid_0's multi_logloss: 0.258363
[20]	valid_0's multi_logloss: 0.251126
[21]	valid_0's multi_logloss: 0.244239
[22]	valid_0's multi_logloss: 0.237771
[23]	valid_0's multi_logloss: 0.231644
[24]	valid_0's multi_logloss: 0.225815
[25]	valid_0's multi_logloss: 0.220288
[26]	valid_0's multi_logloss: 0.215113
[27]	valid_0's multi_logloss: 0.210215
[28]	valid_0's multi_logloss: 0.2055

In [0]:
print(np.mean(results))

0.8007655663935349


Refit on the entire dataset.

In [0]:
cv_train_pool = lgb.Dataset(X_train_dense, y_train)
gbm = lgb.train(params,
                cv_train_pool,
                num_boost_round=100)

# train f1
lgb_pred = gbm.predict(X_train_dense, num_iteration=gbm.best_iteration)
pred = []
for x in lgb_pred:
    pred.append(np.argmax(x))
print(metrics.f1_score(y_train, pred, average='macro'))

# test f1
lgb_pred = gbm.predict(X_test_dense, num_iteration=gbm.best_iteration)
pred = []
for x in lgb_pred:
    pred.append(np.argmax(x))
print(metrics.f1_score(y_test, pred, average='macro'))

0.9887626860049901
0.8297961920567984


In [0]:
pickle.dump(gbm, open('gbm_best.p', 'wb'))

## xgboost

In [0]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [0]:
param = {
    'max_depth': 5,
    'eta': 0.1,
    'objective': 'multi:softprob',
    'num_class': len(set(y))}
num_round = 50
bst = xgb.train(param, dtrain, num_round)

In [0]:
preds = bst.predict(dtest)
best_preds = np.asarray([np.argmax(line) for line in preds])

In [0]:
metrics.f1_score(y_test, best_preds, average='macro')

0.6767117443800256

Simple grid search.

In [0]:
model = XGBClassifier()
n_estimators = range(50, 400, 50)
param_grid = dict(n_estimators=n_estimators)
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
grid_search = GridSearchCV(model, param_grid, scoring="f1_macro", n_jobs=2, cv=kfold)
grid_result = grid_search.fit(X_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.808794 using {'n_estimators': 350}


In [0]:
pickle.dump(grid_result.best_estimator_, open('best_xgboost.p', 'wb'))

In [0]:
metrics.f1_score(y_test, grid_result.best_estimator_.predict(X_test), average='macro')

0.8656513444188269

Refit on the entire dataset.

In [0]:
bestxgbmodel = XGBClassifier(n_estimators=350)
bestxgbmodel.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=350, n_jobs=1, nthread=None,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [0]:
metrics.f1_score(y_test, bestxgbmodel.predict(X_test), average='macro')

0.8656513444188269