In [4]:
import gensim
import pandas as pd
import numpy as np

In [5]:
model = gensim.models.KeyedVectors.load_word2vec_format('all.bin', binary=True)

In [6]:
def read_data(file):
    with open(file) as f:
        sentences = [sentence.split('\n') for sentence in f.read().split('\n\n')]
        df = pd.DataFrame([line.split() for sentence in sentences for line in sentence], columns=['word', 'tag'])
        df['tag'] = df['tag'].map(change_tag)
    return df


def change_tag(tag):
    if tag == 'O':
        return tag
    tag = tag[2:]
    return tag

In [21]:
train = read_data('data/collection_3/train_sa.txt')
valid = read_data('data/collection_3/valid_sa.txt')
test = read_data('data/collection_3/test_sa.txt')

In [8]:
train['tag'].value_counts()

O      133250
PER     11433
ORG      6096
LOC      4360
Name: tag, dtype: int64

In [9]:
test['tag'].value_counts()

O      28405
PER     2590
ORG     1307
LOC     1026
Name: tag, dtype: int64

In [28]:
def get_embedding(word):
    try:
        return model.get_vector(word.lower()).tolist()
    except KeyError:
        return np.zeros(150).tolist()
    

def get_features(df):
    X = pd.DataFrame(df['word'].apply(lambda x: get_embedding(x)).tolist())
    X['word'] = df['word'].map(lambda x: ' '.join(x))
    X['cap'] = X['word'].map(lambda x: x.istitle())
    X['alpha'] = X['word'].map(lambda x: x.isalpha())
    return X

In [29]:
X_train = get_features(train)
X_valid = get_features(valid)
X_test = get_features(test)

y_train = train['tag']
y_valid = valid['tag']
y_test = test['tag']

In [25]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, f1_score

In [34]:
clf = CatBoostClassifier(iterations=10000, task_type='GPU')

In [35]:
clf.fit(X_train, y_train, text_features=['word'], eval_set=(X_valid, y_valid), use_best_model=True, early_stopping_rounds=10)

Learning rate set to 0.063987
0:	learn: 1.2458455	test: 1.2432762	best: 1.2432762 (0)	total: 19.9ms	remaining: 3m 19s
1:	learn: 1.1343088	test: 1.1295961	best: 1.1295961 (1)	total: 34.4ms	remaining: 2m 51s
2:	learn: 1.0432859	test: 1.0369071	best: 1.0369071 (2)	total: 47.3ms	remaining: 2m 37s
3:	learn: 0.9670181	test: 0.9589070	best: 0.9589070 (3)	total: 62.3ms	remaining: 2m 35s
4:	learn: 0.9017302	test: 0.8926726	best: 0.8926726 (4)	total: 77.1ms	remaining: 2m 34s
5:	learn: 0.8454434	test: 0.8354443	best: 0.8354443 (5)	total: 93.4ms	remaining: 2m 35s
6:	learn: 0.7964836	test: 0.7856080	best: 0.7856080 (6)	total: 108ms	remaining: 2m 33s
7:	learn: 0.7532099	test: 0.7419414	best: 0.7419414 (7)	total: 122ms	remaining: 2m 32s
8:	learn: 0.7151744	test: 0.7035241	best: 0.7035241 (8)	total: 137ms	remaining: 2m 31s
9:	learn: 0.6810275	test: 0.6689777	best: 0.6689777 (9)	total: 151ms	remaining: 2m 31s
10:	learn: 0.6503095	test: 0.6379523	best: 0.6379523 (10)	total: 166ms	remaining: 2m 30s
11:	l

95:	learn: 0.2889828	test: 0.2785846	best: 0.2785846 (95)	total: 1.23s	remaining: 2m 7s
96:	learn: 0.2884026	test: 0.2780166	best: 0.2780166 (96)	total: 1.25s	remaining: 2m 7s
97:	learn: 0.2880578	test: 0.2776536	best: 0.2776536 (97)	total: 1.26s	remaining: 2m 7s
98:	learn: 0.2876894	test: 0.2773811	best: 0.2773811 (98)	total: 1.27s	remaining: 2m 7s
99:	learn: 0.2874157	test: 0.2770995	best: 0.2770995 (99)	total: 1.28s	remaining: 2m 7s
100:	learn: 0.2869978	test: 0.2767682	best: 0.2767682 (100)	total: 1.3s	remaining: 2m 7s
101:	learn: 0.2867401	test: 0.2764871	best: 0.2764871 (101)	total: 1.31s	remaining: 2m 6s
102:	learn: 0.2865412	test: 0.2763303	best: 0.2763303 (102)	total: 1.32s	remaining: 2m 6s
103:	learn: 0.2862058	test: 0.2759551	best: 0.2759551 (103)	total: 1.33s	remaining: 2m 6s
104:	learn: 0.2858973	test: 0.2756896	best: 0.2756896 (104)	total: 1.34s	remaining: 2m 6s
105:	learn: 0.2855324	test: 0.2753055	best: 0.2753055 (105)	total: 1.35s	remaining: 2m 5s
106:	learn: 0.2852839

187:	learn: 0.2694746	test: 0.2599274	best: 0.2599274 (187)	total: 2.25s	remaining: 1m 57s
188:	learn: 0.2693170	test: 0.2597562	best: 0.2597562 (188)	total: 2.26s	remaining: 1m 57s
189:	learn: 0.2691267	test: 0.2595630	best: 0.2595630 (189)	total: 2.27s	remaining: 1m 57s
190:	learn: 0.2690402	test: 0.2594949	best: 0.2594949 (190)	total: 2.28s	remaining: 1m 57s
191:	learn: 0.2689492	test: 0.2594205	best: 0.2594205 (191)	total: 2.29s	remaining: 1m 57s
192:	learn: 0.2687038	test: 0.2591152	best: 0.2591152 (192)	total: 2.3s	remaining: 1m 57s
193:	learn: 0.2683429	test: 0.2588429	best: 0.2588429 (193)	total: 2.32s	remaining: 1m 57s
194:	learn: 0.2681509	test: 0.2586651	best: 0.2586651 (194)	total: 2.33s	remaining: 1m 57s
195:	learn: 0.2680806	test: 0.2586144	best: 0.2586144 (195)	total: 2.34s	remaining: 1m 57s
196:	learn: 0.2679191	test: 0.2584589	best: 0.2584589 (196)	total: 2.35s	remaining: 1m 56s
197:	learn: 0.2677506	test: 0.2583312	best: 0.2583312 (197)	total: 2.36s	remaining: 1m 56s


289:	learn: 0.2594218	test: 0.2508834	best: 0.2508834 (289)	total: 3.25s	remaining: 1m 48s
290:	learn: 0.2593667	test: 0.2508603	best: 0.2508603 (290)	total: 3.26s	remaining: 1m 48s
291:	learn: 0.2592686	test: 0.2507652	best: 0.2507652 (291)	total: 3.27s	remaining: 1m 48s
292:	learn: 0.2592483	test: 0.2507358	best: 0.2507358 (292)	total: 3.28s	remaining: 1m 48s
293:	learn: 0.2592273	test: 0.2507347	best: 0.2507347 (293)	total: 3.29s	remaining: 1m 48s
294:	learn: 0.2591762	test: 0.2506792	best: 0.2506792 (294)	total: 3.3s	remaining: 1m 48s
295:	learn: 0.2590080	test: 0.2505819	best: 0.2505819 (295)	total: 3.31s	remaining: 1m 48s
296:	learn: 0.2588633	test: 0.2504507	best: 0.2504507 (296)	total: 3.32s	remaining: 1m 48s
297:	learn: 0.2588196	test: 0.2504210	best: 0.2504210 (297)	total: 3.33s	remaining: 1m 48s
298:	learn: 0.2587830	test: 0.2503722	best: 0.2503722 (298)	total: 3.33s	remaining: 1m 48s
299:	learn: 0.2587213	test: 0.2503388	best: 0.2503388 (299)	total: 3.34s	remaining: 1m 48s


399:	learn: 0.2527666	test: 0.2456547	best: 0.2456547 (399)	total: 4.26s	remaining: 1m 42s
400:	learn: 0.2526601	test: 0.2455899	best: 0.2455899 (400)	total: 4.27s	remaining: 1m 42s
401:	learn: 0.2526437	test: 0.2455883	best: 0.2455883 (401)	total: 4.28s	remaining: 1m 42s
402:	learn: 0.2526111	test: 0.2455643	best: 0.2455643 (402)	total: 4.29s	remaining: 1m 42s
403:	learn: 0.2525956	test: 0.2455565	best: 0.2455565 (403)	total: 4.3s	remaining: 1m 42s
404:	learn: 0.2525823	test: 0.2455523	best: 0.2455523 (404)	total: 4.31s	remaining: 1m 42s
405:	learn: 0.2525430	test: 0.2455092	best: 0.2455092 (405)	total: 4.32s	remaining: 1m 41s
406:	learn: 0.2524992	test: 0.2454814	best: 0.2454814 (406)	total: 4.32s	remaining: 1m 41s
407:	learn: 0.2524145	test: 0.2454109	best: 0.2454109 (407)	total: 4.33s	remaining: 1m 41s
408:	learn: 0.2523840	test: 0.2453963	best: 0.2453963 (408)	total: 4.34s	remaining: 1m 41s
409:	learn: 0.2523344	test: 0.2453127	best: 0.2453127 (409)	total: 4.35s	remaining: 1m 41s


491:	learn: 0.2492262	test: 0.2428921	best: 0.2428921 (491)	total: 5.06s	remaining: 1m 37s
492:	learn: 0.2491943	test: 0.2428574	best: 0.2428574 (492)	total: 5.07s	remaining: 1m 37s
493:	learn: 0.2491632	test: 0.2428427	best: 0.2428427 (493)	total: 5.08s	remaining: 1m 37s
494:	learn: 0.2491441	test: 0.2428345	best: 0.2428345 (494)	total: 5.09s	remaining: 1m 37s
495:	learn: 0.2491295	test: 0.2428194	best: 0.2428194 (495)	total: 5.1s	remaining: 1m 37s
496:	learn: 0.2490919	test: 0.2427772	best: 0.2427772 (496)	total: 5.11s	remaining: 1m 37s
497:	learn: 0.2490691	test: 0.2427525	best: 0.2427525 (497)	total: 5.12s	remaining: 1m 37s
498:	learn: 0.2490286	test: 0.2427380	best: 0.2427380 (498)	total: 5.13s	remaining: 1m 37s
499:	learn: 0.2490154	test: 0.2427298	best: 0.2427298 (499)	total: 5.13s	remaining: 1m 37s
500:	learn: 0.2490069	test: 0.2427338	best: 0.2427298 (499)	total: 5.14s	remaining: 1m 37s
501:	learn: 0.2489923	test: 0.2427238	best: 0.2427238 (501)	total: 5.15s	remaining: 1m 37s


593:	learn: 0.2459141	test: 0.2405405	best: 0.2405405 (593)	total: 6.08s	remaining: 1m 36s
594:	learn: 0.2459002	test: 0.2405337	best: 0.2405337 (594)	total: 6.09s	remaining: 1m 36s
595:	learn: 0.2458570	test: 0.2404878	best: 0.2404878 (595)	total: 6.1s	remaining: 1m 36s
596:	learn: 0.2458247	test: 0.2404540	best: 0.2404540 (596)	total: 6.11s	remaining: 1m 36s
597:	learn: 0.2457681	test: 0.2403998	best: 0.2403998 (597)	total: 6.13s	remaining: 1m 36s
598:	learn: 0.2457208	test: 0.2403489	best: 0.2403489 (598)	total: 6.14s	remaining: 1m 36s
599:	learn: 0.2457068	test: 0.2403282	best: 0.2403282 (599)	total: 6.15s	remaining: 1m 36s
600:	learn: 0.2456991	test: 0.2403280	best: 0.2403280 (600)	total: 6.16s	remaining: 1m 36s
601:	learn: 0.2456821	test: 0.2403175	best: 0.2403175 (601)	total: 6.17s	remaining: 1m 36s
602:	learn: 0.2456663	test: 0.2402915	best: 0.2402915 (602)	total: 6.18s	remaining: 1m 36s
603:	learn: 0.2456343	test: 0.2402814	best: 0.2402814 (603)	total: 6.19s	remaining: 1m 36s


691:	learn: 0.2435009	test: 0.2389608	best: 0.2389608 (691)	total: 7.08s	remaining: 1m 35s
692:	learn: 0.2434954	test: 0.2389571	best: 0.2389571 (692)	total: 7.09s	remaining: 1m 35s
693:	learn: 0.2434796	test: 0.2389561	best: 0.2389561 (693)	total: 7.1s	remaining: 1m 35s
694:	learn: 0.2434761	test: 0.2389541	best: 0.2389541 (694)	total: 7.11s	remaining: 1m 35s
695:	learn: 0.2434699	test: 0.2389512	best: 0.2389512 (695)	total: 7.12s	remaining: 1m 35s
696:	learn: 0.2434442	test: 0.2389397	best: 0.2389397 (696)	total: 7.13s	remaining: 1m 35s
697:	learn: 0.2434222	test: 0.2389209	best: 0.2389209 (697)	total: 7.14s	remaining: 1m 35s
698:	learn: 0.2433949	test: 0.2388930	best: 0.2388930 (698)	total: 7.15s	remaining: 1m 35s
699:	learn: 0.2433586	test: 0.2388606	best: 0.2388606 (699)	total: 7.16s	remaining: 1m 35s
700:	learn: 0.2433363	test: 0.2388508	best: 0.2388508 (700)	total: 7.17s	remaining: 1m 35s
701:	learn: 0.2433248	test: 0.2388289	best: 0.2388289 (701)	total: 7.18s	remaining: 1m 35s


795:	learn: 0.2410102	test: 0.2372957	best: 0.2372957 (795)	total: 8.08s	remaining: 1m 33s
796:	learn: 0.2409712	test: 0.2372692	best: 0.2372692 (796)	total: 8.09s	remaining: 1m 33s
797:	learn: 0.2409357	test: 0.2372291	best: 0.2372291 (797)	total: 8.11s	remaining: 1m 33s
798:	learn: 0.2409065	test: 0.2372015	best: 0.2372015 (798)	total: 8.12s	remaining: 1m 33s
799:	learn: 0.2409010	test: 0.2372007	best: 0.2372007 (799)	total: 8.13s	remaining: 1m 33s
800:	learn: 0.2408895	test: 0.2371810	best: 0.2371810 (800)	total: 8.14s	remaining: 1m 33s
801:	learn: 0.2408827	test: 0.2371799	best: 0.2371799 (801)	total: 8.15s	remaining: 1m 33s
802:	learn: 0.2408729	test: 0.2371773	best: 0.2371773 (802)	total: 8.16s	remaining: 1m 33s
803:	learn: 0.2408645	test: 0.2371758	best: 0.2371758 (803)	total: 8.17s	remaining: 1m 33s
804:	learn: 0.2408533	test: 0.2371658	best: 0.2371658 (804)	total: 8.18s	remaining: 1m 33s
805:	learn: 0.2408202	test: 0.2371315	best: 0.2371315 (805)	total: 8.2s	remaining: 1m 33s


904:	learn: 0.2388867	test: 0.2357430	best: 0.2357430 (904)	total: 9.09s	remaining: 1m 31s
905:	learn: 0.2388777	test: 0.2357422	best: 0.2357422 (905)	total: 9.1s	remaining: 1m 31s
906:	learn: 0.2388599	test: 0.2357197	best: 0.2357197 (906)	total: 9.11s	remaining: 1m 31s
907:	learn: 0.2388483	test: 0.2357069	best: 0.2357069 (907)	total: 9.12s	remaining: 1m 31s
908:	learn: 0.2388340	test: 0.2357027	best: 0.2357027 (908)	total: 9.13s	remaining: 1m 31s
909:	learn: 0.2388022	test: 0.2356786	best: 0.2356786 (909)	total: 9.14s	remaining: 1m 31s
910:	learn: 0.2387495	test: 0.2356270	best: 0.2356270 (910)	total: 9.15s	remaining: 1m 31s
911:	learn: 0.2387421	test: 0.2356223	best: 0.2356223 (911)	total: 9.16s	remaining: 1m 31s
912:	learn: 0.2387366	test: 0.2356214	best: 0.2356214 (912)	total: 9.16s	remaining: 1m 31s
913:	learn: 0.2387334	test: 0.2356214	best: 0.2356214 (912)	total: 9.17s	remaining: 1m 31s
914:	learn: 0.2387037	test: 0.2356023	best: 0.2356023 (914)	total: 9.18s	remaining: 1m 31s


1015:	learn: 0.2369599	test: 0.2344881	best: 0.2344881 (1015)	total: 10.1s	remaining: 1m 29s
1016:	learn: 0.2369520	test: 0.2344871	best: 0.2344871 (1016)	total: 10.1s	remaining: 1m 29s
1017:	learn: 0.2369240	test: 0.2344774	best: 0.2344774 (1017)	total: 10.1s	remaining: 1m 29s
1018:	learn: 0.2369022	test: 0.2344518	best: 0.2344518 (1018)	total: 10.1s	remaining: 1m 29s
1019:	learn: 0.2368997	test: 0.2344516	best: 0.2344516 (1019)	total: 10.1s	remaining: 1m 29s
1020:	learn: 0.2368841	test: 0.2344394	best: 0.2344394 (1020)	total: 10.1s	remaining: 1m 29s
1021:	learn: 0.2368560	test: 0.2344296	best: 0.2344296 (1021)	total: 10.2s	remaining: 1m 29s
1022:	learn: 0.2368425	test: 0.2344233	best: 0.2344233 (1022)	total: 10.2s	remaining: 1m 29s
1023:	learn: 0.2368278	test: 0.2344107	best: 0.2344107 (1023)	total: 10.2s	remaining: 1m 29s
1024:	learn: 0.2368111	test: 0.2343982	best: 0.2343982 (1024)	total: 10.2s	remaining: 1m 29s
1025:	learn: 0.2367847	test: 0.2343909	best: 0.2343909 (1025)	total: 1

1123:	learn: 0.2353002	test: 0.2335496	best: 0.2335496 (1123)	total: 11.1s	remaining: 1m 27s
1124:	learn: 0.2352959	test: 0.2335495	best: 0.2335495 (1124)	total: 11.1s	remaining: 1m 27s
1125:	learn: 0.2352857	test: 0.2335393	best: 0.2335393 (1125)	total: 11.1s	remaining: 1m 27s
1126:	learn: 0.2352637	test: 0.2335101	best: 0.2335101 (1126)	total: 11.1s	remaining: 1m 27s
1127:	learn: 0.2352578	test: 0.2335030	best: 0.2335030 (1127)	total: 11.1s	remaining: 1m 27s
1128:	learn: 0.2352509	test: 0.2335014	best: 0.2335014 (1128)	total: 11.2s	remaining: 1m 27s
1129:	learn: 0.2352227	test: 0.2334730	best: 0.2334730 (1129)	total: 11.2s	remaining: 1m 27s
1130:	learn: 0.2352165	test: 0.2334650	best: 0.2334650 (1130)	total: 11.2s	remaining: 1m 27s
1131:	learn: 0.2351791	test: 0.2334491	best: 0.2334491 (1131)	total: 11.2s	remaining: 1m 27s
1132:	learn: 0.2351683	test: 0.2334471	best: 0.2334471 (1132)	total: 11.2s	remaining: 1m 27s
1133:	learn: 0.2351653	test: 0.2334474	best: 0.2334471 (1132)	total: 1

1213:	learn: 0.2340068	test: 0.2328161	best: 0.2328161 (1213)	total: 11.9s	remaining: 1m 26s
1214:	learn: 0.2340003	test: 0.2328085	best: 0.2328085 (1214)	total: 11.9s	remaining: 1m 26s
1215:	learn: 0.2339939	test: 0.2328031	best: 0.2328031 (1215)	total: 11.9s	remaining: 1m 26s
1216:	learn: 0.2339856	test: 0.2328039	best: 0.2328031 (1215)	total: 11.9s	remaining: 1m 26s
1217:	learn: 0.2339510	test: 0.2327784	best: 0.2327784 (1217)	total: 12s	remaining: 1m 26s
1218:	learn: 0.2339251	test: 0.2327744	best: 0.2327744 (1218)	total: 12s	remaining: 1m 26s
1219:	learn: 0.2339124	test: 0.2327725	best: 0.2327725 (1219)	total: 12s	remaining: 1m 26s
1220:	learn: 0.2339096	test: 0.2327735	best: 0.2327725 (1219)	total: 12s	remaining: 1m 26s
1221:	learn: 0.2339067	test: 0.2327746	best: 0.2327725 (1219)	total: 12s	remaining: 1m 26s
1222:	learn: 0.2338951	test: 0.2327709	best: 0.2327709 (1222)	total: 12s	remaining: 1m 26s
1223:	learn: 0.2338833	test: 0.2327695	best: 0.2327695 (1223)	total: 12s	remaining

1314:	learn: 0.2327499	test: 0.2321444	best: 0.2321444 (1314)	total: 12.9s	remaining: 1m 25s
1315:	learn: 0.2327395	test: 0.2321358	best: 0.2321358 (1315)	total: 12.9s	remaining: 1m 25s
1316:	learn: 0.2327287	test: 0.2321313	best: 0.2321313 (1316)	total: 12.9s	remaining: 1m 25s
1317:	learn: 0.2327116	test: 0.2321237	best: 0.2321237 (1317)	total: 12.9s	remaining: 1m 25s
1318:	learn: 0.2327028	test: 0.2321206	best: 0.2321206 (1318)	total: 13s	remaining: 1m 25s
1319:	learn: 0.2326660	test: 0.2320828	best: 0.2320828 (1319)	total: 13s	remaining: 1m 25s
1320:	learn: 0.2326365	test: 0.2320647	best: 0.2320647 (1320)	total: 13s	remaining: 1m 25s
1321:	learn: 0.2326341	test: 0.2320642	best: 0.2320642 (1321)	total: 13s	remaining: 1m 25s
1322:	learn: 0.2326173	test: 0.2320636	best: 0.2320636 (1322)	total: 13s	remaining: 1m 25s
1323:	learn: 0.2326068	test: 0.2320526	best: 0.2320526 (1323)	total: 13s	remaining: 1m 25s
1324:	learn: 0.2325988	test: 0.2320528	best: 0.2320526 (1323)	total: 13s	remaining

1404:	learn: 0.2316731	test: 0.2315557	best: 0.2315557 (1404)	total: 13.7s	remaining: 1m 24s
1405:	learn: 0.2316653	test: 0.2315520	best: 0.2315520 (1405)	total: 13.7s	remaining: 1m 23s
1406:	learn: 0.2316611	test: 0.2315484	best: 0.2315484 (1406)	total: 13.7s	remaining: 1m 23s
1407:	learn: 0.2316468	test: 0.2315259	best: 0.2315259 (1407)	total: 13.8s	remaining: 1m 23s
1408:	learn: 0.2316291	test: 0.2315132	best: 0.2315132 (1408)	total: 13.8s	remaining: 1m 23s
1409:	learn: 0.2316253	test: 0.2315108	best: 0.2315108 (1409)	total: 13.8s	remaining: 1m 23s
1410:	learn: 0.2316224	test: 0.2315115	best: 0.2315108 (1409)	total: 13.8s	remaining: 1m 23s
1411:	learn: 0.2316017	test: 0.2315012	best: 0.2315012 (1411)	total: 13.8s	remaining: 1m 23s
1412:	learn: 0.2315717	test: 0.2314909	best: 0.2314909 (1412)	total: 13.8s	remaining: 1m 23s
1413:	learn: 0.2315640	test: 0.2314827	best: 0.2314827 (1413)	total: 13.8s	remaining: 1m 23s
1414:	learn: 0.2315601	test: 0.2314783	best: 0.2314783 (1414)	total: 1

1493:	learn: 0.2306940	test: 0.2309746	best: 0.2309746 (1493)	total: 14.5s	remaining: 1m 22s
1494:	learn: 0.2306772	test: 0.2309799	best: 0.2309746 (1493)	total: 14.5s	remaining: 1m 22s
1495:	learn: 0.2306657	test: 0.2309765	best: 0.2309746 (1493)	total: 14.6s	remaining: 1m 22s
1496:	learn: 0.2306596	test: 0.2309797	best: 0.2309746 (1493)	total: 14.6s	remaining: 1m 22s
1497:	learn: 0.2306578	test: 0.2309790	best: 0.2309746 (1493)	total: 14.6s	remaining: 1m 22s
1498:	learn: 0.2306562	test: 0.2309797	best: 0.2309746 (1493)	total: 14.6s	remaining: 1m 22s
1499:	learn: 0.2306538	test: 0.2309742	best: 0.2309742 (1499)	total: 14.6s	remaining: 1m 22s
1500:	learn: 0.2306497	test: 0.2309734	best: 0.2309734 (1500)	total: 14.6s	remaining: 1m 22s
1501:	learn: 0.2306456	test: 0.2309688	best: 0.2309688 (1501)	total: 14.6s	remaining: 1m 22s
1502:	learn: 0.2306145	test: 0.2309517	best: 0.2309517 (1502)	total: 14.6s	remaining: 1m 22s
1503:	learn: 0.2305810	test: 0.2309401	best: 0.2309401 (1503)	total: 1

1582:	learn: 0.2298177	test: 0.2305489	best: 0.2305489 (1582)	total: 15.3s	remaining: 1m 21s
1583:	learn: 0.2297890	test: 0.2305195	best: 0.2305195 (1583)	total: 15.4s	remaining: 1m 21s
1584:	learn: 0.2297724	test: 0.2305144	best: 0.2305144 (1584)	total: 15.4s	remaining: 1m 21s
1585:	learn: 0.2297610	test: 0.2305032	best: 0.2305032 (1585)	total: 15.4s	remaining: 1m 21s
1586:	learn: 0.2297428	test: 0.2304984	best: 0.2304984 (1586)	total: 15.4s	remaining: 1m 21s
1587:	learn: 0.2297085	test: 0.2304893	best: 0.2304893 (1587)	total: 15.4s	remaining: 1m 21s
1588:	learn: 0.2297031	test: 0.2304858	best: 0.2304858 (1588)	total: 15.4s	remaining: 1m 21s
1589:	learn: 0.2296991	test: 0.2304864	best: 0.2304858 (1588)	total: 15.4s	remaining: 1m 21s
1590:	learn: 0.2296959	test: 0.2304779	best: 0.2304779 (1590)	total: 15.4s	remaining: 1m 21s
1591:	learn: 0.2296818	test: 0.2304729	best: 0.2304729 (1591)	total: 15.4s	remaining: 1m 21s
1592:	learn: 0.2296749	test: 0.2304709	best: 0.2304709 (1592)	total: 1

1687:	learn: 0.2287629	test: 0.2300541	best: 0.2300523 (1686)	total: 16.3s	remaining: 1m 20s
1688:	learn: 0.2287560	test: 0.2300530	best: 0.2300523 (1686)	total: 16.4s	remaining: 1m 20s
1689:	learn: 0.2287535	test: 0.2300534	best: 0.2300523 (1686)	total: 16.4s	remaining: 1m 20s
1690:	learn: 0.2287353	test: 0.2300398	best: 0.2300398 (1690)	total: 16.4s	remaining: 1m 20s
1691:	learn: 0.2287267	test: 0.2300337	best: 0.2300337 (1691)	total: 16.4s	remaining: 1m 20s
1692:	learn: 0.2287140	test: 0.2300222	best: 0.2300222 (1692)	total: 16.4s	remaining: 1m 20s
1693:	learn: 0.2287049	test: 0.2300213	best: 0.2300213 (1693)	total: 16.4s	remaining: 1m 20s
1694:	learn: 0.2287024	test: 0.2300215	best: 0.2300213 (1693)	total: 16.4s	remaining: 1m 20s
1695:	learn: 0.2286956	test: 0.2300210	best: 0.2300210 (1695)	total: 16.4s	remaining: 1m 20s
1696:	learn: 0.2286894	test: 0.2300149	best: 0.2300149 (1696)	total: 16.4s	remaining: 1m 20s
1697:	learn: 0.2286860	test: 0.2300134	best: 0.2300134 (1697)	total: 1

1776:	learn: 0.2279247	test: 0.2296351	best: 0.2296346 (1775)	total: 17.2s	remaining: 1m 19s
1777:	learn: 0.2279161	test: 0.2296217	best: 0.2296217 (1777)	total: 17.2s	remaining: 1m 19s
1778:	learn: 0.2279081	test: 0.2296185	best: 0.2296185 (1778)	total: 17.2s	remaining: 1m 19s
1779:	learn: 0.2278981	test: 0.2296149	best: 0.2296149 (1779)	total: 17.2s	remaining: 1m 19s
1780:	learn: 0.2278941	test: 0.2296120	best: 0.2296120 (1780)	total: 17.2s	remaining: 1m 19s
1781:	learn: 0.2278914	test: 0.2296106	best: 0.2296106 (1781)	total: 17.2s	remaining: 1m 19s
1782:	learn: 0.2278817	test: 0.2296077	best: 0.2296077 (1782)	total: 17.2s	remaining: 1m 19s
1783:	learn: 0.2278791	test: 0.2296080	best: 0.2296077 (1782)	total: 17.2s	remaining: 1m 19s
1784:	learn: 0.2278711	test: 0.2296022	best: 0.2296022 (1784)	total: 17.2s	remaining: 1m 19s
1785:	learn: 0.2278539	test: 0.2295972	best: 0.2295972 (1785)	total: 17.2s	remaining: 1m 19s
1786:	learn: 0.2278359	test: 0.2295978	best: 0.2295972 (1785)	total: 1

1868:	learn: 0.2271448	test: 0.2292557	best: 0.2292511 (1865)	total: 18s	remaining: 1m 18s
1869:	learn: 0.2271339	test: 0.2292470	best: 0.2292470 (1869)	total: 18s	remaining: 1m 18s
1870:	learn: 0.2271319	test: 0.2292465	best: 0.2292465 (1870)	total: 18s	remaining: 1m 18s
1871:	learn: 0.2271291	test: 0.2292475	best: 0.2292465 (1870)	total: 18s	remaining: 1m 18s
1872:	learn: 0.2271242	test: 0.2292459	best: 0.2292459 (1872)	total: 18s	remaining: 1m 18s
1873:	learn: 0.2271030	test: 0.2292428	best: 0.2292428 (1873)	total: 18s	remaining: 1m 18s
1874:	learn: 0.2270929	test: 0.2292401	best: 0.2292401 (1874)	total: 18s	remaining: 1m 18s
1875:	learn: 0.2270882	test: 0.2292404	best: 0.2292401 (1874)	total: 18s	remaining: 1m 18s
1876:	learn: 0.2270806	test: 0.2292371	best: 0.2292371 (1876)	total: 18s	remaining: 1m 18s
1877:	learn: 0.2270651	test: 0.2292389	best: 0.2292371 (1876)	total: 18.1s	remaining: 1m 18s
1878:	learn: 0.2270623	test: 0.2292360	best: 0.2292360 (1878)	total: 18.1s	remaining: 1m

1978:	learn: 0.2261913	test: 0.2287975	best: 0.2287975 (1978)	total: 19s	remaining: 1m 16s
1979:	learn: 0.2261893	test: 0.2287975	best: 0.2287975 (1979)	total: 19s	remaining: 1m 16s
1980:	learn: 0.2261794	test: 0.2287924	best: 0.2287924 (1980)	total: 19s	remaining: 1m 16s
1981:	learn: 0.2261715	test: 0.2287909	best: 0.2287909 (1981)	total: 19s	remaining: 1m 16s
1982:	learn: 0.2261663	test: 0.2287920	best: 0.2287909 (1981)	total: 19s	remaining: 1m 16s
1983:	learn: 0.2261560	test: 0.2287852	best: 0.2287852 (1983)	total: 19s	remaining: 1m 16s
1984:	learn: 0.2261450	test: 0.2287834	best: 0.2287834 (1984)	total: 19s	remaining: 1m 16s
1985:	learn: 0.2261364	test: 0.2287789	best: 0.2287789 (1985)	total: 19s	remaining: 1m 16s
1986:	learn: 0.2261319	test: 0.2287703	best: 0.2287703 (1986)	total: 19.1s	remaining: 1m 16s
1987:	learn: 0.2261305	test: 0.2287710	best: 0.2287703 (1986)	total: 19.1s	remaining: 1m 16s
1988:	learn: 0.2261235	test: 0.2287654	best: 0.2287654 (1988)	total: 19.1s	remaining: 

2089:	learn: 0.2251631	test: 0.2283405	best: 0.2283405 (2089)	total: 20s	remaining: 1m 15s
2090:	learn: 0.2251551	test: 0.2283411	best: 0.2283405 (2089)	total: 20s	remaining: 1m 15s
2091:	learn: 0.2251434	test: 0.2283373	best: 0.2283373 (2091)	total: 20s	remaining: 1m 15s
2092:	learn: 0.2251370	test: 0.2283372	best: 0.2283372 (2092)	total: 20s	remaining: 1m 15s
2093:	learn: 0.2251325	test: 0.2283343	best: 0.2283343 (2093)	total: 20s	remaining: 1m 15s
2094:	learn: 0.2251283	test: 0.2283345	best: 0.2283343 (2093)	total: 20s	remaining: 1m 15s
2095:	learn: 0.2251172	test: 0.2283356	best: 0.2283343 (2093)	total: 20s	remaining: 1m 15s
2096:	learn: 0.2251150	test: 0.2283353	best: 0.2283343 (2093)	total: 20.1s	remaining: 1m 15s
2097:	learn: 0.2250987	test: 0.2283312	best: 0.2283312 (2097)	total: 20.1s	remaining: 1m 15s
2098:	learn: 0.2250916	test: 0.2283295	best: 0.2283295 (2098)	total: 20.1s	remaining: 1m 15s
2099:	learn: 0.2250854	test: 0.2283240	best: 0.2283240 (2099)	total: 20.1s	remaining

2180:	learn: 0.2244279	test: 0.2280805	best: 0.2280805 (2180)	total: 20.8s	remaining: 1m 14s
2181:	learn: 0.2244172	test: 0.2280690	best: 0.2280690 (2181)	total: 20.8s	remaining: 1m 14s
2182:	learn: 0.2244107	test: 0.2280691	best: 0.2280690 (2181)	total: 20.8s	remaining: 1m 14s
2183:	learn: 0.2244044	test: 0.2280650	best: 0.2280650 (2183)	total: 20.8s	remaining: 1m 14s
2184:	learn: 0.2244014	test: 0.2280623	best: 0.2280623 (2184)	total: 20.8s	remaining: 1m 14s
2185:	learn: 0.2243978	test: 0.2280619	best: 0.2280619 (2185)	total: 20.8s	remaining: 1m 14s
2186:	learn: 0.2243836	test: 0.2280606	best: 0.2280606 (2186)	total: 20.9s	remaining: 1m 14s
2187:	learn: 0.2243805	test: 0.2280579	best: 0.2280579 (2187)	total: 20.9s	remaining: 1m 14s
2188:	learn: 0.2243486	test: 0.2280514	best: 0.2280514 (2188)	total: 20.9s	remaining: 1m 14s
2189:	learn: 0.2243444	test: 0.2280500	best: 0.2280500 (2189)	total: 20.9s	remaining: 1m 14s
2190:	learn: 0.2243405	test: 0.2280411	best: 0.2280411 (2190)	total: 2

2270:	learn: 0.2237525	test: 0.2277783	best: 0.2277783 (2270)	total: 21.6s	remaining: 1m 13s
2271:	learn: 0.2237483	test: 0.2277744	best: 0.2277744 (2271)	total: 21.6s	remaining: 1m 13s
2272:	learn: 0.2237468	test: 0.2277727	best: 0.2277727 (2272)	total: 21.6s	remaining: 1m 13s
2273:	learn: 0.2237401	test: 0.2277730	best: 0.2277727 (2272)	total: 21.6s	remaining: 1m 13s
2274:	learn: 0.2237236	test: 0.2277705	best: 0.2277705 (2274)	total: 21.6s	remaining: 1m 13s
2275:	learn: 0.2237147	test: 0.2277642	best: 0.2277642 (2275)	total: 21.7s	remaining: 1m 13s
2276:	learn: 0.2236955	test: 0.2277545	best: 0.2277545 (2276)	total: 21.7s	remaining: 1m 13s
2277:	learn: 0.2236900	test: 0.2277512	best: 0.2277512 (2277)	total: 21.7s	remaining: 1m 13s
2278:	learn: 0.2236835	test: 0.2277465	best: 0.2277465 (2278)	total: 21.7s	remaining: 1m 13s
2279:	learn: 0.2236766	test: 0.2277258	best: 0.2277258 (2279)	total: 21.7s	remaining: 1m 13s
2280:	learn: 0.2236695	test: 0.2277192	best: 0.2277192 (2280)	total: 2

2376:	learn: 0.2230093	test: 0.2274430	best: 0.2274418 (2375)	total: 22.6s	remaining: 1m 12s
2377:	learn: 0.2230012	test: 0.2274384	best: 0.2274384 (2377)	total: 22.6s	remaining: 1m 12s
2378:	learn: 0.2229997	test: 0.2274385	best: 0.2274384 (2377)	total: 22.6s	remaining: 1m 12s
2379:	learn: 0.2229867	test: 0.2274387	best: 0.2274384 (2377)	total: 22.6s	remaining: 1m 12s
2380:	learn: 0.2229719	test: 0.2274331	best: 0.2274331 (2380)	total: 22.7s	remaining: 1m 12s
2381:	learn: 0.2229555	test: 0.2274190	best: 0.2274190 (2381)	total: 22.7s	remaining: 1m 12s
2382:	learn: 0.2229430	test: 0.2274143	best: 0.2274143 (2382)	total: 22.7s	remaining: 1m 12s
2383:	learn: 0.2229372	test: 0.2274122	best: 0.2274122 (2383)	total: 22.7s	remaining: 1m 12s
2384:	learn: 0.2229348	test: 0.2274135	best: 0.2274122 (2383)	total: 22.7s	remaining: 1m 12s
2385:	learn: 0.2229311	test: 0.2274135	best: 0.2274122 (2383)	total: 22.7s	remaining: 1m 12s
2386:	learn: 0.2229216	test: 0.2274174	best: 0.2274122 (2383)	total: 2

2467:	learn: 0.2223750	test: 0.2271615	best: 0.2271612 (2466)	total: 23.4s	remaining: 1m 11s
2468:	learn: 0.2223627	test: 0.2271512	best: 0.2271512 (2468)	total: 23.4s	remaining: 1m 11s
2469:	learn: 0.2223589	test: 0.2271529	best: 0.2271512 (2468)	total: 23.4s	remaining: 1m 11s
2470:	learn: 0.2223578	test: 0.2271516	best: 0.2271512 (2468)	total: 23.4s	remaining: 1m 11s
2471:	learn: 0.2223429	test: 0.2271543	best: 0.2271512 (2468)	total: 23.5s	remaining: 1m 11s
2472:	learn: 0.2223346	test: 0.2271497	best: 0.2271497 (2472)	total: 23.5s	remaining: 1m 11s
2473:	learn: 0.2223203	test: 0.2271359	best: 0.2271359 (2473)	total: 23.5s	remaining: 1m 11s
2474:	learn: 0.2223080	test: 0.2271284	best: 0.2271284 (2474)	total: 23.5s	remaining: 1m 11s
2475:	learn: 0.2223055	test: 0.2271274	best: 0.2271274 (2475)	total: 23.5s	remaining: 1m 11s
2476:	learn: 0.2222967	test: 0.2271318	best: 0.2271274 (2475)	total: 23.5s	remaining: 1m 11s
2477:	learn: 0.2222850	test: 0.2271199	best: 0.2271199 (2477)	total: 2

2558:	learn: 0.2217139	test: 0.2268984	best: 0.2268984 (2558)	total: 24.2s	remaining: 1m 10s
2559:	learn: 0.2217101	test: 0.2268908	best: 0.2268908 (2559)	total: 24.2s	remaining: 1m 10s
2560:	learn: 0.2217052	test: 0.2268937	best: 0.2268908 (2559)	total: 24.2s	remaining: 1m 10s
2561:	learn: 0.2217002	test: 0.2268875	best: 0.2268875 (2561)	total: 24.3s	remaining: 1m 10s
2562:	learn: 0.2216969	test: 0.2268833	best: 0.2268833 (2562)	total: 24.3s	remaining: 1m 10s
2563:	learn: 0.2216913	test: 0.2268836	best: 0.2268833 (2562)	total: 24.3s	remaining: 1m 10s
2564:	learn: 0.2216643	test: 0.2268767	best: 0.2268767 (2564)	total: 24.3s	remaining: 1m 10s
2565:	learn: 0.2216590	test: 0.2268754	best: 0.2268754 (2565)	total: 24.3s	remaining: 1m 10s
2566:	learn: 0.2216564	test: 0.2268739	best: 0.2268739 (2566)	total: 24.3s	remaining: 1m 10s
2567:	learn: 0.2216498	test: 0.2268678	best: 0.2268678 (2567)	total: 24.3s	remaining: 1m 10s
2568:	learn: 0.2216482	test: 0.2268680	best: 0.2268678 (2567)	total: 2

<catboost.core.CatBoostClassifier at 0x7fa8f4e93fd0>

In [37]:
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

         LOC       0.74      0.61      0.67      1026
           O       0.94      0.98      0.96     28405
         ORG       0.79      0.41      0.54      1307
         PER       0.81      0.65      0.72      2590

    accuracy                           0.92     33328
   macro avg       0.82      0.66      0.72     33328
weighted avg       0.91      0.92      0.91     33328



In [33]:
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

         LOC       0.74      0.61      0.67      1026
           O       0.93      0.98      0.96     28405
         ORG       0.77      0.39      0.51      1307
         PER       0.81      0.65      0.72      2590

    accuracy                           0.92     33328
   macro avg       0.81      0.65      0.71     33328
weighted avg       0.91      0.92      0.91     33328



In [42]:
list(zip(test['word'], test['tag'], clf.predict(X_test)[:, 0]))

[('А', 'PER', 'PER'),
 ('.', 'PER', 'O'),
 ('Силуанов', 'PER', 'PER'),
 ('Арассыыйаттан', 'LOC', 'O'),
 ('ИВФ', 'O', 'O'),
 ('-', 'O', 'O'),
 ('га', 'O', 'O'),
 ('управляющайынан', 'O', 'O'),
 ('ананна', 'O', 'O'),
 ('Россия', 'LOC', 'O'),
 ('Президенэ', 'O', 'O'),
 ('Дмитрий', 'PER', 'PER'),
 ('Медведев', 'O', 'PER'),
 ('ыйааҕынан', 'O', 'O'),
 ('РФ', 'LOC', 'LOC'),
 ('үбүн', 'O', 'O'),
 ('миниистирэ', 'O', 'O'),
 ('Антон', 'PER', 'PER'),
 ('Силуанов', 'O', 'PER'),
 ('аан', 'O', 'O'),
 ('дойдутааҕы', 'O', 'O'),
 ('валютнай', 'O', 'O'),
 ('фондаҕа', 'O', 'O'),
 (',', 'O', 'O'),
 ('ону', 'O', 'O'),
 ('тэҥэ', 'O', 'O'),
 ('аан', 'O', 'O'),
 ('дойдутааҕы', 'O', 'O'),
 ('баан', 'O', 'O'),
 ('бөлөҕүн', 'O', 'O'),
 ('тэрилтэлэригэр', 'O', 'O'),
 ('анаата', 'O', 'O'),
 ('.', 'O', 'O'),
 ('"', 'O', 'O'),
 ('РФ', 'LOC', 'LOC'),
 ('үбүн', 'O', 'O'),
 ('миниистирэ', 'O', 'O'),
 ('А', 'PER', 'PER'),
 ('.', 'PER', 'O'),
 ('Силуанова', 'PER', 'PER'),
 ('РФ', 'LOC', 'LOC'),
 ('-', 'O', 'O'),
 ('тан',

In [44]:
list(zip(train['word'], train['tag'], clf.predict(X_train)[:, 0]))

[('Эбии', 'O', 'O'),
 (':', 'O', 'O'),
 ('Д', 'PER', 'PER'),
 ('.', 'PER', 'O'),
 ('Медведев', 'PER', 'PER'),
 ('полиция', 'O', 'O'),
 ('үлэһиттэрин', 'O', 'O'),
 ('аатын', 'O', 'O'),
 ('-', 'O', 'O'),
 ('суолун', 'O', 'O'),
 ('ылан', 'O', 'O'),
 ('РФ', 'O', 'LOC'),
 ('ИДьМ', 'ORG', 'ORG'),
 (',', 'O', 'O'),
 ('ГУВД', 'ORG', 'ORG'),
 (',', 'O', 'O'),
 ('ИДьМ', 'O', 'ORG'),
 ('субъектарыгар', 'O', 'O'),
 ('ИДьМ', 'O', 'ORG'),
 ('салайааччыларын', 'O', 'O'),
 ('ыҥырда', 'O', 'O'),
 ('.', 'O', 'O'),
 ('Россия', 'LOC', 'O'),
 ('Президенэ', 'O', 'O'),
 ('Дмитрий', 'PER', 'PER'),
 ('Медведев', 'PER', 'PER'),
 ('полиция', 'O', 'O'),
 ('үлэһиттэрин', 'O', 'O'),
 ('аатын', 'O', 'O'),
 ('иҥэрдэ', 'O', 'O'),
 ('уонна', 'O', 'O'),
 ('РФ', 'LOC', 'LOC'),
 ('субъектарыгар', 'O', 'O'),
 ('уонна', 'O', 'O'),
 ('федеральнай', 'O', 'O'),
 ('уокуруктарга', 'O', 'O'),
 ('ис', 'O', 'O'),
 ('дьыала', 'O', 'O'),
 ('министерствотын', 'O', 'O'),
 ('14', 'O', 'O'),
 ('управлениеларыгар', 'O', 'O'),
 (',', 'O', 

In [44]:
list(zip(test['word'], clf.predict(test['emb'])[:, 0]))[:100]

[('А', 'O'),
 ('.', 'O'),
 ('Силуанов', 'PER'),
 ('Арассыыйаттан', 'O'),
 ('ИВФ', 'O'),
 ('-', 'O'),
 ('га', 'O'),
 ('управляющайынан', 'O'),
 ('ананна', 'O'),
 ('Россия', 'O'),
 ('Президенэ', 'O'),
 ('Дмитрий', 'PER'),
 ('Медведев', 'PER'),
 ('ыйааҕынан', 'O'),
 ('РФ', 'LOC'),
 ('үбүн', 'O'),
 ('миниистирэ', 'O'),
 ('Антон', 'PER'),
 ('Силуанов', 'PER'),
 ('аан', 'O'),
 ('дойдутааҕы', 'O'),
 ('валютнай', 'O'),
 ('фондаҕа', 'O'),
 (',', 'O'),
 ('ону', 'O'),
 ('тэҥэ', 'O'),
 ('аан', 'O'),
 ('дойдутааҕы', 'O'),
 ('баан', 'O'),
 ('бөлөҕүн', 'O'),
 ('тэрилтэлэригэр', 'O'),
 ('анаата', 'O'),
 ('.', 'O'),
 ('"', 'O'),
 ('РФ', 'LOC'),
 ('үбүн', 'O'),
 ('миниистирэ', 'O'),
 ('А', 'O'),
 ('.', 'O'),
 ('Силуанова', 'O'),
 ('РФ', 'LOC'),
 ('-', 'O'),
 ('тан', 'O'),
 ('аан', 'O'),
 ('дойдутааҕы', 'O'),
 ('валютнай', 'O'),
 ('фондаҕа', 'O'),
 (',', 'O'),
 ('инвестициялары', 'O'),
 ('мэктиэлээһиҥҥэ', 'O'),
 ('уонна', 'O'),
 ('киэҥ', 'O'),
 ('өрүттээх', 'O'),
 ('агентствоҕа', 'O'),
 ('үлэлиир', 'O'),