In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import catboost
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE  

In [3]:
def read_and_drop(filename):
    df = pd.read_csv(filename)
    df = df.drop('Unnamed: 0', axis=1)
    df = df.drop('Unnamed: 0.1', axis=1)
    df = df.drop('abbreviation', axis=1)
    df = df.drop('definition', axis=1)
    df = df.drop('begin', axis=1)
    df = df.drop('end', axis=1)
    df = df.drop('abbreviation_place', axis=1)
    df['almost_synonyms'].fillna(int(df['almost_synonyms'].min()), inplace=True)
    df.head()
    return df

In [4]:
def print_output(model, y_train, y_train_hat, y_test, y_test_hat):
    print(model)
    print('Train performance')
    print('-------------------------------------------------------')
    print(classification_report(y_train, y_train_hat))
    print('Test performance')
    print('-------------------------------------------------------')
    print(classification_report(y_test, y_test_hat))
    print('Roc_auc score')
    print('-------------------------------------------------------')
    print(roc_auc_score(y_test, y_test_hat))
    print('')
    print('Confusion matrix')
    print('-------------------------------------------------------')
    print(confusion_matrix(y_test, y_test_hat))

In [5]:
df = read_and_drop("../dataset/ready_data/potential_pairs_with_features.csv")
df_test = read_and_drop("../dataset/ready_data/6_potential_pairs_with_features.csv")

In [6]:
df.head()

Unnamed: 0,distance,is_it_correct,first_letters,parenthesis,almost_synonyms,lcs_feature
0,14,0,1.5,0,0.43829,1.0
1,13,0,1.5,0,0.394688,1.0
2,14,1,1.0,0,0.437462,1.0
3,13,0,1.0,0,0.395456,1.0
4,12,0,1.0,0,0.343793,1.0


In [7]:
X, y = df.drop('is_it_correct', axis=1), df['is_it_correct']

In [8]:
smote = SMOTE(sampling_strategy='minority', random_state=50, k_neighbors=5)
X_train, y_train = smote.fit_resample(X, y)

In [9]:
X_test, y_test = df_test.drop('is_it_correct', axis=1), df_test['is_it_correct']

In [20]:
model = RandomForestClassifier(min_samples_split=6, min_samples_leaf=4)
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
y_test_hat_1 = model.predict(X_test)
print_output(model, y_train, y_train_hat, y_test, y_test_hat_1)

RandomForestClassifier(min_samples_leaf=4, min_samples_split=6)
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      6118
           1       0.98      1.00      0.99      6118

    accuracy                           0.99     12236
   macro avg       0.99      0.99      0.99     12236
weighted avg       0.99      0.99      0.99     12236

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1459
           1       0.32      0.46      0.37        13

    accuracy                           0.99      1472
   macro avg       0.66      0.73      0.68      1472
weighted avg       0.99      0.99      0.99      1472

Roc_auc score
-------------------------------------------------------
0.7263141245320821

Confusion matrix
----------------------------

In [67]:
params = {
    'n_estimators': 500,
    'depth':5,
    'loss_function': 'Logloss',
    'verbose': 1000, 
    'learning_rate':0.011871,
    
}

In [68]:
 model = catboost.CatBoostClassifier(**params)

In [69]:
model.fit(
    X_train, y_train
)
y_train_hat = model.predict(X_train)
y_test_hat_2 = model.predict(X_test)
print_output(model, y_train, y_train_hat, y_test, y_test_hat_2)

0:	learn: 0.6709525	total: 7.79ms	remaining: 3.89s
499:	learn: 0.0642719	total: 4.12s	remaining: 0us
<catboost.core.CatBoostClassifier object at 0x000001B1DF8F4EB8>
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      6118
           1       0.97      1.00      0.98      6118

    accuracy                           0.98     12236
   macro avg       0.98      0.98      0.98     12236
weighted avg       0.98      0.98      0.98     12236

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1459
           1       0.19      0.46      0.27        13

    accuracy                           0.98      1472
   macro avg       0.59      0.72      0.63      1472
weighted avg       0.99      0.98      0.98      1472

Roc_auc score
--------------------

In [70]:
y_test_hat_sum = [int(vi or wi) for vi, wi in zip(y_test_hat_1, y_test_hat_2)]
print_output("ensemble", y_train, y_train_hat, y_test, y_test_hat_sum)

ensemble
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      6118
           1       0.97      1.00      0.98      6118

    accuracy                           0.98     12236
   macro avg       0.98      0.98      0.98     12236
weighted avg       0.98      0.98      0.98     12236

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1459
           1       0.19      0.46      0.27        13

    accuracy                           0.98      1472
   macro avg       0.59      0.72      0.63      1472
weighted avg       0.99      0.98      0.98      1472

Roc_auc score
-------------------------------------------------------
0.7218590182949334

Confusion matrix
-------------------------------------------------------
[[1433   26]
 [   7    6]]


In [26]:
for n in np.arange(1, 10):
    model = catboost.CatBoostClassifier(n_estimators=200, depth=6, learning_rate=n)
    model.fit(X_train, y_train)
    y_train_hat = model.predict(X_train)
    y_test_hat_1 = model.predict(X_test)
    p(y_test_hat_1)


Learning rate set to 0.131314
0:	learn: 0.6171838	total: 5.94ms	remaining: 1.18s
1:	learn: 0.5423084	total: 11.3ms	remaining: 1.12s
2:	learn: 0.4849536	total: 16.6ms	remaining: 1.09s
3:	learn: 0.4481312	total: 22.5ms	remaining: 1.1s
4:	learn: 0.4070687	total: 32.1ms	remaining: 1.25s
5:	learn: 0.3823889	total: 42.7ms	remaining: 1.38s
6:	learn: 0.3619184	total: 53.5ms	remaining: 1.47s
7:	learn: 0.3414989	total: 63.4ms	remaining: 1.52s
8:	learn: 0.3274544	total: 73.6ms	remaining: 1.56s
9:	learn: 0.3129133	total: 83.7ms	remaining: 1.59s
10:	learn: 0.2992215	total: 93.8ms	remaining: 1.61s
11:	learn: 0.2862777	total: 104ms	remaining: 1.63s
12:	learn: 0.2771630	total: 113ms	remaining: 1.63s
13:	learn: 0.2655882	total: 124ms	remaining: 1.65s
14:	learn: 0.2583309	total: 133ms	remaining: 1.64s
15:	learn: 0.2502124	total: 143ms	remaining: 1.65s
16:	learn: 0.2423551	total: 153ms	remaining: 1.65s
17:	learn: 0.2373806	total: 162ms	remaining: 1.64s
18:	learn: 0.2336968	total: 171ms	remaining: 1.63s
1

163:	learn: 0.1231507	total: 1.52s	remaining: 334ms
164:	learn: 0.1230170	total: 1.53s	remaining: 324ms
165:	learn: 0.1229699	total: 1.53s	remaining: 314ms
166:	learn: 0.1228998	total: 1.54s	remaining: 304ms
167:	learn: 0.1227772	total: 1.54s	remaining: 294ms
168:	learn: 0.1226729	total: 1.56s	remaining: 286ms
169:	learn: 0.1226147	total: 1.57s	remaining: 278ms
170:	learn: 0.1225421	total: 1.58s	remaining: 269ms
171:	learn: 0.1224885	total: 1.59s	remaining: 259ms
172:	learn: 0.1224417	total: 1.6s	remaining: 250ms
173:	learn: 0.1223875	total: 1.61s	remaining: 241ms
174:	learn: 0.1223181	total: 1.62s	remaining: 231ms
175:	learn: 0.1222489	total: 1.63s	remaining: 223ms
176:	learn: 0.1221968	total: 1.64s	remaining: 213ms
177:	learn: 0.1221392	total: 1.65s	remaining: 204ms
178:	learn: 0.1220502	total: 1.66s	remaining: 195ms
179:	learn: 0.1219845	total: 1.67s	remaining: 185ms
180:	learn: 0.1218975	total: 1.68s	remaining: 176ms
181:	learn: 0.1218320	total: 1.69s	remaining: 167ms
182:	learn: 0

131:	learn: 0.0811505	total: 1.52s	remaining: 783ms
132:	learn: 0.0806406	total: 1.52s	remaining: 768ms
133:	learn: 0.0804232	total: 1.55s	remaining: 763ms
134:	learn: 0.0803436	total: 1.57s	remaining: 755ms
135:	learn: 0.0802374	total: 1.57s	remaining: 741ms
136:	learn: 0.0801617	total: 1.58s	remaining: 729ms
137:	learn: 0.0800983	total: 1.6s	remaining: 719ms
138:	learn: 0.0796655	total: 1.61s	remaining: 707ms
139:	learn: 0.0793006	total: 1.62s	remaining: 696ms
140:	learn: 0.0791185	total: 1.63s	remaining: 684ms
141:	learn: 0.0789223	total: 1.65s	remaining: 672ms
142:	learn: 0.0787931	total: 1.66s	remaining: 660ms
143:	learn: 0.0784256	total: 1.67s	remaining: 648ms
144:	learn: 0.0783266	total: 1.68s	remaining: 635ms
145:	learn: 0.0779893	total: 1.69s	remaining: 624ms
146:	learn: 0.0776358	total: 1.69s	remaining: 611ms
147:	learn: 0.0775083	total: 1.7s	remaining: 599ms
148:	learn: 0.0772032	total: 1.71s	remaining: 585ms
149:	learn: 0.0771199	total: 1.72s	remaining: 574ms
150:	learn: 0.

93:	learn: 0.0695137	total: 1.25s	remaining: 1.41s
94:	learn: 0.0693251	total: 1.26s	remaining: 1.39s
95:	learn: 0.0690435	total: 1.27s	remaining: 1.37s
96:	learn: 0.0688466	total: 1.27s	remaining: 1.35s
97:	learn: 0.0683707	total: 1.29s	remaining: 1.34s
98:	learn: 0.0681966	total: 1.3s	remaining: 1.32s
99:	learn: 0.0679316	total: 1.31s	remaining: 1.31s
100:	learn: 0.0676366	total: 1.32s	remaining: 1.3s
101:	learn: 0.0674576	total: 1.34s	remaining: 1.29s
102:	learn: 0.0672360	total: 1.35s	remaining: 1.27s
103:	learn: 0.0669657	total: 1.37s	remaining: 1.26s
104:	learn: 0.0668385	total: 1.38s	remaining: 1.25s
105:	learn: 0.0654518	total: 1.39s	remaining: 1.24s
106:	learn: 0.0649111	total: 1.41s	remaining: 1.22s
107:	learn: 0.0646365	total: 1.42s	remaining: 1.21s
108:	learn: 0.0643963	total: 1.43s	remaining: 1.2s
109:	learn: 0.0640143	total: 1.45s	remaining: 1.18s
110:	learn: 0.0638934	total: 1.45s	remaining: 1.17s
111:	learn: 0.0636946	total: 1.47s	remaining: 1.15s
112:	learn: 0.0635067	

53:	learn: 0.0703679	total: 904ms	remaining: 2.44s
54:	learn: 0.0698260	total: 914ms	remaining: 2.41s
55:	learn: 0.0693692	total: 924ms	remaining: 2.38s
56:	learn: 0.0683680	total: 953ms	remaining: 2.39s
57:	learn: 0.0681067	total: 971ms	remaining: 2.38s
58:	learn: 0.0678571	total: 988ms	remaining: 2.36s
59:	learn: 0.0666657	total: 999ms	remaining: 2.33s
60:	learn: 0.0662424	total: 1.01s	remaining: 2.3s
61:	learn: 0.0658164	total: 1.03s	remaining: 2.28s
62:	learn: 0.0651853	total: 1.05s	remaining: 2.28s
63:	learn: 0.0649362	total: 1.07s	remaining: 2.27s
64:	learn: 0.0645725	total: 1.08s	remaining: 2.25s
65:	learn: 0.0644233	total: 1.1s	remaining: 2.23s
66:	learn: 0.0640701	total: 1.11s	remaining: 2.21s
67:	learn: 0.0638330	total: 1.13s	remaining: 2.19s
68:	learn: 0.0632395	total: 1.15s	remaining: 2.18s
69:	learn: 0.0628630	total: 1.16s	remaining: 2.15s
70:	learn: 0.0625700	total: 1.17s	remaining: 2.12s
71:	learn: 0.0619465	total: 1.2s	remaining: 2.13s
72:	learn: 0.0616513	total: 1.21s	

23:	learn: 0.0800675	total: 404ms	remaining: 2.96s
24:	learn: 0.0791062	total: 426ms	remaining: 2.98s
25:	learn: 0.0772535	total: 448ms	remaining: 3s
26:	learn: 0.0764208	total: 466ms	remaining: 2.98s
27:	learn: 0.0757200	total: 484ms	remaining: 2.97s
28:	learn: 0.0743579	total: 502ms	remaining: 2.96s
29:	learn: 0.0722075	total: 521ms	remaining: 2.95s
30:	learn: 0.0719559	total: 537ms	remaining: 2.93s
31:	learn: 0.0706098	total: 550ms	remaining: 2.89s
32:	learn: 0.0700664	total: 568ms	remaining: 2.88s
33:	learn: 0.0694306	total: 583ms	remaining: 2.85s
34:	learn: 0.0687988	total: 598ms	remaining: 2.82s
35:	learn: 0.0683895	total: 614ms	remaining: 2.8s
36:	learn: 0.0674825	total: 633ms	remaining: 2.79s
37:	learn: 0.0669363	total: 653ms	remaining: 2.78s
38:	learn: 0.0664852	total: 670ms	remaining: 2.77s
39:	learn: 0.0661403	total: 688ms	remaining: 2.75s
40:	learn: 0.0656431	total: 706ms	remaining: 2.74s
41:	learn: 0.0651975	total: 723ms	remaining: 2.72s
42:	learn: 0.0647926	total: 741ms	r

191:	learn: 0.0362091	total: 3.19s	remaining: 133ms
192:	learn: 0.0361184	total: 3.21s	remaining: 116ms
193:	learn: 0.0359493	total: 3.21s	remaining: 99.4ms
194:	learn: 0.0357427	total: 3.23s	remaining: 82.9ms
195:	learn: 0.0356220	total: 3.26s	remaining: 66.5ms
196:	learn: 0.0355577	total: 3.27s	remaining: 49.9ms
197:	learn: 0.0354838	total: 3.29s	remaining: 33.3ms
198:	learn: 0.0353888	total: 3.31s	remaining: 16.6ms
199:	learn: 0.0353044	total: 3.33s	remaining: 0us
5   0.7249433226129594
Learning rate set to 0.131314
0:	learn: 0.4678410	total: 33.3ms	remaining: 6.63s
1:	learn: 0.3223330	total: 65.5ms	remaining: 6.48s
2:	learn: 0.2466646	total: 95.6ms	remaining: 6.28s
3:	learn: 0.2024517	total: 117ms	remaining: 5.73s
4:	learn: 0.1763253	total: 140ms	remaining: 5.45s
5:	learn: 0.1475254	total: 160ms	remaining: 5.17s
6:	learn: 0.1331351	total: 180ms	remaining: 4.95s
7:	learn: 0.1211705	total: 197ms	remaining: 4.73s
8:	learn: 0.1137684	total: 215ms	remaining: 4.55s
9:	learn: 0.1062272	to

158:	learn: 0.0334867	total: 3.03s	remaining: 782ms
159:	learn: 0.0333809	total: 3.04s	remaining: 761ms
160:	learn: 0.0332769	total: 3.07s	remaining: 745ms
161:	learn: 0.0331514	total: 3.1s	remaining: 726ms
162:	learn: 0.0330147	total: 3.12s	remaining: 708ms
163:	learn: 0.0329141	total: 3.14s	remaining: 689ms
164:	learn: 0.0328038	total: 3.16s	remaining: 671ms
165:	learn: 0.0327079	total: 3.18s	remaining: 652ms
166:	learn: 0.0325984	total: 3.2s	remaining: 632ms
167:	learn: 0.0325104	total: 3.21s	remaining: 612ms
168:	learn: 0.0324199	total: 3.25s	remaining: 595ms
169:	learn: 0.0323399	total: 3.26s	remaining: 576ms
170:	learn: 0.0322242	total: 3.28s	remaining: 557ms
171:	learn: 0.0321451	total: 3.3s	remaining: 537ms
172:	learn: 0.0320744	total: 3.32s	remaining: 518ms
173:	learn: 0.0320203	total: 3.33s	remaining: 498ms
174:	learn: 0.0319181	total: 3.35s	remaining: 479ms
175:	learn: 0.0317654	total: 3.37s	remaining: 460ms
176:	learn: 0.0316958	total: 3.39s	remaining: 441ms
177:	learn: 0.0

122:	learn: 0.0347972	total: 2.82s	remaining: 1.76s
123:	learn: 0.0345780	total: 2.83s	remaining: 1.74s
124:	learn: 0.0345250	total: 2.86s	remaining: 1.72s
125:	learn: 0.0344388	total: 2.88s	remaining: 1.69s
126:	learn: 0.0343021	total: 2.91s	remaining: 1.67s
127:	learn: 0.0341555	total: 2.93s	remaining: 1.65s
128:	learn: 0.0340632	total: 2.96s	remaining: 1.63s
129:	learn: 0.0339989	total: 2.98s	remaining: 1.6s
130:	learn: 0.0337474	total: 3s	remaining: 1.58s
131:	learn: 0.0335059	total: 3.02s	remaining: 1.55s
132:	learn: 0.0334153	total: 3.05s	remaining: 1.54s
133:	learn: 0.0333046	total: 3.08s	remaining: 1.52s
134:	learn: 0.0331953	total: 3.1s	remaining: 1.49s
135:	learn: 0.0329714	total: 3.12s	remaining: 1.47s
136:	learn: 0.0328573	total: 3.15s	remaining: 1.45s
137:	learn: 0.0327980	total: 3.17s	remaining: 1.42s
138:	learn: 0.0326188	total: 3.19s	remaining: 1.4s
139:	learn: 0.0324821	total: 3.21s	remaining: 1.38s
140:	learn: 0.0323356	total: 3.23s	remaining: 1.35s
141:	learn: 0.0322

84:	learn: 0.0396436	total: 2.36s	remaining: 3.19s
85:	learn: 0.0393823	total: 2.38s	remaining: 3.16s
86:	learn: 0.0392541	total: 2.41s	remaining: 3.13s
87:	learn: 0.0389748	total: 2.43s	remaining: 3.09s
88:	learn: 0.0388059	total: 2.46s	remaining: 3.07s
89:	learn: 0.0386792	total: 2.49s	remaining: 3.04s
90:	learn: 0.0384120	total: 2.52s	remaining: 3.02s
91:	learn: 0.0381889	total: 2.54s	remaining: 2.99s
92:	learn: 0.0379237	total: 2.56s	remaining: 2.95s
93:	learn: 0.0377452	total: 2.58s	remaining: 2.92s
94:	learn: 0.0376754	total: 2.61s	remaining: 2.88s
95:	learn: 0.0373258	total: 2.63s	remaining: 2.85s
96:	learn: 0.0371814	total: 2.67s	remaining: 2.83s
97:	learn: 0.0370491	total: 2.68s	remaining: 2.79s
98:	learn: 0.0368139	total: 2.72s	remaining: 2.77s
99:	learn: 0.0365882	total: 2.75s	remaining: 2.75s
100:	learn: 0.0363780	total: 2.78s	remaining: 2.72s
101:	learn: 0.0361752	total: 2.8s	remaining: 2.69s
102:	learn: 0.0358474	total: 2.83s	remaining: 2.67s
103:	learn: 0.0356586	total: 

47:	learn: 0.0473163	total: 1.97s	remaining: 6.23s
48:	learn: 0.0468775	total: 2s	remaining: 6.17s
49:	learn: 0.0467056	total: 2.04s	remaining: 6.12s
50:	learn: 0.0464696	total: 2.08s	remaining: 6.08s
51:	learn: 0.0461180	total: 2.12s	remaining: 6.04s
52:	learn: 0.0453725	total: 2.16s	remaining: 6s
53:	learn: 0.0451389	total: 2.21s	remaining: 5.97s
54:	learn: 0.0447179	total: 2.25s	remaining: 5.94s
55:	learn: 0.0443574	total: 2.3s	remaining: 5.9s
56:	learn: 0.0440320	total: 2.34s	remaining: 5.87s
57:	learn: 0.0438381	total: 2.38s	remaining: 5.82s
58:	learn: 0.0434062	total: 2.42s	remaining: 5.78s
59:	learn: 0.0431719	total: 2.46s	remaining: 5.74s
60:	learn: 0.0431719	total: 2.47s	remaining: 5.63s
61:	learn: 0.0430540	total: 2.51s	remaining: 5.59s
62:	learn: 0.0428960	total: 2.56s	remaining: 5.56s
63:	learn: 0.0425522	total: 2.6s	remaining: 5.53s
64:	learn: 0.0421886	total: 2.65s	remaining: 5.5s
65:	learn: 0.0419488	total: 2.69s	remaining: 5.46s
66:	learn: 0.0419325	total: 2.71s	remaini

In [39]:
recall = 0.71
precision = 0.97

In [40]:
2*recall*precision/(recall+precision)

0.8198809523809524

In [32]:
f

2.0