In [3]:
import numpy as np 
import pandas as pd 
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/names-data/train.csv


In [4]:
df = pd.read_csv('/kaggle/input/names-data/train.csv')

In [3]:
# new features 

df['len_str_1'] =  df['name_1'].apply(lambda x: len(x))
df['len_words_1'] =  df['name_1'].apply(lambda x: len(x.split(' ')))

df['len_str_2'] =  df['name_2'].apply(lambda x: len(x))
df['len_words_2'] =  df['name_2'].apply(lambda x: len(x.split(' ')))

df['len_intersection'] = df.apply(lambda x:  len(set(x['name_1'].split(' ')) & set(x['name_2'].split(' '))), axis = 1)

In [4]:
list(df)

['pair_id',
 'name_1',
 'name_2',
 'is_duplicate',
 'len_str_1',
 'len_words_1',
 'len_str_2',
 'len_words_2',
 'len_intersection']

In [5]:
X_train, X_val, y_train, y_val = train_test_split(df[['name_1', 'name_2', 'len_intersection', 'len_str_1', 'len_words_1', 'len_str_2', 'len_words_2']], df['is_duplicate'], test_size = 0.2, stratify=df['is_duplicate'] )

In [6]:
X_val, X_test, y_val, y_test = train_test_split(X_val[['name_1', 'name_2', 'len_intersection', 'len_str_1', 'len_words_1', 'len_str_2', 'len_words_2']], y_val, test_size = 0.5, stratify=y_val )

In [7]:
target_col = 'is_duplicate'
text_cols = ['name_1', 'name_2']
num_cols = ['len_intersection', 'len_str_1', 'len_words_1', 'len_str_2', 'len_words_2']

In [8]:
catboost_params = {
    'iterations': 5000,
    'eval_metric': 'Logloss',
    'task_type': 'GPU',
    'early_stopping_rounds': 10,
    'use_best_model': True,
    'verbose': 100
}

train_pool = Pool(
    X_train, 
    y_train, 
    text_features=text_cols,
    feature_names=list(X_train)
)
valid_pool = Pool(
    X_val, 
    y_val, 
    text_features=text_cols,
    feature_names=list(X_val)
)

model = CatBoostClassifier(**catboost_params)
model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.021986
0:	learn: 0.6281377	test: 0.6275299	best: 0.6275299 (0)	total: 32.5ms	remaining: 2m 42s
100:	learn: 0.0180327	test: 0.0165951	best: 0.0165951 (100)	total: 2.55s	remaining: 2m 3s
200:	learn: 0.0142127	test: 0.0132828	best: 0.0132828 (200)	total: 4.91s	remaining: 1m 57s
300:	learn: 0.0126297	test: 0.0120887	best: 0.0120887 (300)	total: 7.7s	remaining: 2m
400:	learn: 0.0116491	test: 0.0113233	best: 0.0113233 (400)	total: 10.1s	remaining: 1m 56s
500:	learn: 0.0109189	test: 0.0107401	best: 0.0107401 (500)	total: 12.8s	remaining: 1m 55s
700:	learn: 0.0098605	test: 0.0099220	best: 0.0099220 (700)	total: 18s	remaining: 1m 50s
800:	learn: 0.0094580	test: 0.0096595	best: 0.0096594 (799)	total: 21s	remaining: 1m 49s
900:	learn: 0.0091381	test: 0.0094260	best: 0.0094260 (900)	total: 23.3s	remaining: 1m 46s
1000:	learn: 0.0088435	test: 0.0092425	best: 0.0092425 (1000)	total: 25.7s	remaining: 1m 42s
1100:	learn: 0.0085742	test: 0.0090492	best: 0.0090492 (1100)	total: 28

<catboost.core.CatBoostClassifier at 0x7f235033c650>

In [9]:
### Val 

y_pred_proba = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)

print('roc_auc:', roc_auc_score(y_val, y_pred_proba))
print('accuracy:', accuracy_score(y_val, y_pred))
print('f1:', f1_score(y_val, y_pred))
print('recall:', recall_score(y_val, y_pred),)
print('precision', precision_score(y_val, y_pred))


roc_auc: 0.9837387572087888
accuracy: 0.9984331686151621
f1: 0.8814589665653496
recall: 0.7923497267759563
precision 0.9931506849315068


In [10]:
### Test 

y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print('roc_auc:', roc_auc_score(y_test, y_pred_proba))
print('accuracy:', accuracy_score(y_test, y_pred))
print('f1:', f1_score(y_test, y_pred))
print('recall:', recall_score(y_test, y_pred),)
print('precision', precision_score(y_test, y_pred))


roc_auc: 0.9894162727764111
accuracy: 0.9983126431240207
f1: 0.8711656441717792
recall: 0.7759562841530054
precision 0.993006993006993


### Catboost + text only

In [13]:
X_train, X_val, y_train, y_val = train_test_split(df[['name_1', 'name_2']], df['is_duplicate'], test_size = 0.2, stratify=df['is_duplicate'] )
X_val, X_test, y_val, y_test = train_test_split(X_val[['name_1', 'name_2']], y_val, test_size = 0.5, stratify=y_val )

train_pool = Pool(
    X_train, 
    y_train, 
    text_features=text_cols,
)
valid_pool = Pool(
    X_val, 
    y_val, 
    text_features=text_cols,
)

model = CatBoostClassifier(**catboost_params)
model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.021986
0:	learn: 0.6313831	test: 0.6306168	best: 0.6306168 (0)	total: 24.6ms	remaining: 2m 3s
100:	learn: 0.0215200	test: 0.0203724	best: 0.0203724 (100)	total: 2.26s	remaining: 1m 49s
200:	learn: 0.0166148	test: 0.0163283	best: 0.0163283 (200)	total: 4.38s	remaining: 1m 44s
300:	learn: 0.0144298	test: 0.0147526	best: 0.0147523 (299)	total: 6.89s	remaining: 1m 47s
400:	learn: 0.0130759	test: 0.0138851	best: 0.0138851 (400)	total: 9.02s	remaining: 1m 43s
500:	learn: 0.0121359	test: 0.0133093	best: 0.0133093 (500)	total: 11.2s	remaining: 1m 40s
600:	learn: 0.0114356	test: 0.0128016	best: 0.0128016 (600)	total: 13.3s	remaining: 1m 37s
700:	learn: 0.0108938	test: 0.0123503	best: 0.0123503 (700)	total: 15.5s	remaining: 1m 34s
800:	learn: 0.0104666	test: 0.0119503	best: 0.0119503 (800)	total: 17.9s	remaining: 1m 33s
900:	learn: 0.0101321	test: 0.0116876	best: 0.0116876 (900)	total: 20s	remaining: 1m 31s
1000:	learn: 0.0098612	test: 0.0115013	best: 0.0115013 (1000)	tota

<catboost.core.CatBoostClassifier at 0x7f2383ab5c50>

In [14]:
print("Validation: ")
y_pred_proba = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)

print('roc_auc:', roc_auc_score(y_val, y_pred_proba))
print('accuracy:', accuracy_score(y_val, y_pred))
print('f1:', f1_score(y_val, y_pred))
print('recall:', recall_score(y_val, y_pred),)
print('precision', precision_score(y_val, y_pred))

print('\n')
print("test: ")

y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print('roc_auc:', roc_auc_score(y_test, y_pred_proba))
print('accuracy:', accuracy_score(y_test, y_pred))
print('f1:', f1_score(y_test, y_pred))
print('recall:', recall_score(y_test, y_pred),)
print('precision', precision_score(y_test, y_pred))


Validation: 
roc_auc: 0.9654349136714641
accuracy: 0.99825238037845
f1: 0.8651162790697675
recall: 0.7622950819672131
precision 1.0


test: 
roc_auc: 0.9741065259719868
accuracy: 0.9981519424691656
f1: 0.8575851393188854
recall: 0.7568306010928961
precision 0.9892857142857143


## Tf-Idf only

In [37]:
corpus1 = np.array(df['name_1'].apply(lambda x: ' '.join(list(x.lower()))))
corpus2 = np.array(df['name_2'].apply(lambda x: ' '.join(list(x.lower()))))

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer='char')
X1 = vectorizer.fit_transform(corpus1)
X2 = vectorizer.transform(corpus2)


In [39]:
X_vect = np.concatenate((X1.toarray(), X2.toarray()), axis=1)
X_vect.shape

(497819, 322)

In [40]:
X_vect = pd.DataFrame(X_vect)
X_vect.columns = [str(x) for x in list(X_vect)]

In [41]:
X_train, X_val, y_train, y_val = train_test_split(pd.DataFrame(X_vect), df['is_duplicate'], test_size = 0.2, stratify=df['is_duplicate'] )
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size = 0.5, stratify=y_val )

In [42]:
target_col = 'is_duplicate'
#text_cols = ['name_1', 'name_2']
num_cols = list(X_train.columns)

In [43]:
catboost_params = {
    'iterations': 5000,
    'eval_metric': 'Logloss',
    'task_type': 'GPU',
    'early_stopping_rounds': 10,
    'use_best_model': True,
    'verbose': 100
}

train_pool = Pool(
    X_train, 
    y_train, 
    feature_names=list(X_train)
)
valid_pool = Pool(
    X_val, 
    y_val, 
    feature_names=list(X_val)
)

model = CatBoostClassifier(**catboost_params)
model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.021986
0:	learn: 0.6409093	test: 0.6409169	best: 0.6409169 (0)	total: 14.8ms	remaining: 1m 14s
100:	learn: 0.0395953	test: 0.0397391	best: 0.0397391 (100)	total: 1.36s	remaining: 1m 5s
200:	learn: 0.0337631	test: 0.0342057	best: 0.0342057 (200)	total: 2.71s	remaining: 1m 4s
300:	learn: 0.0313488	test: 0.0318105	best: 0.0318105 (300)	total: 4.09s	remaining: 1m 3s
400:	learn: 0.0291155	test: 0.0296950	best: 0.0296950 (400)	total: 5.44s	remaining: 1m 2s
500:	learn: 0.0271546	test: 0.0278673	best: 0.0278673 (500)	total: 7.16s	remaining: 1m 4s
600:	learn: 0.0257760	test: 0.0265161	best: 0.0265161 (600)	total: 8.72s	remaining: 1m 3s
700:	learn: 0.0244227	test: 0.0252068	best: 0.0252068 (700)	total: 10.8s	remaining: 1m 5s
800:	learn: 0.0232663	test: 0.0240560	best: 0.0240560 (800)	total: 12.1s	remaining: 1m 3s
900:	learn: 0.0223418	test: 0.0231677	best: 0.0231677 (900)	total: 13.5s	remaining: 1m 1s
1000:	learn: 0.0213405	test: 0.0222376	best: 0.0222376 (1000)	total: 14.

<catboost.core.CatBoostClassifier at 0x7fd9019aa090>

In [44]:
### Val 

y_pred_proba = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)

print('roc_auc:', roc_auc_score(y_val, y_pred_proba))
print('accuracy:', accuracy_score(y_val, y_pred))
print('f1:', f1_score(y_val, y_pred))
print('recall:', recall_score(y_val, y_pred),)
print('precision', precision_score(y_val, y_pred))


roc_auc: 0.9894825662093912
accuracy: 0.9969868627214656
f1: 0.7448979591836735
recall: 0.5983606557377049
precision 0.9864864864864865


In [45]:
### Test 

y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print('roc_auc:', roc_auc_score(y_test, y_pred_proba))
print('accuracy:', accuracy_score(y_test, y_pred))
print('f1:', f1_score(y_test, y_pred))
print('recall:', recall_score(y_test, y_pred),)
print('precision', precision_score(y_test, y_pred))


roc_auc: 0.9813619800582276
accuracy: 0.9967859869028967
f1: 0.7269624573378839
recall: 0.5819672131147541
precision 0.9681818181818181


### Добавить сгенеренные фичи

In [46]:
X_vect = pd.concat([df[['len_intersection', 'len_str_1', 'len_words_1', 'len_str_2', 'len_words_2']], X_vect], axis = 1)

In [47]:
X_train, X_val, y_train, y_val = train_test_split((X_vect), df['is_duplicate'], test_size = 0.2, stratify=df['is_duplicate'] )
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size = 0.5, stratify=y_val )

In [48]:
target_col = 'is_duplicate'
#text_cols = ['name_1', 'name_2']
num_cols = list(X_train)

In [49]:
catboost_params = {
    'iterations': 5000,
    'eval_metric': 'Logloss',
    'task_type': 'GPU',
    'early_stopping_rounds': 10,
    'use_best_model': True,
    'verbose': 100
}

train_pool = Pool(
    X_train, 
    y_train, 
    feature_names=list(X_train)
)
valid_pool = Pool(
    X_val, 
    y_val, 
    feature_names=list(X_val)
)

model = CatBoostClassifier(**catboost_params)
model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.021986
0:	learn: 0.6411990	test: 0.6412102	best: 0.6412102 (0)	total: 14.7ms	remaining: 1m 13s
100:	learn: 0.0370806	test: 0.0376483	best: 0.0376483 (100)	total: 1.37s	remaining: 1m 6s
200:	learn: 0.0312080	test: 0.0319514	best: 0.0319514 (200)	total: 3.18s	remaining: 1m 15s
300:	learn: 0.0282800	test: 0.0289579	best: 0.0289579 (300)	total: 4.57s	remaining: 1m 11s
400:	learn: 0.0259527	test: 0.0265747	best: 0.0265747 (400)	total: 5.93s	remaining: 1m 8s
500:	learn: 0.0240460	test: 0.0246476	best: 0.0246476 (500)	total: 7.29s	remaining: 1m 5s
600:	learn: 0.0226458	test: 0.0232883	best: 0.0232883 (600)	total: 8.67s	remaining: 1m 3s
700:	learn: 0.0215427	test: 0.0223170	best: 0.0223170 (700)	total: 10s	remaining: 1m 1s
800:	learn: 0.0205853	test: 0.0215062	best: 0.0215062 (800)	total: 11.4s	remaining: 59.8s
900:	learn: 0.0197724	test: 0.0208561	best: 0.0208561 (900)	total: 13s	remaining: 59.2s
1000:	learn: 0.0189478	test: 0.0201575	best: 0.0201575 (1000)	total: 14.6s

<catboost.core.CatBoostClassifier at 0x7fd88fb0abd0>

In [50]:
### Val 

y_pred_proba = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)

print('roc_auc:', roc_auc_score(y_val, y_pred_proba))
print('accuracy:', accuracy_score(y_val, y_pred))
print('f1:', f1_score(y_val, y_pred))
print('recall:', recall_score(y_val, y_pred),)
print('precision', precision_score(y_val, y_pred))


roc_auc: 0.9761088751591264
accuracy: 0.9964846731750432
f1: 0.6891651865008881
recall: 0.5300546448087432
precision 0.9847715736040609


In [51]:
X_val

Unnamed: 0,len_intersection,len_str_1,len_words_1,len_str_2,len_words_2,0,1,2,3,4,...,312,313,314,315,316,317,318,319,320,321
314031,0,17,3,12,2,0.894622,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
157333,0,24,3,52,8,0.913806,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
149519,0,9,2,20,3,0.747379,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
430613,0,17,4,18,2,0.881042,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62692,1,35,4,25,4,0.914073,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394832,3,36,5,31,5,0.926816,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159479,0,3,1,50,7,0.520402,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35257,0,23,3,29,3,0.910716,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
359496,1,28,4,23,2,0.924496,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Levinstien distance approach

In [15]:
from nltk import edit_distance

In [28]:
def apply_levenstein(first_word, second_word, n_to_success):
    res = edit_distance(first_word.lower(), second_word.lower())
    
    if res > n_to_success:
        return 0
    
    return 1

In [29]:
df['lev_0'] = df.apply(lambda x: apply_levenstein(x['name_1'], x['name_2'], 0), axis = 1)
df['lev_1'] = df.apply(lambda x: apply_levenstein(x['name_1'], x['name_2'], 1), axis = 1)
df['lev_2'] = df.apply(lambda x: apply_levenstein(x['name_1'], x['name_2'], 2), axis = 1)

In [30]:
for i in ['lev_0', 'lev_1', 'lev_2']:
    print(i)
    print('accuracy:', accuracy_score(df['is_duplicate'], df[i]))
    print('f1:', f1_score(df['is_duplicate'], df[i]))
    print('recall:', recall_score(df['is_duplicate'], df[i]))
    print('precision', precision_score(df['is_duplicate'], df[i]))
    print('\n')

lev_0
accuracy: 0.9927122106629116
f1: 0.01626898047722343
recall: 0.008201202843083653
precision 1.0


lev_1
accuracy: 0.9926579740829499
f1: 0.028700504916290195
recall: 0.014762165117550574
precision 0.5142857142857142


lev_2
accuracy: 0.9925374483497014
f1: 0.05108556832694764
recall: 0.02733734281027884
precision 0.38910505836575876




### Embeddings + CatBoost

In [5]:
pip install compress-fasttext

[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
import compress_fasttext
small_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
'https://github.com/avidale/compress-fasttext/releases/download/v0.0.4/cc.en.300.compressed.bin')


In [7]:
name1_ft = small_model[df.name_1.values]
name2_ft = small_model[df.name_2.values]

In [50]:
name1_ft.shape

(497819, 300)

In [52]:
df_ft = pd.concat([pd.DataFrame(name1_ft), pd.DataFrame(name2_ft)], axis = 1)
df_ft.columns = [str(x) for x in range(df_ft.shape[1])]

In [59]:
X_train, X_val, y_train, y_val = train_test_split(df_ft, df['is_duplicate'], test_size = 0.2, stratify=df['is_duplicate'] )
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size = 0.5, stratify=y_val )

In [60]:
target_col = 'is_duplicate'
num_cols = list(df_ft)

In [61]:
catboost_params = {
    'iterations': 5000,
    'eval_metric': 'Logloss',
    'task_type': 'GPU',
    'early_stopping_rounds': 10,
    'use_best_model': True,
    'verbose': 100
}

train_pool = Pool(
    X_train, 
    y_train, 
    feature_names=list(X_train)
)
valid_pool = Pool(
    X_val, 
    y_val, 
    feature_names=list(X_val)
)

model = CatBoostClassifier(**catboost_params)
model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.021986
0:	learn: 0.6393088	test: 0.6392835	best: 0.6392835 (0)	total: 27.1ms	remaining: 2m 15s
100:	learn: 0.0330894	test: 0.0327474	best: 0.0327474 (100)	total: 2.96s	remaining: 2m 23s
200:	learn: 0.0275833	test: 0.0273519	best: 0.0273519 (200)	total: 5.16s	remaining: 2m 3s
300:	learn: 0.0244647	test: 0.0244216	best: 0.0244216 (300)	total: 7.32s	remaining: 1m 54s
400:	learn: 0.0219372	test: 0.0221254	best: 0.0221254 (400)	total: 9.54s	remaining: 1m 49s
500:	learn: 0.0202025	test: 0.0206259	best: 0.0206259 (500)	total: 12s	remaining: 1m 47s
600:	learn: 0.0187131	test: 0.0194185	best: 0.0194185 (600)	total: 14.2s	remaining: 1m 43s
700:	learn: 0.0175898	test: 0.0185120	best: 0.0185120 (700)	total: 16.3s	remaining: 1m 39s
800:	learn: 0.0166560	test: 0.0177715	best: 0.0177715 (800)	total: 18.4s	remaining: 1m 36s
900:	learn: 0.0158177	test: 0.0171191	best: 0.0171191 (900)	total: 20.5s	remaining: 1m 33s
1000:	learn: 0.0151066	test: 0.0165664	best: 0.0165664 (1000)	tota

<catboost.core.CatBoostClassifier at 0x7f230d221e10>

In [62]:
print("Validation: ")
y_pred_proba = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)

print('roc_auc:', roc_auc_score(y_val, y_pred_proba))
print('accuracy:', accuracy_score(y_val, y_pred))
print('f1:', f1_score(y_val, y_pred))
print('recall:', recall_score(y_val, y_pred),)
print('precision', precision_score(y_val, y_pred))

print('\n')
print("test: ")

y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print('roc_auc:', roc_auc_score(y_test, y_pred_proba))
print('accuracy:', accuracy_score(y_test, y_pred))
print('f1:', f1_score(y_test, y_pred))
print('recall:', recall_score(y_test, y_pred),)
print('precision', precision_score(y_test, y_pred))


Validation: 
roc_auc: 0.9759899727174048
accuracy: 0.9973685267767466
f1: 0.7834710743801654
recall: 0.6475409836065574
precision 0.9916317991631799


test: 
roc_auc: 0.9739478419414167
accuracy: 0.996926599975895
f1: 0.7366609294320139
recall: 0.5846994535519126
precision 0.9953488372093023


### Embeddings distance + KNN or Catboost

Катбуст

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
from operator import itemgetter
from tqdm.notebook import tqdm
    
def get_cosine_similarity(feature_vec_1, feature_vec_2):    
    return cosine_similarity(feature_vec_1.reshape(1, -1), feature_vec_2.reshape(1, -1))[0][0]


In [88]:
def search_most_similar(word, all_words, N):
    res = {}
    for i in tqdm(all_words):
        res[tuple(i)] = get_cosine_similarity(word, i)
        
    return dict(sorted(res.items(), key = itemgetter(1), reverse = True)[:N]).keys()


In [90]:
#search_most_similar(name1_ft[1], name2_ft, 5)

In [9]:
name_diff = name1_ft - name2_ft

In [10]:
X_train, X_val, y_train, y_val = train_test_split(pd.DataFrame(name_diff), df['is_duplicate'], test_size = 0.2, stratify=df['is_duplicate'] )
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size = 0.5, stratify=y_val )

target_col = 'is_duplicate'
num_cols = list(X_train)

catboost_params = {
    'iterations': 5000,
    'eval_metric': 'Logloss',
    'task_type': 'GPU',
    'early_stopping_rounds': 10,
    'use_best_model': True,
    'verbose': 100
}

train_pool = Pool(
    X_train, 
    y_train, 
    feature_names=list(X_train)
)
valid_pool = Pool(
    X_val, 
    y_val, 
    feature_names=list(X_val)
)

model = CatBoostClassifier(**catboost_params)
model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.021986
0:	learn: 0.6406860	test: 0.6407272	best: 0.6407272 (0)	total: 26.4ms	remaining: 2m 12s
100:	learn: 0.0421585	test: 0.0428264	best: 0.0428264 (100)	total: 1.59s	remaining: 1m 17s
200:	learn: 0.0387629	test: 0.0401272	best: 0.0401272 (200)	total: 3.15s	remaining: 1m 15s
300:	learn: 0.0364541	test: 0.0385203	best: 0.0385203 (300)	total: 4.75s	remaining: 1m 14s
400:	learn: 0.0345786	test: 0.0371725	best: 0.0371725 (400)	total: 6.4s	remaining: 1m 13s
500:	learn: 0.0329875	test: 0.0361133	best: 0.0361133 (500)	total: 8.45s	remaining: 1m 15s
600:	learn: 0.0315914	test: 0.0352342	best: 0.0352342 (600)	total: 10.1s	remaining: 1m 13s
700:	learn: 0.0303160	test: 0.0344131	best: 0.0344131 (700)	total: 11.7s	remaining: 1m 11s
800:	learn: 0.0292162	test: 0.0337173	best: 0.0337173 (800)	total: 13.3s	remaining: 1m 9s
900:	learn: 0.0281189	test: 0.0330346	best: 0.0330346 (900)	total: 15s	remaining: 1m 8s
1000:	learn: 0.0271968	test: 0.0324842	best: 0.0324842 (1000)	total:

<catboost.core.CatBoostClassifier at 0x7fb754000fd0>

In [11]:
print("Validation: ")
y_pred_proba = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)

print('roc_auc:', roc_auc_score(y_val, y_pred_proba))
print('accuracy:', accuracy_score(y_val, y_pred))
print('f1:', f1_score(y_val, y_pred))
print('recall:', recall_score(y_val, y_pred),)
print('precision', precision_score(y_val, y_pred))

print('\n')
print("test: ")

y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print('roc_auc:', roc_auc_score(y_test, y_pred_proba))
print('accuracy:', accuracy_score(y_test, y_pred))
print('f1:', f1_score(y_test, y_pred))
print('recall:', recall_score(y_test, y_pred),)
print('precision', precision_score(y_test, y_pred))


Validation: 
roc_auc: 0.9473421143657372
accuracy: 0.9937929372062191
f1: 0.2797202797202797
recall: 0.16393442622950818
precision 0.9523809523809523


test: 
roc_auc: 0.9323672350982979
accuracy: 0.9941746012615001
f1: 0.3468468468468469
recall: 0.2103825136612022
precision 0.9871794871794872


KNN

In [12]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)


KNeighborsClassifier()

In [13]:
print("Validation: ")
y_pred = model.predict(X_val)

print('accuracy:', accuracy_score(y_val, y_pred))
print('f1:', f1_score(y_val, y_pred))
print('recall:', recall_score(y_val, y_pred),)
print('precision', precision_score(y_val, y_pred))

print('\n')
print("test: ")

y_pred = model.predict(X_test)

print('accuracy:', accuracy_score(y_test, y_pred))
print('f1:', f1_score(y_test, y_pred))
print('recall:', recall_score(y_test, y_pred),)
print('precision', precision_score(y_test, y_pred))


Validation: 
accuracy: 0.9939134626973605
f1: 0.4399260628465804
recall: 0.3251366120218579
precision 0.68


test: 
accuracy: 0.9944960025712105
f1: 0.49446494464944646
recall: 0.366120218579235
precision 0.7613636363636364
