In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score, recall_score, precision_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import catboost
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE  
from statistics import median, mean

In [4]:
def read_and_drop(filename):
    """Load a candidate-pairs CSV and keep only the modelling columns.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read with ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The frame without the index-artifact and text/position columns,
        with missing ``almost_synonyms`` values replaced by the column
        minimum truncated to an int.

    Raises
    ------
    KeyError
        If any of the dropped columns or ``almost_synonyms`` is absent.
    ValueError
        If ``almost_synonyms`` has no non-missing value (its minimum is
        NaN and cannot be converted to int).
    """
    unused_columns = [
        'Unnamed: 0',
        'Unnamed: 0.1',
        'abbreviation',
        'definition',
        'begin',
        'end',
        'abbreviation_place',
    ]
    df = pd.read_csv(filename).drop(columns=unused_columns)

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the df[...] selection: the in-place form acts on a temporary and
    # does not update `df` when pandas copy-on-write is active.
    fill_value = int(df['almost_synonyms'].min())
    df['almost_synonyms'] = df['almost_synonyms'].fillna(fill_value)
    return df

In [5]:
def print_output(model, y_train, y_train_hat, y_test, y_test_hat):
    """Print a text evaluation summary for one set of predictions.

    Emits, in order: ``model`` itself, the train classification report,
    the test classification report, the ROC AUC computed from the test
    labels and ``y_test_hat``, and the test confusion matrix.

    Parameters
    ----------
    model : object
        Anything printable that identifies the model (an estimator or a
        plain label string).
    y_train, y_train_hat : array-like
        True and predicted labels on the training set.
    y_test, y_test_hat : array-like
        True and predicted labels on the test set.
    """
    rule = '-------------------------------------------------------'

    def heading(title):
        # Section title followed by the horizontal rule.
        print(title, rule, sep='\n')

    print(model)

    heading('Train performance')
    print(classification_report(y_train, y_train_hat))

    heading('Test performance')
    print(classification_report(y_test, y_test_hat))

    heading('Roc_auc score')
    print(roc_auc_score(y_test, y_test_hat))
    print('')

    heading('Confusion matrix')
    print(confusion_matrix(y_test, y_test_hat))

In [6]:
# Load both feature tables through the same cleaning helper: `df` feeds
# training, `df_test` is the separate file used for evaluation.
df, df_test = map(
    read_and_drop,
    (
        "/content/drive/MyDrive/ColabNotebooks/data/potential_pairs_with_features.csv",
        "/content/drive/MyDrive/ColabNotebooks/data/6_potential_pairs_with_features.csv",
    ),
)

In [8]:
# Target is the `is_it_correct` column; every other column is a feature.
y = df['is_it_correct']
X = df.drop(columns='is_it_correct')

In [9]:
# Same feature/target split for the evaluation frame.
y_test = df_test['is_it_correct']
X_test = df_test.drop(columns='is_it_correct')


# Oversample the minority class of the training data only; the test data
# is left untouched. random_state pins the synthetic samples.
# NOTE(review): k_neighbors=1 presumably reflects very few minority rows - confirm.
smote = SMOTE(k_neighbors=1, random_state=95, sampling_strategy='minority')
X_train, y_train = smote.fit_resample(X, y)


In [14]:
# Per-repetition test metrics for the random forest.
rf_rc = []  # ROC AUC (computed from hard label predictions)
rf_ac = []  # accuracy
rf_fs = []  # F1

for i in range(30):
    # Seed each repetition with its index: the 30 forests still differ
    # from one another, but a re-run reproduces the same metrics.
    rf_model = RandomForestClassifier(min_samples_leaf=7, random_state=i)
    rf_model.fit(X_train, y_train)
    # After the loop these hold the predictions of the last fitted forest.
    y_train_hat = rf_model.predict(X_train)
    y_test_hat_1 = rf_model.predict(X_test)

    rf_rc.append(roc_auc_score(y_test, y_test_hat_1))
    rf_ac.append(accuracy_score(y_test, y_test_hat_1))
    rf_fs.append(f1_score(y_test, y_test_hat_1))

In [17]:
# Median of each random-forest metric across the repeated fits.
for label, scores in (('roc auc = ', rf_rc), ('accuracy = ', rf_ac), ('f1 = ', rf_fs)):
    print(label, median(scores))

roc auc =  0.9259570494864613
accuracy =  0.9935233160621761
f1 =  0.7058823529411764


In [20]:
# Per-repetition test metrics for the SVM.
svm_rc = []  # ROC AUC (computed from hard label predictions)
svm_ac = []  # accuracy
svm_fs = []  # F1
pre = []     # precision
rec = []     # recall

# NOTE(review): SVC() with default settings is presumably deterministic, so
# the 30 repetitions likely yield identical scores; kept for parity with
# the other model loops - confirm before trimming.
for i in range(30):
    svm_model = SVC()
    svm_model.fit(X_train, y_train)
    # After the loop these hold the predictions of the last fitted model.
    y_train_hat = svm_model.predict(X_train)
    y_test_hat_3 = svm_model.predict(X_test)

    svm_rc.append(roc_auc_score(y_test, y_test_hat_3))
    svm_ac.append(accuracy_score(y_test, y_test_hat_3))
    svm_fs.append(f1_score(y_test, y_test_hat_3))
    # Precision goes to `pre` and recall to `rec`.
    pre.append(precision_score(y_test, y_test_hat_3))
    rec.append(recall_score(y_test, y_test_hat_3))

In [22]:
# Median of each SVM metric across the repeated fits.
for label, scores in (('roc auc = ', svm_rc), ('accuracy = ', svm_ac), ('f1 = ', svm_fs)):
    print(label, median(scores))

roc auc =  0.9239962651727358
accuracy =  0.9896373056994818
f1 =  0.6


In [10]:
# Keyword arguments unpacked into the CatBoost classifier constructor.
params = {
    'n_estimators': 500,        # number of boosting iterations
    'depth': 5,                 # depth of each tree
    'loss_function': 'Logloss',
    'verbose': 1000,            # logging period, in iterations
    'learning_rate': 0.011871,
}

In [11]:
 model = catboost.CatBoostClassifier(**params)

In [12]:
# Per-repetition test metrics for CatBoost: ROC AUC, accuracy, F1.
gb_rc, gb_ac, gb_fs = [], [], []

for i in range(30):
    # Refit the same `model` object on the resampled training data.
    model.fit(X_train, y_train)

    # After the loop these hold the predictions of the last fit.
    y_train_hat = model.predict(X_train)
    y_test_hat_2 = model.predict(X_test)

    gb_rc.append(roc_auc_score(y_test, y_test_hat_2))
    gb_ac.append(accuracy_score(y_test, y_test_hat_2))
    gb_fs.append(f1_score(y_test, y_test_hat_2))

0:	learn: 0.6679281	total: 47.7ms	remaining: 23.8s
499:	learn: 0.0261084	total: 759ms	remaining: 0us
0:	learn: 0.6679281	total: 1.46ms	remaining: 728ms
499:	learn: 0.0261084	total: 705ms	remaining: 0us
0:	learn: 0.6679281	total: 1.6ms	remaining: 801ms
499:	learn: 0.0261084	total: 701ms	remaining: 0us
0:	learn: 0.6679281	total: 1.5ms	remaining: 750ms
499:	learn: 0.0261084	total: 704ms	remaining: 0us
0:	learn: 0.6679281	total: 1.53ms	remaining: 765ms
499:	learn: 0.0261084	total: 709ms	remaining: 0us
0:	learn: 0.6679281	total: 1.41ms	remaining: 705ms
499:	learn: 0.0261084	total: 697ms	remaining: 0us
0:	learn: 0.6679281	total: 1.48ms	remaining: 740ms
499:	learn: 0.0261084	total: 689ms	remaining: 0us
0:	learn: 0.6679281	total: 1.44ms	remaining: 718ms
499:	learn: 0.0261084	total: 712ms	remaining: 0us
0:	learn: 0.6679281	total: 1.54ms	remaining: 768ms
499:	learn: 0.0261084	total: 689ms	remaining: 0us
0:	learn: 0.6679281	total: 1.61ms	remaining: 803ms
499:	learn: 0.0261084	total: 705ms	remaini

In [13]:
# Median of each CatBoost metric across the repeated fits.
for label, scores in (('roc auc = ', gb_rc), ('accuracy = ', gb_ac), ('f1 = ', gb_fs)):
    print(label, median(scores))

roc auc =  0.9259570494864613
accuracy =  0.9935233160621761
f1 =  0.7058823529411764


In [None]:
# OR-vote ensemble of the random forest and CatBoost: a sample is predicted
# positive when at least one of the two models predicts positive.
y_test_hat_sum = [int(vi or wi) for vi, wi in zip(y_test_hat_1, y_test_hat_2)]

# Build the train-set predictions with the same vote from the same two
# fitted models, so the "Train performance" section describes the ensemble
# rather than whichever single model last wrote `y_train_hat`.
y_train_hat_sum = [
    int(vi or wi)
    for vi, wi in zip(rf_model.predict(X_train), model.predict(X_train))
]

print_output("ensemble", y_train, y_train_hat_sum, y_test, y_test_hat_sum)

ensemble
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1840
           1       0.99      1.00      0.99      1840

    accuracy                           0.99      3680
   macro avg       0.99      0.99      0.99      3680
weighted avg       0.99      0.99      0.99      3680

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       765
           1       0.60      0.86      0.71         7

    accuracy                           0.99       772
   macro avg       0.80      0.93      0.85       772
weighted avg       1.00      0.99      0.99       772

Roc_auc score
-------------------------------------------------------
0.9259570494864613

Confusion matrix
-------------------------------------------------------
[[761   4]
 [  1   6]]
