In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import catboost
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE  

In [2]:
def read_and_drop(filename):
    """Load a pair-feature CSV and strip the columns not used for modelling.

    The file is read with ``pd.read_csv``; the columns ``Unnamed: 0``,
    ``Unnamed: 0.1``, ``abbreviation``, ``definition``, ``begin``, ``end`` and
    ``abbreviation_place`` are removed; missing ``almost_synonyms`` values are
    replaced by the column minimum truncated to an integer.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns, with no missing values in ``almost_synonyms``.

    Raises
    ------
    KeyError
        If one of the columns listed above, or ``almost_synonyms``, is absent.
    ValueError
        If ``almost_synonyms`` has no non-missing value (its minimum is NaN,
        which cannot be converted to ``int``).
    """
    unused_columns = [
        'Unnamed: 0',
        'Unnamed: 0.1',
        'abbreviation',
        'definition',
        'begin',
        'end',
        'abbreviation_place',
    ]
    df = pd.read_csv(filename).drop(columns=unused_columns)

    # int() truncates toward zero, so e.g. a minimum of 2.7 fills with 2.
    fill_value = int(df['almost_synonyms'].min())

    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: that chained in-place form triggers a FutureWarning on
    # pandas 2.x and, with Copy-on-Write enabled, only fills a temporary and
    # leaves `df` untouched.
    df['almost_synonyms'] = df['almost_synonyms'].fillna(fill_value)
    return df

In [3]:
def print_output(model, y_train, y_train_hat, y_test, y_test_hat):
    """Print an evaluation summary for one model to stdout.

    Output order: ``model`` itself, the classification report for the train
    pair, the classification report for the test pair, the ROC AUC of the test
    pair, an empty line, and the test confusion matrix. Each section is
    preceded by its title and a dashed rule.

    Parameters
    ----------
    model : object
        Printed as-is on the first line (an estimator or a plain label).
    y_train, y_train_hat : array-like
        True and predicted labels for the training data.
    y_test, y_test_hat : array-like
        True and predicted labels for the test data.

    Returns
    -------
    None

    Notes
    -----
    The ROC AUC is computed from whatever is supplied as ``y_test_hat``; when
    that holds hard class labels rather than scores, the figure reflects that
    single labelling only.
    """
    rule = '-------------------------------------------------------'

    def emit(title, metric, truth, predicted):
        # Title and rule go out before the metric is evaluated, so a failing
        # metric still leaves its heading in the output.
        print(title, rule, sep='\n')
        print(metric(truth, predicted))

    print(model)
    emit('Train performance', classification_report, y_train, y_train_hat)
    emit('Test performance', classification_report, y_test, y_test_hat)
    emit('Roc_auc score', roc_auc_score, y_test, y_test_hat)
    print('')
    emit('Confusion matrix', confusion_matrix, y_test, y_test_hat)

In [4]:
# Load the training pairs (`df`) and the separate evaluation pairs (`df_test`);
# both files go through the same read_and_drop clean-up.
df, df_test = (
    read_and_drop(csv_path)
    for csv_path in (
        "../dataset/ready_data/potential_pairs_with_features.csv",
        "../dataset/ready_data/6_potential_pairs_with_features.csv",
    )
)

In [5]:
# Preview the first five rows of the cleaned training frame.
df.head(n=5)

Unnamed: 0,distance,is_it_correct,first_letters,parenthesis,almost_synonyms,lcs_feature
0,14,0,1.5,0,0.43829,1.0
1,13,0,1.5,0,0.394688,1.0
2,14,1,1.0,0,0.437462,1.0
3,13,0,1.0,0,0.395456,1.0
4,12,0,1.0,0,0.343793,1.0


In [64]:
# Separate the label column from the remaining feature columns.
y = df['is_it_correct']
X = df.drop(columns='is_it_correct')

In [65]:
# Oversample the minority class so both labels are equally represented in the
# data the models are fitted on. The seed is fixed for repeatable resampling.
smote = SMOTE(
    sampling_strategy='minority',
    k_neighbors=5,
    random_state=50,
)
X_train, y_train = smote.fit_resample(X, y)

In [26]:
# Hold-out data: same label/feature split as the training frame, but taken
# from the separately loaded `df_test` and left un-resampled.
y_test = df_test['is_it_correct']
X_test = df_test.drop(columns='is_it_correct')

In [66]:
# Model 1: random forest fitted on the resampled training data and scored on
# the hold-out frame.
# random_state is pinned (same value as the SMOTE step) so that re-running the
# notebook reproduces the metrics; an unseeded forest gives slightly different
# trees, and therefore different scores, on every run.
model = RandomForestClassifier(random_state=50)
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
# Kept under its own name because the ensemble cell combines it later.
y_test_hat_1 = model.predict(X_test)
print_output(model, y_train, y_train_hat, y_test, y_test_hat_1)

RandomForestClassifier()
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6118
           1       1.00      1.00      1.00      6118

    accuracy                           1.00     12236
   macro avg       1.00      1.00      1.00     12236
weighted avg       1.00      1.00      1.00     12236

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1459
           1       0.32      0.46      0.37        13

    accuracy                           0.99      1472
   macro avg       0.66      0.73      0.68      1472
weighted avg       0.99      0.99      0.99      1472

Roc_auc score
-------------------------------------------------------
0.7263141245320821

Confusion matrix
-------------------------------------------------------
[[1446   13

In [67]:
# Hyper-parameters forwarded to catboost.CatBoostClassifier via **params.
params = dict(
    n_estimators=500,
    depth=5,
    loss_function='Logloss',
    # With 500 iterations the training log shows only the first and the last
    # iteration.
    verbose=1000,
    learning_rate=0.011871,
)

In [68]:
 # Model 2: an unfitted CatBoost classifier configured from `params`.
 # This rebinds `model`, so the random forest is no longer reachable by name.
 # NOTE(review): this cell carries a one-space leading indent; IPython accepts
 # it, but consider removing it if the notebook is ever exported as a script.
 model = catboost.CatBoostClassifier(
     **params
 )

In [69]:
# Fit CatBoost on the resampled training data, then score both splits.
model.fit(X_train, y_train)
# Kept under its own name because the ensemble cell combines it later.
y_test_hat_2 = model.predict(X_test)
# Overwrites the train predictions produced by the random-forest cell.
y_train_hat = model.predict(X_train)
print_output(model, y_train, y_train_hat, y_test, y_test_hat_2)

0:	learn: 0.6709525	total: 7.79ms	remaining: 3.89s
499:	learn: 0.0642719	total: 4.12s	remaining: 0us
<catboost.core.CatBoostClassifier object at 0x000001B1DF8F4EB8>
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      6118
           1       0.97      1.00      0.98      6118

    accuracy                           0.98     12236
   macro avg       0.98      0.98      0.98     12236
weighted avg       0.98      0.98      0.98     12236

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1459
           1       0.19      0.46      0.27        13

    accuracy                           0.98      1472
   macro avg       0.59      0.72      0.63      1472
weighted avg       0.99      0.98      0.98      1472

Roc_auc score
--------------------

In [70]:
# OR-vote ensemble on the hold-out set: a pair is labelled positive when
# either the random forest (y_test_hat_1) or CatBoost (y_test_hat_2) says so.
y_test_hat_sum = list(
    map(
        lambda rf_label, cb_label: int(rf_label or cb_label),
        y_test_hat_1,
        y_test_hat_2,
    )
)
# NOTE(review): y_train_hat is not an ensemble prediction; it is whatever the
# most recently executed model cell left behind (CatBoost in notebook order),
# so the "Train performance" section printed here describes that model only.
print_output("ensemble", y_train, y_train_hat, y_test, y_test_hat_sum)

ensemble
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      6118
           1       0.97      1.00      0.98      6118

    accuracy                           0.98     12236
   macro avg       0.98      0.98      0.98     12236
weighted avg       0.98      0.98      0.98     12236

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1459
           1       0.19      0.46      0.27        13

    accuracy                           0.98      1472
   macro avg       0.59      0.72      0.63      1472
weighted avg       0.99      0.98      0.98      1472

Roc_auc score
-------------------------------------------------------
0.7218590182949334

Confusion matrix
-------------------------------------------------------
[[1433   26]
 [   7    6]]
