In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import catboost
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier

In [37]:
def read_and_drop(filename):
    """Load a feature CSV, drop non-feature columns, and fill missing synonym scores.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file; handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame without the columns listed in ``dropped_columns`` and
        with missing ``almost_synonyms`` values replaced by the column minimum
        truncated to an integer.

    Raises
    ------
    KeyError
        If one of the columns to drop, or ``almost_synonyms``, is not present.
    ValueError
        If ``almost_synonyms`` has no non-missing value (its minimum is NaN and
        cannot be converted to ``int``).
    """
    dropped_columns = [
        'Unnamed: 0',
        'Unnamed: 0.1',
        'abbreviation',
        'definition',
        'begin',
        'end',
        'abbreviation_place',
    ]
    df = pd.read_csv(filename).drop(columns=dropped_columns)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the column selection: the chained in-place form is deprecated and does
    # not modify the parent frame when pandas copy-on-write is active.
    synonyms = df['almost_synonyms']
    df['almost_synonyms'] = synonyms.fillna(int(synonyms.min()))
    return df

In [47]:
def print_output(model, y_train, y_train_hat, y_test, y_test_hat):
    """Print a text evaluation report for one set of train/test predictions.

    Parameters
    ----------
    model : object
        Printed as the report header (a fitted estimator or a plain label).
    y_train, y_train_hat : array-like
        True and predicted labels used for the "Train performance" section.
    y_test, y_test_hat : array-like
        True and predicted labels used for the test classification report,
        the ROC AUC score and the confusion matrix.
    """
    rule = '-------------------------------------------------------'

    def emit(title, compute):
        # Headings go out before the metric is computed, so a failing metric
        # still leaves its section title visible in the cell output.
        print(title)
        print(rule)
        print(compute())

    print(model)
    emit('Train performance', lambda: classification_report(y_train, y_train_hat))
    emit('Test performance', lambda: classification_report(y_test, y_test_hat))
    emit('Roc_auc score', lambda: roc_auc_score(y_test, y_test_hat))
    print('')
    emit('Confusion matrix', lambda: confusion_matrix(y_test, y_test_hat))

In [39]:
# Load the frame used for fitting (df) and the frame used for evaluation
# (df_test) through the same cleaning helper.
df, df_test = (
    read_and_drop(csv_path)
    for csv_path in (
        "../dataset/ready_data/potential_pairs_with_features.csv",
        "../dataset/ready_data/6_potential_pairs_with_features.csv",
    )
)

In [40]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,distance,is_it_correct,first_letters,parenthesis,almost_synonyms,lcs_feature
0,14,0,1.5,0,0.43829,1.0
1,13,0,1.5,0,0.394688,1.0
2,14,1,1.0,0,0.437462,1.0
3,13,0,1.0,0,0.395456,1.0
4,12,0,1.0,0,0.343793,1.0


In [41]:
# Separate the predictors from the 'is_it_correct' label for both frames.
X_train = df.drop(columns=['is_it_correct'])
y_train = df['is_it_correct']
X_test = df_test.drop(columns=['is_it_correct'])
y_test = df_test['is_it_correct']

In [48]:
# Baseline: random forest with default hyper-parameters.
# random_state is pinned so the bootstrap samples and per-split feature
# sub-sampling are the same on every run; without it the metrics printed
# below change each time the cell is re-executed.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
# Kept under its own name because the ensemble cell combines it with the
# CatBoost test predictions.
y_test_hat_1 = model.predict(X_test)
print_output(model, y_train, y_train_hat, y_test, y_test_hat_1)

RandomForestClassifier()
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6118
           1       1.00      0.98      0.99        57

    accuracy                           1.00      6175
   macro avg       1.00      0.99      1.00      6175
weighted avg       1.00      1.00      1.00      6175

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2476
           1       0.25      0.06      0.09        18

    accuracy                           0.99      2494
   macro avg       0.62      0.53      0.54      2494
weighted avg       0.99      0.99      0.99      2494

Roc_auc score
-------------------------------------------------------
0.5271719619457907

Confusion matrix
-------------------------------------------------------
[[2473    3

In [50]:
# Keyword arguments forwarded to catboost.CatBoostClassifier.
params = dict(
    n_estimators=500,
    depth=5,
    loss_function='Logloss',
    verbose=1000,
    learning_rate=0.011871,
)

In [51]:
 model = catboost.CatBoostClassifier(**params)

In [53]:
# Fit CatBoost on the training split, predict on both splits, and report.
model.fit(X_train, y_train)
y_train_hat, y_test_hat_2 = model.predict(X_train), model.predict(X_test)
print_output(model, y_train, y_train_hat, y_test, y_test_hat_2)

0:	learn: 0.6571240	total: 3.85ms	remaining: 1.92s
499:	learn: 0.0146692	total: 3.7s	remaining: 0us
<catboost.core.CatBoostClassifier object at 0x0000020094146FD0>
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6118
           1       0.87      0.58      0.69        57

    accuracy                           1.00      6175
   macro avg       0.93      0.79      0.85      6175
weighted avg       0.99      1.00      0.99      6175

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2476
           1       0.67      0.22      0.33        18

    accuracy                           0.99      2494
   macro avg       0.83      0.61      0.67      2494
weighted avg       0.99      0.99      0.99      2494

Roc_auc score
---------------------

In [54]:
# OR-ensemble on the test split: an element is taken from the first model's
# prediction when it is truthy, otherwise from the second model's prediction.
y_test_hat_sum = list(
    map(lambda first, second: int(first or second), y_test_hat_1, y_test_hat_2)
)
# NOTE(review): y_train_hat is whatever the previously executed model cell
# left behind (the CatBoost training predictions in the recorded run), so the
# "Train performance" section printed here is not an ensemble result.
print_output("ensemble", y_train, y_train_hat, y_test, y_test_hat_sum)

ensemble
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6118
           1       0.87      0.58      0.69        57

    accuracy                           1.00      6175
   macro avg       0.93      0.79      0.85      6175
weighted avg       0.99      1.00      0.99      6175

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2476
           1       0.57      0.22      0.32        18

    accuracy                           0.99      2494
   macro avg       0.78      0.61      0.66      2494
weighted avg       0.99      0.99      0.99      2494

Roc_auc score
-------------------------------------------------------
0.610505295279124

Confusion matrix
-------------------------------------------------------
[[2473    3]
 [  14    4]]
