In [270]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import catboost
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

In [2]:
def read_and_drop(filename):
    df = pd.read_csv(filename)
    df = df.drop('Unnamed: 0', axis=1)
    df = df.drop('Unnamed: 0.1', axis=1)
    df = df.drop('token', axis=1)
    df.head()
    return df

In [3]:
def print_output(model, y_train, y_train_hat, y_test, y_test_hat):
    print(model)
    print('Train performance')
    print('-------------------------------------------------------')
    print(classification_report(y_train, y_train_hat))
    print('Test performance')
    print('-------------------------------------------------------')
    print(classification_report(y_test, y_test_hat))
    print('Roc_auc score')
    print('-------------------------------------------------------')
    print(roc_auc_score(y_test, y_test_hat))
    print('')
    print('Confusion matrix')
    print('-------------------------------------------------------')
    print(confusion_matrix(y_test, y_test_hat))

In [4]:
df = read_and_drop(r'../dataset/ready_data/merges_data_without_stopwords_with_features.csv')
df_test = read_and_drop(r'../dataset/ready_data/6_without_stopwords_with_features.csv')

In [5]:
X_train, y_train = df.drop('is_abbreviation', axis=1), df['is_abbreviation']
X_test, y_test = df_test.drop('is_abbreviation', axis=1), df_test['is_abbreviation']

In [279]:
rc = []
ac = []
fs = []

for i in range(20):
    svm_model = SVC()
    svm_model.fit(X_train, y_train)
    y_train_hat = svm_model.predict(X_train)
    y_test_hat_1 = svm_model.predict(X_test)
    
    rc.append(roc_auc_score(y_test, y_test_hat_1))
    ac.append(accuracy_score(y_test, y_test_hat_1))
    fs.append(f1_score(y_test, y_test_hat_1))

In [280]:
print('roc auc = ', median(rc))
print('accuracy = ', median(ac))
print('f1 = ', median(fs))

roc auc =  0.8438446659186147
accuracy =  0.9773109243697479
f1 =  0.7923076923076923


In [281]:
rf_rc = []
rf_ac = []
rf_fs = []

for i in range(20):
    rf_model = RandomForestClassifier(min_samples_leaf=7)
    rf_model.fit(X_train, y_train)
    y_train_hat = rf_model.predict(X_train)
    y_test_hat_2 = rf_model.predict(X_test)
    
    rf_rc.append(roc_auc_score(y_test, y_test_hat_2))
    rf_ac.append(accuracy_score(y_test, y_test_hat_2))
    rf_fs.append(f1_score(y_test, y_test_hat_2))

In [282]:
print('roc auc = ', mean(rf_rc))
print('accuracy = ', mean(rf_ac))
print('f1 = ', mean(rf_fs))

roc auc =  0.9076917986035696
accuracy =  0.9845798319327731
f1 =  0.8693783126219244


In [283]:
gb_rc = []
gb_ac = []
gb_fs = []

for i in range(20):
    gb_model = xgb.XGBClassifier(max_depth=1)
    gb_model.fit(X_train, y_train)
    y_train_hat = gb_model.predict(X_train)
    y_test_hat_3 = gb_model.predict(X_test)

    gb_rc.append(roc_auc_score(y_test, y_test_hat_3))
    gb_ac.append(accuracy_score(y_test, y_test_hat_3))
    gb_fs.append(f1_score(y_test, y_test_hat_3))

  "memory consumption")




  "memory consumption")
  "memory consumption")




  "memory consumption")
  "memory consumption")




  "memory consumption")
  "memory consumption")




  "memory consumption")
  "memory consumption")




  "memory consumption")




  "memory consumption")
  "memory consumption")




  "memory consumption")
  "memory consumption")




  "memory consumption")
  "memory consumption")




  "memory consumption")
  "memory consumption")
  "memory consumption")




  "memory consumption")


In [284]:
print('roc auc = ', mean(rf_rc))
print('accuracy = ', mean(rf_ac))
print('f1 = ', mean(rf_fs))

roc auc =  0.8530153210255732
accuracy =  0.976890756302521
f1 =  0.7940074906367042


In [29]:
y_test_hat_sum = [int(vi or wi or ki) for vi, wi, ki in zip(y_test_hat_1, y_test_hat_2, y_test_hat_3)]
print_output("ensemble", y_train, y_train_hat, y_test, y_test_hat_sum)

ensemble
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     11175
           1       0.92      0.74      0.82       721

    accuracy                           0.98     11896
   macro avg       0.95      0.87      0.90     11896
weighted avg       0.98      0.98      0.98     11896

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4010
           1       0.85      0.87      0.86       239

    accuracy                           0.98      4249
   macro avg       0.92      0.93      0.93      4249
weighted avg       0.98      0.98      0.98      4249

Roc_auc score
-------------------------------------------------------
0.932500339110383

Confusion matrix
-------------------------------------------------------
[[3972   38]
 [  30  209]]


In [255]:
data_msl = []

for msl in range(1, 12):
    roc_aucs = []
    for i in range (20):
        X, y = df.drop('is_abbreviation', axis=1), df['is_abbreviation']
    
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.8, 
                                                    random_state=42)

        rf_model = RandomForestClassifier(min_samples_split=13, min_samples_leaf=msl)
        rf_model.fit(X_train, y_train)

        gb_model = xgb.XGBClassifier(max_depth=5)
        gb_model.fit(X_train, y_train)

        svm_model = SVC()
        svm_model.fit(X_train, y_train)
    
        y_test_hat_1 = svm_model.predict(X_test)
        y_test_hat_2 = rf_model.predict(X_test)
        y_test_hat_3 = gb_model.predict(X_test)
        y_test_hat_sum = [int(vi or wi or ki) for vi, wi, ki in zip(y_test_hat_1, y_test_hat_2, y_test_hat_3)]
        roc_aucs.append(roc_auc_score(y_test, y_test_hat_2))
    data_msl.append(roc_aucs)





  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")




  "memory consumption")
