In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import catboost
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

In [12]:
def read_and_drop(filename):
    df = pd.read_csv(filename)
    df = df.drop('Unnamed: 0', axis=1)
    df = df.drop('Unnamed: 0.1', axis=1)
    df = df.drop('token', axis=1)
    df.head()
    return df

In [13]:
def print_output(model, y_train, y_train_hat, y_test, y_test_hat):
    print(model)
    print('Train performance')
    print('-------------------------------------------------------')
    print(classification_report(y_train, y_train_hat))
    print('Test performance')
    print('-------------------------------------------------------')
    print(classification_report(y_test, y_test_hat))
    print('Roc_auc score')
    print('-------------------------------------------------------')
    print(roc_auc_score(y_test, y_test_hat))
    print('')
    print('Confusion matrix')
    print('-------------------------------------------------------')
    print(confusion_matrix(y_test, y_test_hat))

In [14]:
df = read_and_drop(r'../dataset/ready_data/merges_data_without_stopwords_with_features.csv')
df_test = read_and_drop(r'../dataset/ready_data/6_without_stopwords_with_features.csv')

In [15]:
X_train, y_train = df.drop('is_abbreviation', axis=1), df['is_abbreviation']
X_test, y_test = df_test.drop('is_abbreviation', axis=1), df_test['is_abbreviation']

In [16]:
X_train.head()

Unnamed: 0,special_symbols,letters_numbers_combination,vowels_consonants_combination,get_token_length,upper_letters_rate,upper_letters_inside,is_in_dictionary
0,0,1,3,15,0.066667,0,1
1,0,1,3,7,0.0,0,1
2,0,1,3,8,0.0,0,1
3,0,1,0,1,1.0,0,0
4,0,1,3,6,0.0,0,1


In [7]:
X_test.head()

Unnamed: 0,special_symbols,letters_numbers_combination,vowels_consonants_combination,get_token_length,upper_letters_rate,upper_letters_inside,is_in_dictionary
0,0,1,3,9,1.0,1,1
1,0,1,3,1,1.0,0,1
2,0,1,3,9,1.0,1,0
3,0,1,3,7,1.0,1,1
4,0,1,3,6,1.0,1,1


In [17]:
model = SVC()
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
y_test_hat_1 = model.predict(X_test)
print_output(model, y_train, y_train_hat, y_test, y_test_hat_1)

SVC()
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     11175
           1       0.93      0.72      0.81       721

    accuracy                           0.98     11896
   macro avg       0.96      0.86      0.90     11896
weighted avg       0.98      0.98      0.98     11896

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4010
           1       0.94      0.79      0.86       239

    accuracy                           0.99      4249
   macro avg       0.97      0.89      0.93      4249
weighted avg       0.99      0.99      0.99      4249

Roc_auc score
-------------------------------------------------------
0.8940259184674297

Confusion matrix
-------------------------------------------------------
[[3999   11]
 [  50  189]]


In [9]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
y_test_hat_2 = model.predict(X_test)
print_output(model, y_train, y_train_hat, y_test, y_test_hat_2)

RandomForestClassifier()
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     11175
           1       0.94      0.91      0.92       721

    accuracy                           0.99     11896
   macro avg       0.97      0.95      0.96     11896
weighted avg       0.99      0.99      0.99     11896

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      4010
           1       0.78      0.51      0.62       239

    accuracy                           0.96      4249
   macro avg       0.88      0.75      0.80      4249
weighted avg       0.96      0.96      0.96      4249

Roc_auc score
-------------------------------------------------------
0.7509907240267532

Confusion matrix
-------------------------------------------------------
[[3976   34

In [19]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
y_test_hat_3 = model.predict(X_test)
print_output(model, y_train, y_train_hat, y_test, y_test_hat_3)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     11175
           1       0.94      0.91      0.92       721

    accuracy                           0.99     11896
   macro avg       0.97      0.95      0.96     11896
weighted avg       0.99      0.99      0.99     11896

Test performance


  "memory consumption")


In [18]:
y_test_hat_sum = [int(vi or wi or ki) for vi, wi, ki in zip(y_test_hat_1, y_test_hat_2, y_test_hat_3)]
print_output("ensemble", y_train, y_train_hat, y_test, y_test_hat_sum)

ensemble
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     11175
           1       0.93      0.72      0.81       721

    accuracy                           0.98     11896
   macro avg       0.96      0.86      0.90     11896
weighted avg       0.98      0.98      0.98     11896

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4010
           1       0.85      0.89      0.87       239

    accuracy                           0.98      4249
   macro avg       0.92      0.94      0.93      4249
weighted avg       0.99      0.98      0.99      4249

Roc_auc score
-------------------------------------------------------
0.9389011780172998

Confusion matrix
-------------------------------------------------------
[[3973   37]
 [  27  212]]
