In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to project .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from data import *

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

## Загрузим данные

In [3]:
# Feature table and target labels, read through the project's load_pkl helper.
# NOTE(review): load_pkl presumably unpickles the file - only use trusted inputs.
X = load_pkl('stats/X.pkl')
y = load_pkl('stats/y.pkl')

In [4]:
# Temporary helper column: used to build the 'topp' row filter, then dropped again.
X['para'] = load_pkl('stats/para.pkl')

In [5]:
# Flag rows whose 'para' value lies in the closed range [18, 23] (1 = inside, 0 = outside).
# NOTE(review): what the 18-23 range stands for is not visible in this notebook.
X['topp'] = X['para'].between(18, 23).astype('int64')

In [6]:
# Keep only rows whose 'topp' flag is 0. y is filtered first because the mask
# is read from X, which the second statement rebinds.
y = y.loc[X['topp'] < 1]
X = X.loc[X['topp'] < 1]

In [7]:
# Report how many samples remain after the row filter.
print(f'Размер датасета: {len(X)}')

Размер датасета: 199


In [8]:
# The helper columns were only needed for filtering; they are not model features.
X = X.drop(columns=['topp', 'para'])

In [9]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,bleu,rouge-l,wmd,pos,rouge-1,rouge-2,Levenshtein similarity,Additions proportion,Deletions proportion
0,48.762546,0.734177,1.067048,3.102178,0.756098,0.55,0.848352,0.227273,0.295455
1,47.34655,0.607595,1.055076,3.102178,0.780488,0.575,0.631579,0.25,0.340909
2,49.978468,0.65,0.747549,3.102178,0.829268,0.625,0.63913,0.227273,0.295455
4,56.111216,0.75,0.800583,3.006511,0.8,0.714286,0.81,0.277778,0.166667
5,80.650086,0.866667,0.403395,0.0,0.866667,0.857143,0.899471,0.125,0.125


In [10]:
# Summary statistics per feature (note the different value ranges across columns).
X.describe()

Unnamed: 0,bleu,rouge-l,wmd,pos,rouge-1,rouge-2,Levenshtein similarity,Additions proportion,Deletions proportion
count,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0
mean,50.945534,0.763147,0.666727,0.687342,0.798013,0.630711,0.843565,0.18818,0.213026
std,23.788701,0.139507,0.467887,1.221732,0.131163,0.19377,0.102335,0.116735,0.129217
min,0.0,0.333333,0.0,0.0,0.4,0.0,0.533333,0.0,0.0
25%,38.638457,0.666667,0.330474,0.0,0.714286,0.5,0.78335,0.105263,0.116516
50%,50.957115,0.789474,0.585016,0.0,0.827586,0.625,0.860215,0.166667,0.2
75%,66.773448,0.870833,0.938782,1.099617,0.892857,0.76,0.914507,0.25,0.3125
max,100.0,1.0,1.968615,6.214542,1.0,1.0,1.0,0.666667,0.535714


## Поделим на классы

In [11]:
# Boolean masks for the three target classes, split around the value 7.
# They are computed up front, before y is overwritten with class ids.
zero = y.lt(7)
one = y.eq(7)
two = y.gt(7)

In [12]:
# Overwrite the raw labels with class ids using the precomputed masks, so an
# earlier assignment cannot affect which rows a later one selects.
y.loc[zero] = 0
y.loc[one] = 1
y.loc[two] = 2

In [13]:
# Class ids are integral; cast them to an integer dtype.
y = y.astype(int)

In [14]:
# Distinct class ids and how many samples fall into each.
np.unique(y, return_counts=True)

(array([0, 1, 2]), array([63, 53, 83]))

## Поделим на train / test

In [15]:
# Stratified 75/25 split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    stratify=y,
    random_state=213123,
)

In [16]:
# Number of samples in the train and test parts.
len(X_train), len(X_test)

(149, 50)

In [17]:
# Per-class sample counts in each part, to check that stratification kept the balance.
print(f'Train:\n{y_train.value_counts()}\n')
print(f'Test:\n{y_test.value_counts()}')

Train:
2    62
0    47
1    40
Name: label, dtype: int64

Test:
2    21
0    16
1    13
Name: label, dtype: int64


## Вспомогательная функция для Grid Search и вывода результатов

In [18]:
def classification(model, params, scale=None):
    """Grid-search a pipeline on the train split and report test-set metrics.

    Builds a ``Pipeline`` from an optional scaler followed by ``model``, tunes
    it with 5-fold ``GridSearchCV`` on the notebook-level ``X_train`` /
    ``y_train``, then prints the best parameters, accuracy, macro-F1 and the
    confusion matrix computed on the notebook-level ``X_test`` / ``y_test``.

    Parameters
    ----------
    model : tuple
        ``(name, estimator)`` pipeline step; ``name`` is the prefix used in
        the keys of ``params`` (e.g. ``'logreg__C'``).
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.
    scale : str or None, default None
        ``'m'`` selects ``MinMaxScaler``; any other truthy value selects
        ``StandardScaler``; a falsy value means no scaling step.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    if not scale:
        print('No scaling')
        steps = [model]
    elif scale == 'm':
        print('MinMaxScaler')
        steps = [('scaler', MinMaxScaler()), model]
    else:
        print('StandardScaler')
        steps = [('scaler', StandardScaler()), model]

    grid = GridSearchCV(Pipeline(steps), params, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)

    y_pred = grid.predict(X_test)
    print(f'Лучшие параметры: {grid.best_params_}')
    print(f'Accuracy:{accuracy_score(y_test, y_pred)}, F1:{f1_score(y_test, y_pred, average="macro")}')
    print('Confusion matrix:')
    # Take the labels from the data instead of hard-coding (0, 1, 2): the
    # matrix has one row/column per label present in y_test or y_pred, and a
    # fixed 3-column header would make the DataFrame constructor fail for any
    # other number of classes.
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
    matrix = confusion_matrix(y_test, y_pred, labels=labels)
    print(pd.DataFrame(matrix, index=labels, columns=labels))

    return grid

## Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
# C is the inverse regularization strength and has to be positive, so the
# grid no longer contains 0 (that candidate could never be fitted meaningfully).
log_reg_l2 = classification(
    ('logreg', LogisticRegression()),
    scale='m',
    params={'logreg__C': [1e-3, 1e-2, 1e-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
)

MinMaxScaler
Лучшие параметры: {'logreg__C': 2}
Accuracy:0.66, F1:0.5805539489750017
Confusion matrix:
    0  1   2
0  15  1   0
1   6  2   5
2   2  3  16


In [21]:
# Feature names followed by the coefficients of the best pipeline's final step
# (the logistic regression); the printed matrix has one row per class.
print(list(X.columns))
print(log_reg_l2.best_estimator_[-1].coef_)

['bleu', 'rouge-l', 'wmd', 'pos', 'rouge-1', 'rouge-2', 'Levenshtein similarity', 'Additions proportion', 'Deletions proportion']
[[-5.29711640e-02  4.67948370e-01 -7.55892815e-01 -3.91755677e-01
   9.06854437e-01  3.95260165e-01  9.45897583e-01 -6.30567555e-01
   5.26489271e-01]
 [-6.00253869e-01  1.08156246e-01 -1.83272281e-03  1.01507902e+00
   5.64964256e-02  1.59392550e-01  1.22960722e+00  4.66487163e-01
  -1.82232713e-01]
 [ 6.53225033e-01 -5.76104615e-01  7.57725538e-01 -6.23323340e-01
  -9.63350863e-01 -5.54652715e-01 -2.17550481e+00  1.64080392e-01
  -3.44256558e-01]]


## Деревья

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
# Fixed random_state: the tree permutes candidate features at every split, so
# ties between equally good splits would otherwise differ from run to run.
tree = classification(
    ('tree', DecisionTreeClassifier(random_state=42)),
    {'tree__max_depth': range(1, 10)},
)

No scaling
Лучшие параметры: {'tree__max_depth': 2}
Accuracy:0.62, F1:0.631297964631298
Confusion matrix:
    0  1   2
0  10  6   0
1   1  9   3
2   0  9  12


In [24]:
# Depth of the selected tree (the final pipeline step), then every feature
# with a non-zero importance, printed on a single line.
print(tree.best_estimator_[-1].max_depth)
for col, val in zip(X.columns, tree.best_estimator_[-1].feature_importances_):
    if val:  # skip features the tree never split on
        print(f'{col}:{val},',end=' ')

2
Levenshtein similarity:1.0, 

## Лес

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
# Seeded so that the bootstrap samples and per-split feature subsets - and
# therefore the metrics and importances printed below - are reproducible.
forest = RandomForestClassifier(max_depth=5, random_state=42)
forest.fit(X_train, y_train)

y_pred = forest.predict(X_test)
print(f'Accuracy:{accuracy_score(y_test, y_pred)}, F1:{f1_score(y_test, y_pred, average="macro")}')
print('Confusion matrix:')
print(pd.DataFrame(confusion_matrix(y_test, y_pred), columns=(0, 1, 2)),end='\n\n')

# Features ranked by importance, most important first.
for val, col in sorted(zip(forest.feature_importances_, X.columns), key=lambda x: abs(x[0]), reverse=True):
    print(f'{col}:{val},')

Accuracy:0.66, F1:0.6352991452991453
Confusion matrix:
    0  1   2
0  10  3   3
1   0  5   8
2   0  3  18

Levenshtein similarity:0.26286605741067526,
rouge-l:0.13199027098380592,
rouge-2:0.12775802986381107,
Additions proportion:0.11189211223301068,
wmd:0.0858553890539397,
rouge-1:0.08510101631101896,
bleu:0.07654704445672743,
Deletions proportion:0.07452784170064802,
pos:0.04346223798636318,


## SVM

In [27]:
from sklearn.svm import SVC

In [28]:
# Search space for the SVM, one sub-grid per kernel.
# 0 is excluded from both lists: SVC requires a strictly positive C, and
# gamma = 0 collapses the rbf / poly kernels to a constant, so neither value
# can produce a useful candidate.
param_grid = [
    {'svm__C': [1, 10, 100], 'svm__kernel': ['linear']},
    {'svm__C': [1, 10, 100], 'svm__gamma': [1e-2, 1e-1, 1, 'auto'], 'svm__kernel': ['rbf']},
    {'svm__C': [1, 10, 100], 'svm__gamma': [1e-2, 1e-1, 1, 'auto'], 'svm__kernel': ['poly'],
     'svm__degree': [2, 3, 4, 5, 6, 7]},
]

In [29]:
# scale='s' is not 'm', so classification() puts a StandardScaler in front of the SVM.
svm = classification(('svm', SVC()), scale='s', params=param_grid)

StandardScaler
Лучшие параметры: {'svm__C': 10, 'svm__gamma': 0.01, 'svm__kernel': 'rbf'}
Accuracy:0.64, F1:0.6226082161566032
Confusion matrix:
   0  1   2
0  9  7   0
1  4  7   2
2  2  3  16


In [30]:
# Same grid with MinMaxScaler instead.
# NOTE(review): this rebinds `svm`, discarding the StandardScaler search fitted in the previous cell.
svm = classification(('svm', SVC()), scale='m', params=param_grid)

MinMaxScaler
Лучшие параметры: {'svm__C': 1, 'svm__gamma': 1, 'svm__kernel': 'rbf'}
Accuracy:0.62, F1:0.5909090909090909
Confusion matrix:
   0  1   2
0  9  7   0
1  2  5   6
2  1  3  17


## Bagging

In [31]:
from sklearn.ensemble import BaggingClassifier

In [32]:
# Bag 100 depth-2 decision trees with a fixed random_state.
bag = BaggingClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=100, random_state=42)
bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)

print(f'Accuracy:{accuracy_score(y_test, y_pred)}, F1:{f1_score(y_test, y_pred, average="macro")}')
print('Confusion matrix:')
print(pd.DataFrame(confusion_matrix(y_test, y_pred), columns=(0, 1, 2)))

# Average every feature's importance over the fitted trees, then list the
# (importance, feature) pairs from most to least important.
imps = np.mean([est.feature_importances_ for est in bag.estimators_], axis=0)
sorted(zip(imps, X.columns), key=lambda pair: pair[0], reverse=True)

Accuracy:0.74, F1:0.718700414102713
Confusion matrix:
    0  1   2
0  10  6   0
1   1  8   4
2   0  2  19


[(0.7320554840016348, 'Levenshtein similarity'),
 (0.11380988744854696, 'rouge-l'),
 (0.04151351830916326, 'Additions proportion'),
 (0.033679494868312655, 'pos'),
 (0.02264576412638545, 'bleu'),
 (0.02172483658441582, 'rouge-2'),
 (0.019518812254613965, 'rouge-1'),
 (0.013854200481250585, 'wmd'),
 (0.0011980019256761973, 'Deletions proportion')]

In [33]:
# save_pkl(bag, 'classification_tree_bag_acc74_f72.pkl')