In [9]:
import pandas as pd
from utils import read_dataset

dataset = read_dataset()

In [10]:
from utils import read_dataset_metadata

target_column, feature_columns, feature_columns_with_delta, feature_columns_only_delta = read_dataset_metadata()

In [11]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from utils import get_X_y, get_X_y_without_duplicates
from datetime import datetime

X_1, y_1 = get_X_y(dataset, feature_columns, target_column)
X_2, y_2 = get_X_y(dataset, feature_columns_with_delta, target_column)
X_3, y_3 = get_X_y(dataset, feature_columns_only_delta, target_column)

def train_(X, y):
    begin = datetime.now()
    print(begin)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    parameters = {'kernel':['rbf'], 'C':[1, 10, 100, 1000]}
    svc = svm.SVC(gamma="scale")
    clf = GridSearchCV(svc, parameters, cv=5)

    clf.fit(X_train, y_train)
    end = datetime.now()
    print(end)
    
    return begin, end, clf, X_train, X_test, y_train, y_test

In [12]:
begin_1, end_1, clf_1, X_train_1, X_test_1, y_train_1, y_test_1 = train_(X_1, y_1)
clf_1.best_params_

2019-10-22 00:13:58.284495
2019-10-22 02:14:07.974682


{'C': 1000, 'kernel': 'rbf'}

In [13]:
begin_2, end_2, clf_2, X_train_2, X_test_2, y_train_2, y_test_2 = train_(X_2, y_2)
clf_2.best_params_

2019-10-22 02:14:08.151346
2019-10-22 04:53:50.740113


{'C': 1000, 'kernel': 'rbf'}

In [14]:
begin_3, end_3, clf_3, X_train_3, X_test_3, y_train_3, y_test_3 = train_(X_3, y_3)
clf_3.best_params_

2019-10-22 04:53:50.745818
2019-10-22 07:41:45.141229


{'C': 1000, 'kernel': 'rbf'}

In [15]:
#begin = datetime.now()
#print(begin)
##result = SVC(kernel='rbf', C=1, gamma='auto').fit(X_train, y_train)
#end = datetime.now()
#print(end)

In [24]:
from joblib import dump

dump(clf_1, "models/svm-gridsearch-cv-k5-featurecolumns.joblib")
dump(clf_2, "models/svm-gridsearch-cv-k5-featurecolumnswithdelta.joblib")
dump(clf_3, "models/svm-gridsearch-cv-k5-featurecolumnsonlydelta.joblib")

['models/svm-gridsearch-cv-k5-featurecolumnsonlydelta.joblib']

In [25]:
from joblib import load
clf_1_loaded = load("models/svm-gridsearch-cv-k5-featurecolumns.joblib")

In [27]:
clf_1_loaded.best_estimator_

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [32]:
from sklearn.metrics import classification_report, confusion_matrix

def test_(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    y_pred = model.predict(X_test)
    return y_pred, y_test
    
y_pred_1, y_test_1 = test_(clf_1.best_estimator_, X_1, y_1)
y_pred_2, y_test_2 = test_(clf_2.best_estimator_, X_2, y_2)
y_pred_3, y_test_3 = test_(clf_3.best_estimator_, X_3, y_3)


In [45]:
from utils import number_for_class
from sklearn.metrics import classification_report, confusion_matrix

print(number_for_class.keys())

def posfix(list, text):
    return [f"{t}_{text}" for t in list]

def print_confusion_matrix(y_pred, y_test):
    classes = number_for_class.keys()
    return pd.DataFrame(confusion_matrix(y_pred, y_test), index=posfix(classes,"pred"), columns=posfix(classes,"true"))    

dict_keys(['stub', 'start', 'b', 'a', 'ga', 'fa'])


In [46]:
print_confusion_matrix(y_pred_1, y_test_1)

Unnamed: 0,stub_true,start_true,b_true,a_true,ga_true,fa_true
stub_pred,1286,278,22,2,2,16
start_pred,255,1087,394,62,124,20
b_pred,11,439,1251,378,247,142
a_pred,0,18,189,521,129,149
ga_pred,6,24,170,144,928,275
fa_pred,1,25,164,414,413,1381


In [49]:
print(classification_report(y_test_1, y_pred_1, target_names = number_for_class.keys()))

              precision    recall  f1-score   support

        stub       0.80      0.82      0.81      1559
       start       0.56      0.58      0.57      1871
           b       0.51      0.57      0.54      2190
           a       0.52      0.34      0.41      1521
          ga       0.60      0.50      0.55      1843
          fa       0.58      0.70      0.63      1983

    accuracy                           0.59     10967
   macro avg       0.59      0.59      0.59     10967
weighted avg       0.59      0.59      0.58     10967



In [47]:
print_confusion_matrix(y_pred_2, y_test_2)

Unnamed: 0,stub_true,start_true,b_true,a_true,ga_true,fa_true
stub_pred,1286,279,19,2,3,16
start_pred,255,1086,394,62,126,20
b_pred,11,439,1249,380,240,147
a_pred,0,18,190,514,137,150
ga_pred,6,24,169,150,922,272
fa_pred,1,25,169,413,415,1378


In [50]:
print(classification_report(y_test_1, y_pred_1, target_names = number_for_class.keys()))

              precision    recall  f1-score   support

        stub       0.80      0.82      0.81      1559
       start       0.56      0.58      0.57      1871
           b       0.51      0.57      0.54      2190
           a       0.52      0.34      0.41      1521
          ga       0.60      0.50      0.55      1843
          fa       0.58      0.70      0.63      1983

    accuracy                           0.59     10967
   macro avg       0.59      0.59      0.59     10967
weighted avg       0.59      0.59      0.58     10967



In [48]:
print_confusion_matrix(y_pred_3, y_test_3)

Unnamed: 0,stub_true,start_true,b_true,a_true,ga_true,fa_true
stub_pred,1548,14,5,2,3,2
start_pred,7,1789,22,10,19,8
b_pred,3,57,2066,52,79,27
a_pred,0,7,53,1404,37,35
ga_pred,1,3,32,39,1678,36
fa_pred,0,1,12,14,27,1875


In [51]:
print(classification_report(y_test_1, y_pred_1, target_names = number_for_class.keys()))

              precision    recall  f1-score   support

        stub       0.80      0.82      0.81      1559
       start       0.56      0.58      0.57      1871
           b       0.51      0.57      0.54      2190
           a       0.52      0.34      0.41      1521
          ga       0.60      0.50      0.55      1843
          fa       0.58      0.70      0.63      1983

    accuracy                           0.59     10967
   macro avg       0.59      0.59      0.59     10967
weighted avg       0.59      0.59      0.58     10967

