In [1]:
import pandas as pd
from utils import read_dataset

# Load the dataset through the project helper; `dataset` is the input to
# every get_X_y(...) call further down the notebook.
dataset = read_dataset()

In [2]:
from utils import read_dataset_metadata

def list_except(list, except_item):
    """Return a new list holding the elements of ``list`` that differ from ``except_item``.

    Every occurrence of ``except_item`` is dropped, the relative order of the
    remaining elements is kept, and the input iterable is not modified.

    Note: the first parameter shadows the built-in ``list`` inside this
    function; the name is kept because it is part of the call signature.
    """
    kept = []
    for candidate in list:
        if candidate != except_item:
            kept.append(candidate)
    return kept

# Column name that is stripped from every feature list below.
past_class = "past_category"

(
    target_column,
    feature_columns,
    feature_columns_with_delta,
    feature_columns_only_delta,
) = read_dataset_metadata()

# Apply the same filter to the three feature lists in one pass. The tuple of
# unfiltered lists is built before any of the names is rebound.
feature_columns, feature_columns_with_delta, feature_columns_only_delta = (
    list_except(columns, past_class)
    for columns in (
        feature_columns,
        feature_columns_with_delta,
        feature_columns_only_delta,
    )
)

In [3]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from utils import get_X_y, get_X_y_without_duplicates
from datetime import datetime

# Build one (X, y) pair per feature set, in this order:
#   1 -> feature_columns, 2 -> feature_columns_with_delta, 3 -> feature_columns_only_delta
(X_1, y_1), (X_2, y_2), (X_3, y_3) = (
    get_X_y(dataset, columns, target_column)
    for columns in (
        feature_columns,
        feature_columns_with_delta,
        feature_columns_only_delta,
    )
)

def train_(X, y, test_size=0.2, random_state=0, param_grid=None, cv=5, n_jobs=None):
    """Grid-search an SVC on a train split of ``(X, y)`` and time the run.

    The data is split with ``train_test_split``; an ``svm.SVC(gamma="scale")``
    is then tuned with ``GridSearchCV`` on the training part only. The start
    and end timestamps are printed as a side effect.

    Parameters
    ----------
    X, y
        Features and targets, passed straight to ``train_test_split``.
    test_size : float, default 0.2
        Fraction of the data held out from training.
    random_state : int, default 0
        Seed of the split. ``test_`` must be called with the same
        ``test_size`` / ``random_state`` to recover the identical held-out set.
    param_grid : dict or None, default None
        Grid handed to ``GridSearchCV``. ``None`` selects the notebook's
        original grid: RBF kernel with ``C`` in ``[1, 10, 100, 1000]``.
    cv : int, default 5
        Number of cross-validation folds.
    n_jobs : int or None, default None
        Parallelism of the grid search. ``None`` keeps the original
        sequential behaviour; ``-1`` uses all cores, which is worth
        considering since the recorded runs took more than two hours each.

    Returns
    -------
    tuple
        ``(begin, end, clf, X_train, X_test, y_train, y_test)`` where
        ``begin`` / ``end`` are ``datetime`` objects and ``clf`` is the
        fitted ``GridSearchCV``.
    """
    # Built per call so that no mutable default is shared between invocations.
    if param_grid is None:
        param_grid = {'kernel': ['rbf'], 'C': [1, 10, 100, 1000]}

    # The clock starts before the split, so the measured interval covers
    # both splitting and fitting (same as the original timing).
    begin = datetime.now()
    print(begin)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    svc = svm.SVC(gamma="scale")
    clf = GridSearchCV(svc, param_grid, cv=cv, n_jobs=n_jobs)

    clf.fit(X_train, y_train)
    end = datetime.now()
    print(end)

    return begin, end, clf, X_train, X_test, y_train, y_test

In [4]:
# Grid search on feature set 1 (X_1 / y_1). Long-running: the recorded run
# below took about two hours.
begin_1, end_1, clf_1, X_train_1, X_test_1, y_train_1, y_test_1 = train_(X_1, y_1)
# Last expression of the cell: displays the winning hyper-parameters.
clf_1.best_params_

2019-10-22 11:27:07.599588
2019-10-22 13:27:18.140553


{'C': 1000, 'kernel': 'rbf'}

In [5]:
# Grid search on feature set 2 (X_2 / y_2). Long-running: the recorded run
# below took roughly two hours and forty minutes.
begin_2, end_2, clf_2, X_train_2, X_test_2, y_train_2, y_test_2 = train_(X_2, y_2)
# Last expression of the cell: displays the winning hyper-parameters.
clf_2.best_params_

2019-10-22 13:27:18.152049
2019-10-22 16:07:13.475777


{'C': 1000, 'kernel': 'rbf'}

In [6]:
# Grid search on feature set 3 (X_3 / y_3). Long-running: the recorded run
# below took a little over two hours and fifteen minutes.
begin_3, end_3, clf_3, X_train_3, X_test_3, y_train_3, y_test_3 = train_(X_3, y_3)
# Last expression of the cell: displays the winning hyper-parameters.
clf_3.best_params_

2019-10-22 16:07:13.481609
2019-10-22 18:23:44.196294


{'C': 1000, 'kernel': 'rbf'}

In [7]:
#begin = datetime.now()
#print(begin)
##result = SVC(kernel='rbf', C=1, gamma='auto').fit(X_train, y_train)
#end = datetime.now()
#print(end)

In [8]:
from joblib import dump

# Persist the three fitted grid-search objects (not only their best
# estimators) so the multi-hour searches need not be repeated.
# The relative "models/" directory is assumed to exist already - TODO confirm.
# Only the return value of the LAST dump call is echoed as the cell output.
dump(clf_1, "models/svm-gridsearch-cv-k5-featurecolumns_wopast.joblib")
dump(clf_2, "models/svm-gridsearch-cv-k5-featurecolumnswithdelta_wopast.joblib")
dump(clf_3, "models/svm-gridsearch-cv-k5-featurecolumnsonlydelta_wopast.joblib")

['models/svm-gridsearch-cv-k5-featurecolumnsonlydelta_wopast.joblib']

In [9]:
from joblib import load
# Round-trip check: reload the first persisted grid search from disk.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code - only load files produced by a trusted source.
clf_1_loaded = load("models/svm-gridsearch-cv-k5-featurecolumns_wopast.joblib")

In [10]:
# Display the estimator selected by the reloaded grid search, to confirm the
# persisted object still exposes its fitted state.
clf_1_loaded.best_estimator_

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

def test_(model, X, y, test_size=0.2, random_state=0):
    """Predict on the held-out split of ``(X, y)`` with an already fitted model.

    The held-out set is not passed in; it is re-created by splitting
    ``(X, y)`` again. It therefore matches the set that was excluded during
    training only when ``test_size`` and ``random_state`` equal the values
    used for training (the defaults match the values used in ``train_``).

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X, y
        The full features / targets that were originally split for training.
    test_size : float, default 0.2
        Held-out fraction; must match the training split.
    random_state : int, default 0
        Split seed; must match the training split.

    Returns
    -------
    tuple
        ``(y_pred, y_test)``: predictions for the held-out rows and their
        true targets.
    """
    # Only the test half of the split is needed here.
    _, X_test, _, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    y_pred = model.predict(X_test)
    return y_pred, y_test
    
# Predict on the held-out split of each feature set with the best estimator
# found by the corresponding grid search.
# NOTE: this rebinds y_test_1..3, which the training cells already assigned;
# test_ re-splits with the same test_size and seed as train_, so the values
# are expected to be the same.
y_pred_1, y_test_1 = test_(clf_1.best_estimator_, X_1, y_1)
y_pred_2, y_test_2 = test_(clf_2.best_estimator_, X_2, y_2)
y_pred_3, y_test_3 = test_(clf_3.best_estimator_, X_3, y_3)


In [12]:
from utils import number_for_class
from sklearn.metrics import classification_report, confusion_matrix

# Show the class names; their order is what labels the confusion-matrix
# rows/columns and the classification reports below.
print(number_for_class.keys())

def posfix(list, text):
    """Return a new list where each element of ``list`` is rendered as ``<element>_<text>``.

    Works on any iterable (for example ``dict.keys()``); elements and
    ``text`` are formatted with their default string representation.

    Note: the first parameter shadows the built-in ``list`` inside this
    function; the name is kept because it is part of the call signature.
    """
    labelled = []
    for t in list:
        labelled.append(f"{t}_{text}")
    return labelled

def print_confusion_matrix(y_pred, y_test):
    """Return (not print) the confusion matrix as a labelled DataFrame.

    ``confusion_matrix`` is called as ``confusion_matrix(y_pred, y_test)``,
    i.e. with the predictions as its first argument. Rows are therefore
    labelled ``<class>_pred`` and columns ``<class>_true``, using the key
    order of ``number_for_class``.
    """
    counts = confusion_matrix(y_pred, y_test)
    class_names = number_for_class.keys()
    row_labels = posfix(class_names, "pred")
    column_labels = posfix(class_names, "true")
    return pd.DataFrame(counts, index=row_labels, columns=column_labels)

dict_keys(['stub', 'start', 'b', 'a', 'ga', 'fa'])


In [13]:
# Confusion matrix for feature set 1 (rows = predicted class, columns = true class).
print_confusion_matrix(y_pred_1, y_test_1)

Unnamed: 0,stub_true,start_true,b_true,a_true,ga_true,fa_true
stub_pred,1287,278,22,3,2,16
start_pred,254,1085,395,61,127,20
b_pred,11,439,1242,378,254,146
a_pred,0,19,187,518,129,150
ga_pred,6,25,173,146,917,276
fa_pred,1,25,171,415,414,1375


In [14]:
# Per-class precision / recall / F1 for feature set 1.
# Argument order here is (true, predicted) - the reverse of print_confusion_matrix.
print(classification_report(y_test_1, y_pred_1, target_names = number_for_class.keys()))

              precision    recall  f1-score   support

        stub       0.80      0.83      0.81      1559
       start       0.56      0.58      0.57      1871
           b       0.50      0.57      0.53      2190
           a       0.52      0.34      0.41      1521
          ga       0.59      0.50      0.54      1843
          fa       0.57      0.69      0.63      1983

    accuracy                           0.59     10967
   macro avg       0.59      0.58      0.58     10967
weighted avg       0.58      0.59      0.58     10967



In [15]:
# Confusion matrix for feature set 2 (rows = predicted class, columns = true class).
print_confusion_matrix(y_pred_2, y_test_2)

Unnamed: 0,stub_true,start_true,b_true,a_true,ga_true,fa_true
stub_pred,1286,279,20,2,3,17
start_pred,255,1084,394,62,130,18
b_pred,11,438,1243,380,243,148
a_pred,0,18,187,512,137,152
ga_pred,6,24,169,149,915,269
fa_pred,1,28,177,416,415,1379


In [22]:
# Per-class precision / recall / F1 for feature set 2.
# Argument order here is (true, predicted) - the reverse of print_confusion_matrix.
print(classification_report(y_test_2, y_pred_2, target_names = number_for_class.keys()))

              precision    recall  f1-score   support

        stub       0.80      0.82      0.81      1559
       start       0.56      0.58      0.57      1871
           b       0.50      0.57      0.53      2190
           a       0.51      0.34      0.41      1521
          ga       0.60      0.50      0.54      1843
          fa       0.57      0.70      0.63      1983

    accuracy                           0.59     10967
   macro avg       0.59      0.58      0.58     10967
weighted avg       0.58      0.59      0.58     10967



In [17]:
# Confusion matrix for feature set 3 (rows = predicted class, columns = true class).
print_confusion_matrix(y_pred_3, y_test_3)

Unnamed: 0,stub_true,start_true,b_true,a_true,ga_true,fa_true
stub_pred,53,37,29,21,20,10
start_pred,1141,1132,961,577,770,803
b_pred,55,244,554,397,421,460
a_pred,2,8,37,40,35,37
ga_pred,1,12,32,25,46,38
fa_pred,307,438,577,461,551,635


In [23]:
# Per-class precision / recall / F1 for feature set 3.
# Argument order here is (true, predicted) - the reverse of print_confusion_matrix.
print(classification_report(y_test_3, y_pred_3, target_names = number_for_class.keys()))

              precision    recall  f1-score   support

        stub       0.31      0.03      0.06      1559
       start       0.21      0.61      0.31      1871
           b       0.26      0.25      0.26      2190
           a       0.25      0.03      0.05      1521
          ga       0.30      0.02      0.05      1843
          fa       0.21      0.32      0.26      1983

    accuracy                           0.22     10967
   macro avg       0.26      0.21      0.16     10967
weighted avg       0.26      0.22      0.17     10967

