In [1]:
import pandas as pd
from utils import read_dataset

dataset = read_dataset()

In [2]:
from utils import read_dataset_metadata

target_column, feature_columns, feature_columns_with_delta, feature_columns_only_delta = read_dataset_metadata()

In [3]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from utils import get_X_y, get_X_y_without_duplicates
from datetime import datetime

X_1, y_1 = get_X_y(dataset, feature_columns, target_column)
X_2, y_2 = get_X_y(dataset, feature_columns_with_delta, target_column)
X_3, y_3 = get_X_y(dataset, feature_columns_only_delta, target_column)

def train_(X, y):
    begin = datetime.now()
    print(begin)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    parameters = {'kernel':['rbf'], 'C':[1, 10, 100, 1000]}
    svc = svm.SVR(gamma="scale")
    clf = GridSearchCV(svc, parameters, cv=5)

    clf.fit(X_train, y_train)
    end = datetime.now()
    print(end)
    
    return begin, end, clf, X_train, X_test, y_train, y_test

In [4]:
begin_1, end_1, clf_1, X_train_1, X_test_1, y_train_1, y_test_1 = train_(X_1, y_1)
clf_1.best_params_

2019-10-22 22:20:14.677150
2019-10-23 00:53:26.742309


{'C': 1000, 'kernel': 'rbf'}

In [5]:
begin_2, end_2, clf_2, X_train_2, X_test_2, y_train_2, y_test_2 = train_(X_2, y_2)
clf_2.best_params_

2019-10-23 00:53:26.786229
2019-10-23 04:20:30.158814


{'C': 1000, 'kernel': 'rbf'}

In [6]:
begin_3, end_3, clf_3, X_train_3, X_test_3, y_train_3, y_test_3 = train_(X_3, y_3)
clf_3.best_params_

2019-10-23 04:20:30.164433
2019-10-23 05:51:49.339010


{'C': 100, 'kernel': 'rbf'}

In [7]:
#begin = datetime.now()
#print(begin)
##result = SVC(kernel='rbf', C=1, gamma='auto').fit(X_train, y_train)
#end = datetime.now()
#print(end)

In [8]:
from joblib import dump

dump(clf_1, "models/svr-gridsearch-cv-k5-featurecolumns.joblib")
dump(clf_2, "models/svr-gridsearch-cv-k5-featurecolumnswithdelta.joblib")
dump(clf_3, "models/svr-gridsearch-cv-k5-featurecolumnsonlydelta.joblib")

['models/svr-gridsearch-cv-k5-featurecolumnsonlydelta.joblib']

In [9]:
from joblib import load
clf_1_loaded = load("models/svr-gridsearch-cv-k5-featurecolumns.joblib")

In [10]:
clf_1_loaded.best_estimator_

SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

def test_(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    y_pred = model.predict(X_test)
    return y_pred, y_test
    
y_pred_1, y_test_1 = test_(clf_1.best_estimator_, X_1, y_1)
y_pred_2, y_test_2 = test_(clf_2.best_estimator_, X_2, y_2)
y_pred_3, y_test_3 = test_(clf_3.best_estimator_, X_3, y_3)


In [21]:
from sklearn.metrics import mean_squared_error, r2_score
mse_1 = mean_squared_error(y_pred_1, y_test_1)
mse_1

0.8607607222176014

In [22]:
r2_1 = r2_score(y_pred_1, y_test_1)
r2_1

0.6191489554996112

In [23]:
mse_2 = mean_squared_error(y_pred_2, y_test_2)
mse_2

0.8615921074333458

In [24]:
r2_2 = r2_score(y_pred_2, y_test_2)
r2_2

0.6189992947762195

In [25]:
mse_3 = mean_squared_error(y_pred_3, y_test_3)
mse_3

0.16279428759831474

In [26]:
r2_3 = r2_score(y_pred_3, y_test_3)
r2_3

0.935561962494553

In [27]:
pd.DataFrame([[mse_1, r2_1],[mse_2, r2_2], [mse_3, r2_3]], columns = ["MSE", "R^2"])

Unnamed: 0,MSE,R^2
0,0.860761,0.619149
1,0.861592,0.618999
2,0.162794,0.935562
