In [1]:
import sys
import time
import pandas as pd
from sklearn import model_selection
from sklearn import tree, linear_model, svm, ensemble, metrics, cluster
import xgboost as xgb
from matplotlib import pyplot as plt
import scipy
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.feature_selection import *
from skrebate import ReliefF, SURF, TuRF, SURFstar
from sklearn.decomposition import *
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
#from sklearn.kernel_approximation import *

# Wall-clock reference used for the total-runtime report at the end of the run.
start_time = time.time()

# Dataset files to evaluate; each name is resolved relative to "data/".
# Other files that have been used with this script:
#   "195_auto_price.tsv", "197_cpu_act.tsv", "207_autoPrice.tsv",
#   "192_vineyard.tsv", "210_cloud.tsv"
datasets = ["207_autoPrice.tsv"]

# Single-step pipeline: a NuSVR regressor with library-default settings.
# Optional steps that were tried in front of the regressor and are
# currently disabled:
#   ('preprocessing1', preprocessing.Normalizer())
#   ('feature_selection', SelectFpr(score_func=f_regression))
#   ('feature_selection', SelectKBest(f_regression))
#   ('feature_selection', FastICA())
#   ('feature_selection2', cluster.FeatureAgglomeration())
regressor = Pipeline(steps=[("regression", svm.NuSVR())])

# Wider search space kept for reference (disabled). The
# 'feature_selection__k' entry only applies when a SelectKBest step named
# 'feature_selection' is present in the pipeline.
#   'feature_selection__k':  [1, 2, 3, 4, 5, 8, 15]
#   'regression__nu':        [0.001, 0.005, 0.01, 0.1, 0.5, .1]
#   'regression__C':         [0.01, 0.1, 0.05, 1, 2, 5, 10, 50, 100]
#   'regression__kernel':    ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
#   'regression__degree':    [2, 3, 4, 5]
#   'regression__gamma':     [0.01, 0.1, 0.05, 'auto']
#   'regression__coef0':     [0.01, 0.1, 0.05, 1, 2, 5, 10, 50, 100]
#   'regression__shrinking': [True, False]

# Active grid: every key addresses the pipeline step named "regression"
# through the "<step>__<parameter>" naming scheme.
# Entries left out of the active grid for now:
#   'feature_selection__k': [1, 2, 3, 4, 10]
#   'regression__nu':        [0.01, 0.1, 1.0]
#   'regression__gamma':     [0.01, 'auto']
#   'regression__coef0':     [0.01, 1.0]
#   'regression__shrinking': [True, False]
nuSVR_parameters = [
    dict(
        regression__C=[0.01, 1],
        regression__kernel=["poly", "rbf"],
        regression__degree=[2],
    )
]


# Train and evaluate the pipeline on every configured dataset.
#
# For each file: load the TSV, split off the "target" column, hold out 25% of
# the rows as a test set, grid-search the pipeline with 5-fold cross-validation
# on the remaining rows, then report the mean absolute error of the best
# estimator on the held-out rows and plot true vs. predicted values.
print('Training started...')
# NOTE: despite the name, this list holds one mean-absolute-error value per
# dataset (lower is better), not an accuracy. The name is kept so that any
# code reading it after this block keeps working.
dataset_accuracies = list()
for d_set in datasets:
    print("Processing dataset: %s" % d_set)
    data_path = "data/" + d_set
    df = pd.read_csv(data_path, sep="\t")
    label = df["target"].copy()
    data = df.drop("target", axis=1)
    # Fixed random_state keeps the train/test partition reproducible.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        data, label, test_size=0.25, random_state=42
    )

    # Work on plain NumPy arrays from here on.
    X_train = X_train.values
    X_test = X_test.values
    y_train = y_train.values
    y_test = y_test.values

    # NOTE(review): no `scoring` is passed, so the search ranks candidates by
    # the estimator's own score rather than by the absolute error reported
    # below. error_score=0 records a score of 0 for any candidate whose fit
    # raises instead of aborting the search -- confirm both are intended.
    optimized_regressor = GridSearchCV(regressor, nuSVR_parameters, cv=5, error_score=0)

    optimized_regressor.fit(X_train, y_train)
    best_regressor = optimized_regressor.best_estimator_
    # Not used below; presumably kept for interactive inspection of the
    # per-candidate cross-validation results.
    best_result = optimized_regressor.cv_results_

    print(optimized_regressor.best_params_)
    prediction = best_regressor.predict(X_test)

    # Mean absolute error on the held-out rows. The original stored this in a
    # variable called `mse` although no squaring is involved.
    mae = np.mean(np.abs(prediction - y_test))
    print("Absolute error: %0.2f" % mae)
    dataset_accuracies.append(mae)

    # Overlay true (red) and predicted (green) targets in test-row order.
    plt.figure()
    plt.plot(y_test, color='r')
    plt.plot(prediction, color='g')
    plt.grid(True)
    plt.legend(['True', 'Predicted'])
    plt.show()
    print("Finished dataset: %s" % d_set)
    print("------------------------------------------------------------")

print('Training finished')
# Average of the per-dataset mean absolute errors. The label previously read
# "Mean squared error", which did not match the metric being averaged.
print("Mean absolute error \n", np.mean(dataset_accuracies))
end_time = time.time()
print('Total time taken: %d seconds' % int(end_time - start_time))



  from numpy.core.umath_tests import inner1d


Training started...
Processing dataset: 207_autoPrice.tsv
{'regression__C': 0.01, 'regression__degree': 2, 'regression__kernel': 'rbf'}
Absolute error: 4097.65


<Figure size 640x480 with 1 Axes>

Finished dataset: 207_autoPrice.tsv
------------------------------------------------------------
Training finished
Mean squared error 
 4097.649964816255
Total time taken: 630
