# In [None]:
import sys
import time
import pandas as pd
from sklearn import model_selection
from sklearn import tree, linear_model, svm, ensemble, metrics, cluster
import xgboost as xgb
from matplotlib import pyplot as plt
import scipy
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.feature_selection import *
from sklearn.decomposition import *
from sklearn.model_selection import GridSearchCV, KFold

import warnings
warnings.filterwarnings('ignore')

# Wall-clock start, used at the end of the script to report total run time.
start_time = time.time()

# Tab-separated regression datasets; each file is loaded from the "data/"
# directory and must contain a "target" column.
datasets = [
    "228_elusage.tsv",
    "485_analcatdata_vehicle.tsv",
    "523_analcatdata_neavote.tsv",
    "663_rabe_266.tsv",
    "687_sleuth_ex1605.tsv",
]

# Pipeline under tuning: a single NuSVR step named 'regression'.
# Feature selection is currently disabled. To re-enable it, add a
# ('feature_selection', SelectKBest(f_regression)) step in front of the
# regressor AND a matching 'feature_selection__k' entry to `parameters`.
regressor = Pipeline([
    ('regression', svm.NuSVR()),
])

# Hyper-parameter grid. Every key must be prefixed with the name of a step
# that actually exists in `regressor`; a key for a missing step (as the old
# 'feature_selection__k' entry was) makes GridSearchCV.fit raise ValueError.
parameters = [{
    # The original list ended with a second 0.1 (written `.1`), which only
    # refitted identical candidates. NOTE(review): possibly 1.0 was intended.
    'regression__nu': [0.001, 0.005, 0.01, 0.1, 0.5],
    'regression__C': [0.01, 0.1, 0.05, 1, 2, 5, 10, 50, 100],
    'regression__kernel': ['linear', 'poly', 'rbf'],
    # Only the 'poly' kernel uses `degree`; the value is still crossed with
    # the other kernels here, so best_params_ always reports a degree.
    'regression__degree': [2, 3],
    # Further options that were tried and left disabled:
    #   'regression__gamma': [0.01, 0.1, 0.05, 'auto']
    #   'regression__coef0': [0.0, .01, 0.1, 0.05, 1, 2, 5, 10, 50, 100]
    #   'regression__shrinking': [True, False]
    #   'regression__tol': [0.0001, 0.01, 0.1, 0.5, 1, 2]
}]


# Grid-search the pipeline on each dataset, record the best cross-validated
# R^2 per dataset, then report the mean score and the total run time.
print('Training started...')
dataset_accuracies = []
r2_scores = []
for d_set in datasets:
    print("Processing dataset: %s" % d_set)

    # Load the TSV and separate the "target" column from the feature columns.
    data_path = "data/" + d_set
    df = pd.read_csv(data_path, sep="\t")
    data = df.drop(columns="target")
    label = df["target"].copy()

    # Exhaustive search scored by R^2 on a seeded, shuffled 3-fold split.
    # error_score=0 assigns a score of 0 to candidates whose fit fails
    # instead of aborting the search.
    optimized_regressor = GridSearchCV(
        estimator=regressor,
        param_grid=parameters,
        scoring='r2',
        cv=KFold(n_splits=3, shuffle=True, random_state=3111696),
        error_score=0,
    )
    optimized_regressor.fit(data, label)

    # Keep the fitted artefacts of the most recent dataset at module level.
    best_score = optimized_regressor.best_score_
    best_result = optimized_regressor.cv_results_
    best_regressor = optimized_regressor.best_estimator_
    r2_scores.append(best_score)

    print(optimized_regressor.best_params_)
    print("Best score: ", best_score)
    print("Finished dataset: %s" % d_set)
    print("------------------------------------------------------------")

print('Training finished')
print("Mean R2 square: \n", np.mean(r2_scores))
end_time = time.time()
print('Total time taken: %d seconds' % int(end_time - start_time))

