In [33]:
import numpy as np
from scipy.io import arff
from io import StringIO
import sklearn
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
from sklearn.metrics import mean_squared_error

In [34]:
def read(a):
  f = open(a, "r")
  c = StringIO(f.read())
  return np.loadtxt(c, delimiter=',')

In [35]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        j = 0
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            if j > 1:
                break
            j+=1


def randomCV(clf, X, y, param_grid, n_iter, cv):
    random_search = RandomizedSearchCV(clf, param_distributions = param_grid,
          n_iter = n_iter, cv = cv, iid = False)
    random_search.fit(X, y)
    report(random_search.cv_results_)
    return random_search.best_params_

def Rf(x, y):
  clf = RandomForestRegressor()
  param_grid = {
        "n_estimators" : np.arange(2,50),
        "max_depth" : np.arange(1,6),

    # "criterion" : ['mse', 'mae'],
    # "min_samples_split" : np.random.random_sample((100,)),      
    # "min_samples_split" : np.linspace(0.01,1, num = 1000),
    # "min_samples_leaf" : np.linspace(0.01,0.5, num = 100),
    # "bootstrap" : [True, False],
    # "warm_start" : [True, False]
  }
  return randomCV(clf, x, y, param_grid, 50, 6)  


def Dt(x, y):
  clf = DecisionTreeRegressor()
  param_grid = {
      "max_depth" : np.arange(1,6),
      "min_samples_split" : np.linspace(0.01,0.5, num = 1000),
      "min_samples_leaf" : np.linspace(0.01,0.5, num = 1000),

      # "criterion" : ['mse', 'mae', 'friedman_mse'],
      # "splitter" : ['best', 'random'],
  }
  return randomCV(clf, x, y, param_grid, 400, 6)  

def Svr(x, y):
  clf = svm.SVR()
  param_grid = {
      "kernel" : ['poly', 'rbf', 'linear', 'sigmoid'],
      "gamma" : ['scale', 'auto'],
      "shrinking" : [True, False]
  }
  return randomCV(clf, x, y, param_grid, 4, 6)

def Ada(x, y):
  clf = AdaBoostRegressor()
  param_grid = {
      "n_estimators" : np.arange(1,100),
      "loss" : ['linear', 'square', 'exponential'],
      # "learning_rate" : np.arange(1,)
  }
  return randomCV(clf, x, y, param_grid, 250, 6)

def GP(x, y):
  clf = GaussianProcessRegressor()
  param_grid = {
#       "kernel" : ['RBF', 'WhiteKernel'],
      "normalize_y" : [True, False],
      "copy_X_train" : [True, False],
      "alpha" : np.linspace(0, 5, 100),
      
  }
  return randomCV(clf, x, y, param_grid, 25, 6)

def LR(x, y):
  clf = LinearRegression()
  param_grid = {
      "fit_intercept" : [True, False],
      "normalize" : [True, False],
      "copy_X" : [True, False],
  }
  return randomCV(clf, x, y, param_grid, 25, 6)

def NN(x, y):
  clf = MLPRegressor()
  param_grid = {
      "hidden_layer_sizes" : np.arange(1,200),
      "activation" : ['identity', 'logistic', 'tanh', 'relu'],
      "solver" : ['lbfgs', 'sgd', 'adam'],
      "learning_rate" : ['constant', 'invscaling', 'adaptive'],
      "shuffle" : [True, False],
  }
  return randomCV(clf, x, y, param_grid, 30, 6)

In [45]:
data = read("4train.txt")       # read data
data = data[:,1:]                                 # remove id

np.random.shuffle(data)                           # shuffle for fairness

y = data[:,-2:-1]                                 # separating prediction var
x = data[:,0:data.shape[1] -1]

x_train, x_test = np.split(x, [940])              # separating test data
y_train, y_test = np.split(y, [940])

print(x_test.shape)

scaler = StandardScaler()                         # scaling features
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

(100, 27)


In [37]:
# ---------------> 
param = Dt(x_train, y_train)
reg_tree = DecisionTreeRegressor().set_params(**param)
reg_tree.fit(x_train, y_train)
prediction = reg_tree.predict(x_test)

rmse = mean_squared_error(y_test, prediction)
print("RMSE on test data : ", rmse)

print("Score with test data",reg_tree.score(x_test, y_test))


Model with rank: 1
Mean validation score: 0.993 (std: 0.001)
Parameters: {'min_samples_split': 0.055615615615615614, 'min_samples_leaf': 0.013923923923923925, 'max_depth': 3}

Model with rank: 2
Mean validation score: 0.992 (std: 0.001)
Parameters: {'min_samples_split': 0.09289289289289288, 'min_samples_leaf': 0.03207207207207207, 'max_depth': 5}

Model with rank: 3
Mean validation score: 0.981 (std: 0.003)
Parameters: {'min_samples_split': 0.07327327327327327, 'min_samples_leaf': 0.054144144144144146, 'max_depth': 4}

RMSE on test data :  1.895976776983779
Score with test data 0.9931361293921208


In [38]:
# --------------->
param = Rf(x_train, y_train)
reg_rf = RandomForestRegressor().set_params(**param)
reg_rf.fit(x_train, y_train)
prediction = reg_rf.predict(x_test)

rmse = mean_squared_error(y_test, prediction)
print("RMSE on test data : ", rmse)

print("Score with test data",reg_tree.score(x_test, y_test))


Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'n_estimators': 38, 'max_depth': 5}

Model with rank: 2
Mean validation score: 1.000 (std: 0.000)
Parameters: {'n_estimators': 30, 'max_depth': 5}

Model with rank: 3
Mean validation score: 1.000 (std: 0.000)
Parameters: {'n_estimators': 35, 'max_depth': 5}

RMSE on test data :  0.017278439660749503
Score with test data 0.9931361293921208


In [39]:
# --------------->
param = Ada(x_train, y_train)
reg_ada = AdaBoostRegressor().set_params(**param)
reg_ada.fit(x_train, y_train)
prediction = reg_ada.predict(x_test)

rmse = mean_squared_error(y_test, prediction)
print("RMSE on test data : ", rmse)

print("Score with test data",reg_ada.score(x_test, y_test))


Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'n_estimators': 87, 'loss': 'exponential'}

Model with rank: 2
Mean validation score: 1.000 (std: 0.000)
Parameters: {'n_estimators': 93, 'loss': 'exponential'}

Model with rank: 3
Mean validation score: 1.000 (std: 0.000)
Parameters: {'n_estimators': 66, 'loss': 'exponential'}

RMSE on test data :  0.09623227885423638
Score with test data 0.9996516170881546


In [40]:
# --------------->
param = GP(x_train, y_train)
reg_gp = GaussianProcessRegressor().set_params(**param)
reg_gp.fit(x_train, y_train)
prediction = reg_gp.predict(x_test)

rmse = mean_squared_error(y_test, prediction)
print("RMSE on test data : ", rmse)

print("Score with test data",reg_gp.score(x_test, y_test))


Model with rank: 1
Mean validation score: 0.403 (std: 0.027)
Parameters: {'normalize_y': True, 'copy_X_train': False, 'alpha': 0.9090909090909091}

Model with rank: 2
Mean validation score: 0.364 (std: 0.025)
Parameters: {'normalize_y': True, 'copy_X_train': False, 'alpha': 1.4141414141414141}

Model with rank: 3
Mean validation score: 0.360 (std: 0.024)
Parameters: {'normalize_y': True, 'copy_X_train': True, 'alpha': 1.4646464646464645}

RMSE on test data :  140.1745763937787
Score with test data 0.49253589676779164


In [41]:
# --------------->
param = LR(x_train, y_train)
reg_lr = LinearRegression().set_params(**param)
reg_lr.fit(x_train, y_train)
prediction = reg_lr.predict(x_test)

rmse = mean_squared_error(y_test, prediction)
print("RMSE on test data : ", rmse)

print("Score with test data",reg_lr.score(x_test, y_test))


Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': False}

RMSE on test data :  6.46392625738097e-29
Score with test data 1.0




In [42]:
# # --------------->
param = NN(x_train, y_train)
reg_nn = MLPRegressor().set_params(**param)
reg_nn.fit(x_train, y_train)

prediction = reg_nn.predict(x_test)

rmse = mean_squared_error(y_test, prediction)

print("RMSE on test data : ", rmse)
print("Score with test data",reg_nn.score(x_test, y_test))

Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'solver': 'lbfgs', 'shuffle': False, 'learning_rate': 'adaptive', 'hidden_layer_sizes': 72, 'activation': 'identity'}

Model with rank: 2
Mean validation score: 1.000 (std: 0.000)
Parameters: {'solver': 'lbfgs', 'shuffle': True, 'learning_rate': 'invscaling', 'hidden_layer_sizes': 35, 'activation': 'identity'}

Model with rank: 3
Mean validation score: 1.000 (std: 0.000)
Parameters: {'solver': 'adam', 'shuffle': False, 'learning_rate': 'constant', 'hidden_layer_sizes': 198, 'activation': 'identity'}

RMSE on test data :  1.5271366912678248e-07
Score with test data 0.9999999994471415


In [43]:
# ---------------> Support vector regression
param = Svr(x_train, y_train)
reg_nn = svm.SVR().set_params(**param)
reg_nn.fit(x_train, y_train)

prediction = reg_nn.predict(x_test)

rmse = mean_squared_error(y_test, prediction)

print("RMSE on test data : ", rmse)
print("Score with test data",reg_nn.score(x_test, y_test))


Model with rank: 1
Mean validation score: 1.000 (std: 0.000)
Parameters: {'shrinking': False, 'kernel': 'linear', 'gamma': 'auto'}

Model with rank: 2
Mean validation score: 0.361 (std: 0.211)
Parameters: {'shrinking': True, 'kernel': 'sigmoid', 'gamma': 'auto'}

Model with rank: 3
Mean validation score: 0.247 (std: 0.084)
Parameters: {'shrinking': True, 'kernel': 'poly', 'gamma': 'auto'}

Model with rank: 3
Mean validation score: 0.247 (std: 0.084)
Parameters: {'shrinking': False, 'kernel': 'poly', 'gamma': 'auto'}

RMSE on test data :  0.006296609433694639
Score with test data 0.9999772048302775
