In [1]:
import numpy as np
from scipy.io import arff
from io import StringIO
import sklearn
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
from sklearn.metrics import mean_squared_error

In [2]:
def read(a):
  f = open(a, "r")
  c = StringIO(f.read())
  return np.loadtxt(c, delimiter=',')

In [38]:
def report(results, n_top=3):
    print("\n\n\n\n\n")
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        j = 0
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            if j > 1:
                break
            j+=1


def randomCV(clf, X, y, param_grid, n_iter, cv):
    random_search = RandomizedSearchCV(clf, param_distributions = param_grid,
          n_iter = n_iter, cv = cv, iid = False, n_jobs = -1)
    random_search.fit(X, y)
    report(random_search.cv_results_)
    return random_search.best_params_

def Rf(x, y):
  clf = RandomForestRegressor()
  param_grid = {
        "n_estimators" : np.arange(2,50),
        "max_depth" : np.arange(1,6),

    "criterion" : ['mse', 'mae'],
    "min_samples_split" : np.random.random_sample((100,)),      
    "min_samples_split" : np.linspace(0.01,1, num = 1000),
    "min_samples_leaf" : np.linspace(0.01,0.5, num = 100),
    "bootstrap" : [True, False],
    "warm_start" : [True, False]
  }
  return randomCV(clf, x, y, param_grid, 400, 6)  


def Dt(x, y):
  clf = DecisionTreeRegressor()
  param_grid = {
      "max_depth" : np.arange(1,6),
      "min_samples_split" : np.linspace(0.01,0.5, num = 1000),
      "min_samples_leaf" : np.linspace(0.01,0.5, num = 1000),
      "criterion" : ['mse', 'mae', 'friedman_mse'],
      "splitter" : ['best', 'random'],
  }
  return randomCV(clf, x, y, param_grid, 400, 6)  

def Svr(x, y):
  clf = svm.SVR()
  param_grid = {
      "kernel" : ['poly', 'rbf', 'linear', 'sigmoid'],
      "gamma" : ['scale', 'auto'],
      "shrinking" : [True, False]
  }
  return randomCV(clf, x, y, param_grid, 15, 6)

def Ada(x, y):
  clf = AdaBoostRegressor()
  param_grid = {
      "n_estimators" : np.arange(1,100),
      "loss" : ['linear', 'square', 'exponential'],
      # "learning_rate" : np.arange(1,)
  }
  return randomCV(clf, x, y, param_grid, 250, 6)

def GP(x, y):
  clf = GaussianProcessRegressor()
  param_grid = {
#       "kernel" : ['RBF', 'WhiteKernel'],
      "normalize_y" : [True, False],
      "copy_X_train" : [True, False],
      "alpha" : np.linspace(0, 5, 100),
      
  }
  return randomCV(clf, x, y, param_grid, 25, 6)

def LR(x, y):
  clf = LinearRegression()
  param_grid = {
      "fit_intercept" : [True, False],
      "normalize" : [True, False],
      "copy_X" : [True, False],
  }
  return randomCV(clf, x, y, param_grid, 25, 6)

def NN(x, y):
  clf = MLPRegressor()
  param_grid = {
      "hidden_layer_sizes" : np.arange(1,200),
      "activation" : ['identity', 'logistic', 'tanh', 'relu'],
      "solver" : ['lbfgs', 'sgd', 'adam'],
      "learning_rate" : ['constant', 'invscaling', 'adaptive'],
      "shuffle" : [True, False],
  }
  return randomCV(clf, x, y, param_grid, 30, 6)

In [36]:
data = read("4train.txt")       # read data
data = data[:,1:]                                 # remove id

np.random.shuffle(data)                           # shuffle for fairness

y = data[:,-2:-1]                                 # separating prediction var

x = data[:,:data.shape[1] -2]

x_train, x_test = np.split(x, [940])              # separating test data
y_train, y_test = np.split(y, [940])

print(x_test.shape)



# scaler = StandardScaler()                         # scaling features
# scaler.fit(x_train)
# x_train = scaler.transform(x_train)
# x_test = scaler.transform(x_test)

(100, 26)


In [39]:
# ---------------> 
param = Dt(x_train, y_train)
reg_tree = DecisionTreeRegressor().set_params(**param)
reg_tree.fit(x_train, y_train)
prediction = reg_tree.predict(x_test)

rmse = mean_squared_error(y_test, prediction)
print("RMSE on test data : ", rmse)

print("Score with test data",reg_tree.score(x_test, y_test))








Model with rank: 1
Mean validation score: 0.065 (std: 0.063)
Parameters: {'splitter': 'best', 'min_samples_split': 0.0919119119119119, 'min_samples_leaf': 0.05708708708708709, 'max_depth': 4, 'criterion': 'friedman_mse'}

Model with rank: 2
Mean validation score: 0.060 (std: 0.029)
Parameters: {'splitter': 'random', 'min_samples_split': 0.4867567567567567, 'min_samples_leaf': 0.046786786786786785, 'max_depth': 3, 'criterion': 'mse'}

Model with rank: 3
Mean validation score: 0.053 (std: 0.022)
Parameters: {'splitter': 'random', 'min_samples_split': 0.24788788788788788, 'min_samples_leaf': 0.03207207207207207, 'max_depth': 5, 'criterion': 'mse'}

RMSE on test data :  227.21258398112758
Score with test data 0.16562773494751115


In [40]:
# --------------->
param = Rf(x_train, y_train)
reg_rf = RandomForestRegressor().set_params(**param)
reg_rf.fit(x_train, y_train)
prediction = reg_rf.predict(x_test)

rmse = mean_squared_error(y_test, prediction)
print("RMSE on test data : ", rmse)

print("Score with test data",reg_tree.score(x_test, y_test))










Model with rank: 1
Mean validation score: 0.129 (std: 0.023)
Parameters: {'n_estimators': 34, 'max_depth': 5}

Model with rank: 2
Mean validation score: 0.127 (std: 0.032)
Parameters: {'n_estimators': 28, 'max_depth': 5}

Model with rank: 3
Mean validation score: 0.126 (std: 0.027)
Parameters: {'n_estimators': 25, 'max_depth': 5}

RMSE on test data :  210.88105895357157
Score with test data 0.16562773494751115


In [22]:
# --------------->
param = Ada(x_train, y_train)
reg_ada = AdaBoostRegressor().set_params(**param)
reg_ada.fit(x_train, y_train)
prediction = reg_ada.predict(x_test)

rmse = mean_squared_error(y_test, prediction)
print("RMSE on test data : ", rmse)

print("Score with test data",reg_ada.score(x_test, y_test))








Model with rank: 1
Mean validation score: -0.120 (std: 0.179)
Parameters: {'n_estimators': 33, 'loss': 'square'}

Model with rank: 2
Mean validation score: -0.138 (std: 0.211)
Parameters: {'n_estimators': 92, 'loss': 'exponential'}

Model with rank: 3
Mean validation score: -0.143 (std: 0.213)
Parameters: {'n_estimators': 64, 'loss': 'square'}

RMSE on test data :  426.9457450536315
Score with test data 0.0


In [31]:
# --------------->
param = GP(x_train, y_train)
reg_gp = GaussianProcessRegressor().set_params(**param)
reg_gp.fit(x_train, y_train)
prediction = reg_gp.predict(x_test)

rmse = mean_squared_error(y_test, prediction)
print("RMSE on test data : ", rmse)

print("Score with test data",reg_gp.score(x_test, y_test))








Model with rank: 1
Mean validation score: -0.007 (std: 0.008)
Parameters: {'normalize_y': True, 'copy_X_train': False, 'alpha': 0.0}

Model with rank: 2
Mean validation score: -0.007 (std: 0.008)
Parameters: {'normalize_y': True, 'copy_X_train': False, 'alpha': 0.6060606060606061}

Model with rank: 3
Mean validation score: -0.007 (std: 0.008)
Parameters: {'normalize_y': True, 'copy_X_train': False, 'alpha': 0.9090909090909091}

RMSE on test data :  213.97143594386594
Score with test data -0.013754005544486692


In [32]:
# --------------->
param = LR(x_train, y_train)
reg_lr = LinearRegression().set_params(**param)
reg_lr.fit(x_train, y_train)
prediction = reg_lr.predict(x_test)

rmse = mean_squared_error(y_test, prediction)
print("RMSE on test data : ", rmse)

print("Score with test data",reg_lr.score(x_test, y_test))








Model with rank: 1
Mean validation score: 0.075 (std: 0.043)
Parameters: {'normalize': True, 'fit_intercept': False, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.075 (std: 0.043)
Parameters: {'normalize': False, 'fit_intercept': False, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.075 (std: 0.043)
Parameters: {'normalize': True, 'fit_intercept': False, 'copy_X': False}

RMSE on test data :  208.28892024135988
Score with test data 0.013168620971401257




In [33]:
# # --------------->
param = NN(x_train, y_train)
reg_nn = MLPRegressor().set_params(**param)
reg_nn.fit(x_train, y_train)

prediction = reg_nn.predict(x_test)

rmse = mean_squared_error(y_test, prediction)

print("RMSE on test data : ", rmse)
print("Score with test data",reg_nn.score(x_test, y_test))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# ---------------> Support vector regression
param = Svr(x_train, y_train)
reg_nn = svm.SVR().set_params(**param)
reg_nn.fit(x_train, y_train)

prediction = reg_nn.predict(x_test)

rmse = mean_squared_error(y_test, prediction)

print("RMSE on test data : ", rmse)
print("Score with test data",reg_nn.score(x_test, y_test))
