In [2]:
import numpy as np
from scipy.io import arff
from io import StringIO
import sklearn
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
from sklearn.metrics import mean_squared_error
import pandas as pd

In [3]:
def read(a):
    b = pd.read_csv(a, delimiter = ',')
    b = b.astype(np.float64)
    return b.to_numpy()

In [4]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        j = 0
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            if j > 1:
                break
            j+=1



def randomCV(clf, X, y, param_grid, n_iter, cv):
    random_search = RandomizedSearchCV(clf, param_distributions = param_grid,
          n_iter = n_iter, cv = cv, iid = False)
    random_search.fit(X, y)
    report(random_search.cv_results_)
    return random_search.best_params_

def Rf(x, y):
  clf = RandomForestRegressor()
  param_grid = {
        "n_estimators" : np.arange(2,50),
        "max_depth" : np.arange(1,6),

    # "criterion" : ['mse', 'mae'],
    # "min_samples_split" : np.random.random_sample((100,)),      
    # "min_samples_split" : np.linspace(0.01,1, num = 1000),
    # "min_samples_leaf" : np.linspace(0.01,0.5, num = 100),
    # "bootstrap" : [True, False],
    # "warm_start" : [True, False]
  }
  return randomCV(clf, x, y, param_grid, 50, 6)  


def Dt(x, y):
  clf = DecisionTreeRegressor()
  param_grid = {
      "max_depth" : np.arange(1,6),
#       "min_samples_split" : np.linspace(0.01,0.5, num = 1000),
#       "min_samples_leaf" : np.linspace(0.01,0.5, num = 1000),
      "criterion" : ['mse', 'mae', 'friedman_mse'],
      "splitter" : ['best', 'random'],
  }
  return randomCV(clf, x, y, param_grid, 5, 4)  

def Svr(x, y):
  clf = svm.SVR()
  param_grid = {
      "kernel" : ['poly', 'rbf', 'linear', 'sigmoid'],
      "gamma" : ['scale', 'auto'],
      "shrinking" : [True, False]
  }
  return randomCV(clf, x, y, param_grid, 4, 6)

def Ada(x, y):
  clf = AdaBoostRegressor()
  param_grid = {
      "n_estimators" : np.arange(1,100),
      "loss" : ['linear', 'square', 'exponential'],
      # "learning_rate" : np.arange(1,)
  }
  return randomCV(clf, x, y, param_grid, 30, 6)

def GP(x, y):
  clf = GaussianProcessRegressor()
  param_grid = {
#       "kernel" : [RBF, WhiteKernel],
      "normalize_y" : [True, False],
      "copy_X_train" : [True, False],
      "alpha" : np.linspace(0, 3, 100),
      
  }
  return randomCV(clf, x, y, param_grid, 4, 6)

def LR(x, y):
  clf = LinearRegression()
  param_grid = {
      "fit_intercept" : [True, False],
      "normalize" : [True, False],
      "copy_X" : [True, False],
  }
  return randomCV(clf, x, y, param_grid, 25, 6)

def NN(x, y):
  clf = MLPRegressor()
  param_grid = {
      "hidden_layer_sizes" : np.arange(1,20),
      "activation" : ['identity', 'logistic', 'tanh', 'relu'],
      "solver" : ['lbfgs', 'sgd', 'adam'],
      "learning_rate" : ['constant', 'invscaling', 'adaptive'],
      "shuffle" : [True, False],
  }
  return randomCV(clf, x, y, param_grid, 30, 6)

In [16]:
data = read("segm.csv")

np.random.shuffle(data)

y = data[:,-4:]
x = data[:,:13]

x_test, x_train = np.split(x, [36240])
y_test, y_train = np.split(y, [36240])

scaler = StandardScaler()                         # scaling features
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

scaler.fit(y_train)
y_train_nn = scaler.transform(y_train)
y_test_nn = scaler.transform(y_test)


# print(y_train.shape, x_train.shape)

# np.where(np.isnan(y_train))

# y_train = np.nan_to_num(y_train)

col_mean = np.nanmean(y_train, axis=0)
inds = np.where(np.isnan(y_train))
y_train[inds] = np.take(col_mean, inds[1])

col_mean = np.nanmean(y_test, axis=0)
inds = np.where(np.isnan(y_test))
y_test[inds] = np.take(col_mean, inds[1])

col_mean = np.nanmean(x_train, axis=0)
inds = np.where(np.isnan(x_train))
x_train[inds] = np.take(col_mean, inds[1])

col_mean = np.nanmean(x_test, axis=0)
inds = np.where(np.isnan(x_test))
x_test[inds] = np.take(col_mean, inds[1])

col_mean = np.nanmean(y_train_nn, axis=0)
inds = np.where(np.isnan(y_train_nn))
y_train_nn[inds] = np.take(col_mean, inds[1])

col_mean = np.nanmean(y_test_nn, axis=0)
inds = np.where(np.isnan(y_test_nn))
y_test_nn[inds] = np.take(col_mean, inds[1])

x_train = x_train[:20000]
y_train = y_train[:20000]
y_train_nn = y_train_nn[:20000]

x_test = x_test[:3000]
y_test = y_test[:3000]
y_test_nn = y_test_nn[:3000]

In [18]:
# ---------------> Run for Decision Tree regressor
j = 0
for i in y_train.T:
    param = Dt(x_train, i)
    reg_tree = DecisionTreeRegressor().set_params(**param)
    reg_tree.fit(x_train, i)
    prediction = reg_tree.predict(x_test)

    rmse = mean_squared_error(y_test[:,j], prediction)
    print("RMSE on test data : ", rmse)

    print("Score with test data",reg_tree.score(x_test, y_test[:,j]))
    print('new data\n\n\n\n\n\n')
    j+=1


Model with rank: 1
Mean validation score: 0.600 (std: 0.010)
Parameters: {'splitter': 'best', 'max_depth': 4, 'criterion': 'friedman_mse'}

Model with rank: 1
Mean validation score: 0.600 (std: 0.010)
Parameters: {'splitter': 'best', 'max_depth': 4, 'criterion': 'mse'}

Model with rank: 3
Mean validation score: 0.113 (std: 0.004)
Parameters: {'splitter': 'best', 'max_depth': 1, 'criterion': 'friedman_mse'}

RMSE on test data :  51227.07031289116
Score with test data 0.597846380528909
new data






Model with rank: 1
Mean validation score: 0.634 (std: 0.079)
Parameters: {'splitter': 'random', 'max_depth': 5, 'criterion': 'friedman_mse'}

Model with rank: 2
Mean validation score: 0.485 (std: 0.084)
Parameters: {'splitter': 'random', 'max_depth': 4, 'criterion': 'friedman_mse'}

Model with rank: 3
Mean validation score: 0.388 (std: 0.036)
Parameters: {'splitter': 'best', 'max_depth': 4, 'criterion': 'mae'}

RMSE on test data :  46499.64659619644
Score with test data 0.633729906581598
new

In [19]:
j = 0
for i in y_train.T:
    param = Rf(x_train, i)
    reg_rf = RandomForestRegressor().set_params(**param)
    reg_rf.fit(x_train, i)
    prediction = reg_rf.predict(x_test)

    rmse = mean_squared_error(y_test[:,j], prediction)
    print("RMSE on test data : ", rmse)

    print("Score with test data",reg_rf.score(x_test, y_test[:,j]))
    print('new data\n\n\n\n\n\n')
    j+=1


Model with rank: 1
Mean validation score: 0.777 (std: 0.011)
Parameters: {'n_estimators': 10, 'max_depth': 5}

Model with rank: 2
Mean validation score: 0.777 (std: 0.010)
Parameters: {'n_estimators': 35, 'max_depth': 5}

Model with rank: 3
Mean validation score: 0.776 (std: 0.011)
Parameters: {'n_estimators': 32, 'max_depth': 5}

RMSE on test data :  29124.019599508592
Score with test data 0.7713644402470934
new data






Model with rank: 1
Mean validation score: 0.777 (std: 0.009)
Parameters: {'n_estimators': 39, 'max_depth': 5}

Model with rank: 2
Mean validation score: 0.777 (std: 0.010)
Parameters: {'n_estimators': 19, 'max_depth': 5}

Model with rank: 3
Mean validation score: 0.776 (std: 0.010)
Parameters: {'n_estimators': 47, 'max_depth': 5}

RMSE on test data :  30182.30939080593
Score with test data 0.76225889680078
new data






Model with rank: 1
Mean validation score: 0.777 (std: 0.011)
Parameters: {'n_estimators': 22, 'max_depth': 5}

Model with rank: 2
Mean validation s

In [20]:
j = 0
for i in y_train.T:
    param = Svr(x_train, i)
    reg_svr = svm.SVR().set_params(**param)
    reg_svr.fit(x_train, i)
    prediction = reg_svr.predict(x_test)

    rmse = mean_squared_error(y_test[:,j], prediction)
    print("RMSE on test data : ", rmse)

    print("Score with test data",reg_svr.score(x_test, y_test[:,j]))
    print('new data\n\n\n\n\n\n')
    j+=1


Model with rank: 1
Mean validation score: 0.235 (std: 0.011)
Parameters: {'shrinking': True, 'kernel': 'linear', 'gamma': 'scale'}

Model with rank: 2
Mean validation score: 0.209 (std: 0.012)
Parameters: {'shrinking': True, 'kernel': 'rbf', 'gamma': 'scale'}

Model with rank: 3
Mean validation score: 0.166 (std: 0.009)
Parameters: {'shrinking': True, 'kernel': 'sigmoid', 'gamma': 'auto'}

RMSE on test data :  96323.17390262117
Score with test data 0.24382337722455238
new data






Model with rank: 1
Mean validation score: 0.234 (std: 0.011)
Parameters: {'shrinking': True, 'kernel': 'linear', 'gamma': 'scale'}

Model with rank: 1
Mean validation score: 0.234 (std: 0.011)
Parameters: {'shrinking': True, 'kernel': 'linear', 'gamma': 'auto'}

Model with rank: 3
Mean validation score: 0.166 (std: 0.009)
Parameters: {'shrinking': True, 'kernel': 'sigmoid', 'gamma': 'auto'}

RMSE on test data :  96068.89338832871
Score with test data 0.24328107562813428
new data






Model with rank: 1
Mea

In [21]:
j = 0
for i in y_train.T:
    param = Ada(x_train, i)
    reg_ada = AdaBoostRegressor().set_params(**param)
    reg_ada.fit(x_train, i)
    prediction = reg_ada.predict(x_test)

    rmse = mean_squared_error(y_test[:,j], prediction)
    print("RMSE on test data : ", rmse)

    print("Score with test data",reg_ada.score(x_test, y_test[:,j]))
    print('new data\n\n\n\n\n\n')
    j+=1


Model with rank: 1
Mean validation score: 0.650 (std: 0.015)
Parameters: {'n_estimators': 11, 'loss': 'exponential'}

Model with rank: 2
Mean validation score: 0.635 (std: 0.060)
Parameters: {'n_estimators': 10, 'loss': 'square'}

Model with rank: 3
Mean validation score: 0.629 (std: 0.021)
Parameters: {'n_estimators': 7, 'loss': 'linear'}

RMSE on test data :  50474.88083966188
Score with test data 0.6037513779714642
new data






Model with rank: 1
Mean validation score: 0.664 (std: 0.027)
Parameters: {'n_estimators': 15, 'loss': 'linear'}

Model with rank: 2
Mean validation score: 0.652 (std: 0.027)
Parameters: {'n_estimators': 18, 'loss': 'linear'}

Model with rank: 3
Mean validation score: 0.610 (std: 0.027)
Parameters: {'n_estimators': 7, 'loss': 'exponential'}

RMSE on test data :  50272.186758007476
Score with test data 0.604014226170293
new data






Model with rank: 1
Mean validation score: 0.639 (std: 0.029)
Parameters: {'n_estimators': 16, 'loss': 'exponential'}

Model wi

In [22]:
j = 0
for i in y_train.T:
    param = LR(x_train, i)
    reg_lr = LinearRegression().set_params(**param)
    reg_lr.fit(x_train, i)
    prediction = reg_lr.predict(x_test)

    rmse = mean_squared_error(y_test[:,j], prediction)
    print("RMSE on test data : ", rmse)

    print("Score with test data",reg_lr.score(x_test, y_test[:,j]))
    print('new data\n\n\n\n\n\n')
    j+=1




Model with rank: 1
Mean validation score: 0.405 (std: 0.013)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.405 (std: 0.013)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.405 (std: 0.013)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

RMSE on test data :  75006.6588197098
Score with test data 0.41116680800719585
new data










Model with rank: 1
Mean validation score: 0.404 (std: 0.013)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.404 (std: 0.013)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

Model with rank: 3
Mean validation score: 0.404 (std: 0.013)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 3
Mean validation score: 0.404 (std: 0.013)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': False}

RMSE on test data :  74757.12077488816
Score with test data 0.4111504148044693
new data










Model with rank: 1
Mean validation score: 0.404 (std: 0.013)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.404 (std: 0.013)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.404 (std: 0.013)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

RMSE on test data :  74776.27710591424
Score with test data 0.4112669392544873
new data










Model with rank: 1
Mean validation score: 0.404 (std: 0.013)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.404 (std: 0.013)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.404 (std: 0.013)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

RMSE on test data :  74668.02655348669
Score with test data 0.4115040084123234
new data








In [18]:
j = 0
for i in y_train.T:
    param = GP(x_train, i)
    reg_gp = GaussianProcessRegressor().set_params(**param)
    reg_gp.fit(x_train, i)
    prediction = reg_gp.predict(x_test)

    rmse = mean_squared_error(y_test[:,j], prediction)
    print("RMSE on test data : ", rmse)

    print("Score with test data",reg_gp.score(x_test, y_test[:,j]))
    print('new data\n\n\n\n\n\n')
    j+=1


MemoryError: 

In [19]:
# # --------------->
for i in y_train_nn.T:
    print(x_train.shape, i.shape)
    param = NN(x_train, i)
    reg_nn = MLPRegressor().set_params(**param)
    reg_nn.fit(x_train, i)

    prediction = reg_nn.predict(x_test)

    rmse = mean_squared_error(y_test[:,j], prediction)

    print("RMSE on test data : ", rmse)
    print("Score with test data",reg_nn.score(x_test, y_test[:,j]))
    j+=1

(20000, 13) (20000,)
Model with rank: 1
Mean validation score: 0.888 (std: 0.006)
Parameters: {'solver': 'lbfgs', 'shuffle': False, 'learning_rate': 'adaptive', 'hidden_layer_sizes': 9, 'activation': 'logistic'}

Model with rank: 2
Mean validation score: 0.887 (std: 0.005)
Parameters: {'solver': 'lbfgs', 'shuffle': False, 'learning_rate': 'adaptive', 'hidden_layer_sizes': 12, 'activation': 'logistic'}

Model with rank: 3
Mean validation score: 0.886 (std: 0.005)
Parameters: {'solver': 'lbfgs', 'shuffle': False, 'learning_rate': 'constant', 'hidden_layer_sizes': 16, 'activation': 'tanh'}

RMSE on test data :  195772.58127573403
Score with test data -0.30667781955403384
(20000, 13) (20000,)
Model with rank: 1
Mean validation score: 0.887 (std: 0.004)
Parameters: {'solver': 'lbfgs', 'shuffle': True, 'learning_rate': 'adaptive', 'hidden_layer_sizes': 14, 'activation': 'logistic'}

Model with rank: 2
Mean validation score: 0.887 (std: 0.006)
Parameters: {'solver': 'lbfgs', 'shuffle': False,