In [1]:
import numpy as np
from scipy.io import arff
from io import StringIO
import sklearn
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
from sklearn.metrics import mean_squared_error
import pandas as pd

In [2]:
def read(a):
    """Load a comma-separated file and return its contents as a float64 array.

    Parameters
    ----------
    a : path or file-like object
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    numpy.ndarray
        2-D array holding every column of the file cast to ``np.float64``.
    """
    frame = pd.read_csv(a, sep=',')
    return frame.astype(np.float64).to_numpy()

In [3]:
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params' entries, indexed by candidate position.
    n_top : int, default 3
        Ranks 1..n_top are reported.

    For every rank, at most three tied candidates are printed.
    """
    max_ties_shown = 3
    for rank in range(1, n_top + 1):
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied[:max_ties_shown]:
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")



def randomCV(clf, X, y, param_grid, n_iter, cv):
    """Run a randomized hyper-parameter search and return the best setting.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn estimator to tune.
    X, y : array-like
        Training features and target passed to ``RandomizedSearchCV.fit``.
    param_grid : dict
        Forwarded as ``param_distributions``.
    n_iter : int
        Number of parameter settings to sample.
    cv : int or CV splitter
        Forwarded as ``cv``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search. A summary of the top
        candidates is printed via ``report`` as a side effect.
    """
    import inspect  # local import: only needed for the signature check below

    search_kwargs = {
        "param_distributions": param_grid,
        "n_iter": n_iter,
        "cv": cv,
    }
    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where
    # passing it raises TypeError. Keep the original iid=False only on
    # versions whose constructor still accepts it.
    if "iid" in inspect.signature(RandomizedSearchCV.__init__).parameters:
        search_kwargs["iid"] = False

    random_search = RandomizedSearchCV(clf, **search_kwargs)
    random_search.fit(X, y)
    report(random_search.cv_results_)
    return random_search.best_params_

def Rf(x, y):
    """Tune a RandomForestRegressor on (x, y) and return the best parameters.

    Only ``n_estimators`` (2..49) and ``max_depth`` (1..5) are searched;
    50 settings are sampled and ``cv=6`` is passed to ``randomCV``.
    """
    search_space = dict(
        n_estimators=np.arange(2, 50),
        max_depth=np.arange(1, 6),
    )
    forest = RandomForestRegressor()
    return randomCV(forest, x, y, search_space, 50, 6)


def Dt(x, y):
    """Tune a DecisionTreeRegressor on (x, y) and return the best parameters.

    Searches ``max_depth`` (1..5), ``criterion`` and ``splitter``;
    5 settings are sampled and ``cv=6`` is passed to ``randomCV``.
    """
    # NOTE(review): newer scikit-learn releases renamed 'mse'/'mae' to
    # 'squared_error'/'absolute_error' -- confirm against the installed version.
    split_criteria = ['mse', 'mae', 'friedman_mse']
    search_space = dict(
        max_depth=np.arange(1, 6),
        criterion=split_criteria,
        splitter=['best', 'random'],
    )
    tree = DecisionTreeRegressor()
    return randomCV(tree, x, y, search_space, 5, 6)

def Svr(x, y):
    """Tune an SVR on (x, y) and return the best parameters.

    Searches ``kernel``, ``gamma`` and ``shrinking``; 4 settings are
    sampled and ``cv=6`` is passed to ``randomCV``.
    """
    search_space = dict(
        kernel=['poly', 'rbf', 'linear', 'sigmoid'],
        gamma=['scale', 'auto'],
        shrinking=[True, False],
    )
    regressor = svm.SVR()
    return randomCV(regressor, x, y, search_space, 4, 6)

def Ada(x, y):
    """Tune an AdaBoostRegressor on (x, y) and return the best parameters.

    Searches ``n_estimators`` (1..99) and ``loss``; 30 settings are
    sampled and ``cv=6`` is passed to ``randomCV``. ``learning_rate`` is
    not part of the search space.
    """
    search_space = dict(
        n_estimators=np.arange(1, 100),
        loss=['linear', 'square', 'exponential'],
    )
    booster = AdaBoostRegressor()
    return randomCV(booster, x, y, search_space, 30, 6)

def GP(x, y):
    """Tune a GaussianProcessRegressor on (x, y) and return the best parameters.

    Searches ``normalize_y``, ``copy_X_train`` and ``alpha``; 4 settings are
    sampled and ``cv=6`` is passed to ``randomCV``. The kernel is left at the
    estimator's default.
    """
    # alpha is added to the diagonal of the kernel matrix during fitting.
    # A value of exactly 0 removes that jitter and can make the Cholesky
    # factorisation fail on a singular kernel matrix, so the lowest grid point
    # is replaced by scikit-learn's default of 1e-10. All other grid values
    # are unchanged.
    alphas = np.linspace(0, 3, 100)
    alphas[0] = 1e-10

    clf = GaussianProcessRegressor()
    param_grid = {
        "normalize_y": [True, False],
        "copy_X_train": [True, False],
        "alpha": alphas,
    }
    return randomCV(clf, x, y, param_grid, 4, 6)

def LR(x, y):
    """Tune a LinearRegression on (x, y) and return the best parameters.

    Searches ``fit_intercept``, ``normalize`` and ``copy_X`` with ``cv=6``
    passed to ``randomCV``.
    """
    clf = LinearRegression()
    # NOTE(review): `normalize` is not accepted by recent scikit-learn
    # releases -- confirm against the installed version before upgrading.
    param_grid = {
        "fit_intercept": [True, False],
        "normalize": [True, False],
        "copy_X": [True, False],
    }
    # Every entry is a finite list, so the search space has only
    # 2 * 2 * 2 = 8 combinations. Asking for 25 samples makes the sampler
    # either raise or warn-and-truncate depending on the scikit-learn
    # version, so the request is capped at the size of the grid.
    n_candidates = 1
    for values in param_grid.values():
        n_candidates *= len(values)
    return randomCV(clf, x, y, param_grid, min(25, n_candidates), 6)

def NN(x, y):
    """Tune an MLPRegressor on (x, y) and return the best parameters.

    Searches ``hidden_layer_sizes`` (a single layer width from 1..19),
    ``activation``, ``solver``, ``learning_rate`` and ``shuffle``;
    30 settings are sampled and ``cv=6`` is passed to ``randomCV``.
    """
    search_space = dict(
        hidden_layer_sizes=np.arange(1, 20),
        activation=['identity', 'logistic', 'tanh', 'relu'],
        solver=['lbfgs', 'sgd', 'adam'],
        learning_rate=['constant', 'invscaling', 'adaptive'],
        shuffle=[True, False],
    )
    network = MLPRegressor()
    return randomCV(network, x, y, search_space, 30, 6)

In [4]:
SEED = 42             # fixed seed so the shuffle, and therefore the split, is reproducible
N_TEST_ROWS = 36240   # the first N_TEST_ROWS shuffled rows form the test set


def _fill_nan_with_column_mean(arr):
    """Replace every NaN in the 2-D array `arr`, in place, with the NaN-ignoring mean of its column."""
    col_mean = np.nanmean(arr, axis=0)
    nan_rows, nan_cols = np.where(np.isnan(arr))
    arr[nan_rows, nan_cols] = np.take(col_mean, nan_cols)
    return arr


data = read("segm.csv")

# Seed before shuffling: without it every kernel restart yields a different
# train/test split and the reported scores cannot be reproduced.
np.random.seed(SEED)
np.random.shuffle(data)

y = data[:, -4:]    # last four columns: regression targets
x = data[:, :13]    # first thirteen columns: features

# np.split returns [rows before N_TEST_ROWS, remaining rows]
x_test, x_train = np.split(x, [N_TEST_ROWS])
y_test, y_train = np.split(y, [N_TEST_ROWS])

# Scale features with statistics from the training rows only.
x_scaler = StandardScaler()
x_scaler.fit(x_train)
x_train = x_scaler.transform(x_train)
x_test = x_scaler.transform(x_test)

# Separate scaler for the targets so the feature scaler is not overwritten.
# The name `scaler` still refers to the target scaler after this cell.
scaler = StandardScaler()
scaler.fit(y_train)
y_train_nn = scaler.transform(y_train)
y_test_nn = scaler.transform(y_test)

# Impute remaining NaNs, each array with its own column means.
for _arr in (y_train, y_test, x_train, x_test, y_train_nn, y_test_nn):
    _fill_nan_with_column_mean(_arr)

In [None]:
# ---------------> Run for Decision Tree regressor
# Tune, fit and evaluate one decision tree per target column.
for j, target in enumerate(y_train.T):
    param = Dt(x_train, target)
    reg_tree = DecisionTreeRegressor().set_params(**param)
    reg_tree.fit(x_train, target)
    prediction = reg_tree.predict(x_test)

    # mean_squared_error returns the MSE; take the root so the printed
    # value actually matches its "RMSE" label.
    rmse = np.sqrt(mean_squared_error(y_test[:, j], prediction))
    print("RMSE on test data : ", rmse)

    print("Score with test data", reg_tree.score(x_test, y_test[:, j]))
    print('new data\n\n\n\n\n\n')


In [None]:
# Tune, fit and evaluate one random forest per target column.
for j, target in enumerate(y_train.T):
    param = Rf(x_train, target)
    reg_rf = RandomForestRegressor().set_params(**param)
    reg_rf.fit(x_train, target)
    prediction = reg_rf.predict(x_test)

    # mean_squared_error returns the MSE; take the root so the printed
    # value actually matches its "RMSE" label.
    rmse = np.sqrt(mean_squared_error(y_test[:, j], prediction))
    print("RMSE on test data : ", rmse)

    print("Score with test data", reg_rf.score(x_test, y_test[:, j]))
    print('new data\n\n\n\n\n\n')


In [None]:
# Tune, fit and evaluate one support vector regressor per target column.
for j, target in enumerate(y_train.T):
    param = Svr(x_train, target)
    reg_svr = svm.SVR().set_params(**param)
    reg_svr.fit(x_train, target)
    prediction = reg_svr.predict(x_test)

    # mean_squared_error returns the MSE; take the root so the printed
    # value actually matches its "RMSE" label.
    rmse = np.sqrt(mean_squared_error(y_test[:, j], prediction))
    print("RMSE on test data : ", rmse)

    print("Score with test data", reg_svr.score(x_test, y_test[:, j]))
    print('new data\n\n\n\n\n\n')


In [None]:
# Tune, fit and evaluate one AdaBoost regressor per target column.
for j, target in enumerate(y_train.T):
    param = Ada(x_train, target)
    reg_ada = AdaBoostRegressor().set_params(**param)
    reg_ada.fit(x_train, target)
    prediction = reg_ada.predict(x_test)

    # mean_squared_error returns the MSE; take the root so the printed
    # value actually matches its "RMSE" label.
    rmse = np.sqrt(mean_squared_error(y_test[:, j], prediction))
    print("RMSE on test data : ", rmse)

    print("Score with test data", reg_ada.score(x_test, y_test[:, j]))
    print('new data\n\n\n\n\n\n')


In [None]:
# Tune, fit and evaluate one linear regression per target column.
for j, target in enumerate(y_train.T):
    param = LR(x_train, target)
    reg_lr = LinearRegression().set_params(**param)
    reg_lr.fit(x_train, target)
    prediction = reg_lr.predict(x_test)

    # mean_squared_error returns the MSE; take the root so the printed
    # value actually matches its "RMSE" label.
    rmse = np.sqrt(mean_squared_error(y_test[:, j], prediction))
    print("RMSE on test data : ", rmse)

    print("Score with test data", reg_lr.score(x_test, y_test[:, j]))
    print('new data\n\n\n\n\n\n')


In [None]:
# Tune, fit and evaluate one Gaussian process regressor per target column.
for j, target in enumerate(y_train.T):
    param = GP(x_train, target)
    reg_gp = GaussianProcessRegressor().set_params(**param)
    reg_gp.fit(x_train, target)
    prediction = reg_gp.predict(x_test)

    # mean_squared_error returns the MSE; take the root so the printed
    # value actually matches its "RMSE" label.
    rmse = np.sqrt(mean_squared_error(y_test[:, j], prediction))
    print("RMSE on test data : ", rmse)

    print("Score with test data", reg_gp.score(x_test, y_test[:, j]))
    print('new data\n\n\n\n\n\n')
