In [305]:
import numpy as np
from scipy.io import arff
from io import StringIO
import sklearn
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
from sklearn.metrics import mean_squared_error
import pandas as pd

In [306]:
def read(a):
    b = pd.read_csv(a, delimiter = ';')
    le = preprocessing.LabelEncoder()
    b['Type'] = le.fit_transform(b['Type'])
    b = b.astype(np.float64)
    return b.to_numpy()

In [307]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        j = 0
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            if j > 1:
                break
            j+=1



def randomCV(clf, X, y, param_grid, n_iter, cv):
    random_search = RandomizedSearchCV(clf, param_distributions = param_grid,
          n_iter = n_iter, cv = cv, iid = False)
    random_search.fit(X, y)
    report(random_search.cv_results_)
    return random_search.best_params_

def Rf(x, y):
  clf = RandomForestRegressor()
  param_grid = {
        "n_estimators" : np.arange(2,50),
        "max_depth" : np.arange(1,6),

    # "criterion" : ['mse', 'mae'],
    # "min_samples_split" : np.random.random_sample((100,)),      
    # "min_samples_split" : np.linspace(0.01,1, num = 1000),
    # "min_samples_leaf" : np.linspace(0.01,0.5, num = 100),
    # "bootstrap" : [True, False],
    # "warm_start" : [True, False]
  }
  return randomCV(clf, x, y, param_grid, 50, 6)  


def Dt(x, y):
  clf = DecisionTreeRegressor()
  param_grid = {
      "max_depth" : np.arange(1,6),
      "min_samples_split" : np.linspace(0.01,0.5, num = 1000),
      "min_samples_leaf" : np.linspace(0.01,0.5, num = 1000),

      # "criterion" : ['mse', 'mae', 'friedman_mse'],
      # "splitter" : ['best', 'random'],
  }
  return randomCV(clf, x, y, param_grid, 400, 6)  

def Svr(x, y):
  clf = svm.SVR()
  param_grid = {
      "kernel" : ['poly', 'rbf', 'linear', 'sigmoid'],
      "gamma" : ['scale', 'auto'],
      "shrinking" : [True, False]
  }
  return randomCV(clf, x, y, param_grid, 4, 6)

def Ada(x, y):
  clf = AdaBoostRegressor()
  param_grid = {
      "n_estimators" : np.arange(1,100),
      "loss" : ['linear', 'square', 'exponential'],
      # "learning_rate" : np.arange(1,)
  }
  return randomCV(clf, x, y, param_grid, 30, 6)

def GP(x, y):
  clf = GaussianProcessRegressor()
  param_grid = {
#       "kernel" : [RBF, WhiteKernel],
      "normalize_y" : [True, False],
      "copy_X_train" : [True, False],
#       "alpha" : np.linspace(0, 3, 100),
      
  }
  return randomCV(clf, x, y, param_grid, 4, 6)

def LR(x, y):
  clf = LinearRegression()
  param_grid = {
      "fit_intercept" : [True, False],
      "normalize" : [True, False],
      "copy_X" : [True, False],
  }
  return randomCV(clf, x, y, param_grid, 25, 6)

def NN(x, y):
  clf = MLPRegressor()
  param_grid = {
      "hidden_layer_sizes" : np.arange(1,200),
      "activation" : ['identity', 'logistic', 'tanh', 'relu'],
      "solver" : ['lbfgs', 'sgd', 'adam'],
      "learning_rate" : ['constant', 'invscaling', 'adaptive'],
      "shuffle" : [True, False],
      "alpha" : np.random.uniform(0.000001, 1, 1000)
  }
  return randomCV(clf, x, y, param_grid, 30, 6)

In [308]:
# ---------------> Run for data preprocessing

data = read("5facebook.csv")

np.random.shuffle(data)

y = data[:,-12:]
x = data[:,:6]

x_test, x_train = np.split(x, [75])
y_test, y_train = np.split(y, [75])

scaler = StandardScaler()                         # scaling features
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

scaler.fit(y_train)
y_train_nn = scaler.transform(y_train)
y_test_nn = scaler.transform(y_test)


# print(y_train.shape, x_train.shape)

# np.where(np.isnan(y_train))

# y_train = np.nan_to_num(y_train)

col_mean = np.nanmean(y_train, axis=0)
inds = np.where(np.isnan(y_train))
y_train[inds] = np.take(col_mean, inds[1])

col_mean = np.nanmean(y_test, axis=0)
inds = np.where(np.isnan(y_test))
y_test[inds] = np.take(col_mean, inds[1])

col_mean = np.nanmean(x_train, axis=0)
inds = np.where(np.isnan(x_train))
x_train[inds] = np.take(col_mean, inds[1])

col_mean = np.nanmean(x_test, axis=0)
inds = np.where(np.isnan(x_test))
x_test[inds] = np.take(col_mean, inds[1])

col_mean = np.nanmean(y_train_nn, axis=0)
inds = np.where(np.isnan(y_train_nn))
y_train_nn[inds] = np.take(col_mean, inds[1])

col_mean = np.nanmean(y_test_nn, axis=0)
inds = np.where(np.isnan(y_test_nn))
y_test_nn[inds] = np.take(col_mean, inds[1])


In [309]:
# ---------------> Run for Decision Tree regresssor
j = 0
for i in y_train.T:
    param = Dt(x_train, i)
    reg_tree = DecisionTreeRegressor().set_params(**param)
    reg_tree.fit(x_train, i)
    prediction = reg_tree.predict(x_test)

    rmse = mean_squared_error(y_test[:,j], prediction)
    print("RMSE on test data : ", rmse)

    print("Score with test data",reg_tree.score(x_test, y_test[:,j]))
    print('new data\n\n\n\n\n\n')
    j+=1


Model with rank: 1
Mean validation score: 0.017 (std: 0.054)
Parameters: {'min_samples_split': 0.057577577577577574, 'min_samples_leaf': 0.22189189189189187, 'max_depth': 5}

Model with rank: 1
Mean validation score: 0.017 (std: 0.054)
Parameters: {'min_samples_split': 0.08798798798798797, 'min_samples_leaf': 0.22091091091091092, 'max_depth': 3}

Model with rank: 3
Mean validation score: 0.016 (std: 0.052)
Parameters: {'min_samples_split': 0.06395395395395395, 'min_samples_leaf': 0.22924924924924925, 'max_depth': 2}

RMSE on test data :  504339950.8519739
Score with test data 0.06094882390042233
new data






Model with rank: 1
Mean validation score: -0.002 (std: 0.047)
Parameters: {'min_samples_split': 0.31214214214214214, 'min_samples_leaf': 0.17186186186186186, 'max_depth': 4}

Model with rank: 2
Mean validation score: -0.002 (std: 0.049)
Parameters: {'min_samples_split': 0.23856856856856856, 'min_samples_leaf': 0.17480480480480481, 'max_depth': 4}

Model with rank: 3
Mean validati

In [310]:
# ---------------> Run for Random Forest regressor
j = 0
for i in y_train.T:
    param = Rf(x_train, i)
    reg_rf = RandomForestRegressor().set_params(**param)
    reg_rf.fit(x_train, i)
    prediction = reg_rf.predict(x_test)

    rmse = mean_squared_error(y_test[:,j], prediction)
    print("RMSE on test data : ", rmse)

    print("Score with test data",reg_rf.score(x_test, y_test[:,j]))
    print('new data\n\n\n\n\n\n')
    j+=1


Model with rank: 1
Mean validation score: -0.007 (std: 0.045)
Parameters: {'n_estimators': 33, 'max_depth': 2}

Model with rank: 2
Mean validation score: -0.009 (std: 0.036)
Parameters: {'n_estimators': 39, 'max_depth': 2}

Model with rank: 3
Mean validation score: -0.011 (std: 0.049)
Parameters: {'n_estimators': 32, 'max_depth': 2}

RMSE on test data :  505254576.43315595
Score with test data 0.05924584513332509
new data






Model with rank: 1
Mean validation score: -0.027 (std: 0.066)
Parameters: {'n_estimators': 32, 'max_depth': 1}

Model with rank: 2
Mean validation score: -0.030 (std: 0.071)
Parameters: {'n_estimators': 20, 'max_depth': 1}

Model with rank: 3
Mean validation score: -0.034 (std: 0.071)
Parameters: {'n_estimators': 44, 'max_depth': 1}

RMSE on test data :  4347102240.638041
Score with test data 0.03323388805555694
new data






Model with rank: 1
Mean validation score: 0.122 (std: 0.159)
Parameters: {'n_estimators': 6, 'max_depth': 2}

Model with rank: 2
Mean val

In [311]:
# ---------------> Run for Support Vector regressor
j = 0
for i in y_train.T:
    param = Svr(x_train, i)
    reg_svr = svm.SVR().set_params(**param)
    reg_svr.fit(x_train, i)
    prediction = reg_svr.predict(x_test)

    rmse = mean_squared_error(y_test[:,j], prediction)
    print("RMSE on test data : ", rmse)

    print("Score with test data",reg_svr.score(x_test, y_test[:,j]))
    print('new data\n\n\n\n\n\n')
    j+=1


Model with rank: 1
Mean validation score: -0.157 (std: 0.036)
Parameters: {'shrinking': False, 'kernel': 'sigmoid', 'gamma': 'scale'}

Model with rank: 1
Mean validation score: -0.157 (std: 0.036)
Parameters: {'shrinking': True, 'kernel': 'sigmoid', 'gamma': 'scale'}

Model with rank: 3
Mean validation score: -0.157 (std: 0.036)
Parameters: {'shrinking': False, 'kernel': 'rbf', 'gamma': 'auto'}

RMSE on test data :  614875570.0723639
Score with test data -0.1448619651406935
new data






Model with rank: 1
Mean validation score: -0.106 (std: 0.045)
Parameters: {'shrinking': True, 'kernel': 'poly', 'gamma': 'auto'}

Model with rank: 1
Mean validation score: -0.106 (std: 0.045)
Parameters: {'shrinking': False, 'kernel': 'poly', 'gamma': 'auto'}

Model with rank: 3
Mean validation score: -0.106 (std: 0.046)
Parameters: {'shrinking': False, 'kernel': 'sigmoid', 'gamma': 'scale'}

Model with rank: 3
Mean validation score: -0.106 (std: 0.046)
Parameters: {'shrinking': True, 'kernel': 'sigmo

In [312]:
# ---------------> Run for Adaboost regressor
j = 0
for i in y_train.T:
    param = Ada(x_train, i)
    reg_ada = AdaBoostRegressor().set_params(**param)
    reg_ada.fit(x_train, i)
    prediction = reg_ada.predict(x_test)

    rmse = mean_squared_error(y_test[:,j], prediction)
    print("RMSE on test data : ", rmse)

    print("Score with test data",reg_ada.score(x_test, y_test[:,j]))
    print('new data\n\n\n\n\n\n')
    j+=1


Model with rank: 1
Mean validation score: -0.075 (std: 0.155)
Parameters: {'n_estimators': 3, 'loss': 'exponential'}

Model with rank: 2
Mean validation score: -0.292 (std: 0.236)
Parameters: {'n_estimators': 10, 'loss': 'exponential'}

Model with rank: 3
Mean validation score: -0.321 (std: 0.344)
Parameters: {'n_estimators': 11, 'loss': 'exponential'}

RMSE on test data :  574229304.267181
Score with test data -0.06918102088090428
new data






Model with rank: 1
Mean validation score: -0.238 (std: 0.353)
Parameters: {'n_estimators': 3, 'loss': 'exponential'}

Model with rank: 2
Mean validation score: -0.292 (std: 0.267)
Parameters: {'n_estimators': 22, 'loss': 'square'}

Model with rank: 3
Mean validation score: -0.297 (std: 0.196)
Parameters: {'n_estimators': 60, 'loss': 'square'}

RMSE on test data :  4442767629.093121
Score with test data 0.01195855324061168
new data






Model with rank: 1
Mean validation score: 0.003 (std: 0.317)
Parameters: {'n_estimators': 9, 'loss': 'linear

In [313]:
# ---------------> Run for Linear regressor
j = 0
for i in y_train.T:
    param = LR(x_train, i)
    reg_lr = LinearRegression().set_params(**param)
    reg_lr.fit(x_train, i)
    prediction = reg_lr.predict(x_test)

    rmse = mean_squared_error(y_test[:,j], prediction)
    print("RMSE on test data : ", rmse)

    print("Score with test data",reg_lr.score(x_test, y_test[:,j]))
    print('new data\n\n\n\n\n\n')
    j+=1


Model with rank: 1
Mean validation score: 0.006 (std: 0.057)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.006 (std: 0.057)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

Model with rank: 3
Mean validation score: 0.006 (std: 0.057)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 3
Mean validation score: 0.006 (std: 0.057)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': False}

RMSE on test data :  503213534.22277963
Score with test data 0.0630461450795915
new data






Model with rank: 1
Mean validation score: -0.028 (std: 0.090)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: -0.028 (std: 0.090)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': False}

Model with rank: 3
Mean validation score: -0.028 (std: 0.090)
Parameters: {'normalize': True, 'fit



Score with test data 0.03626028423078398
new data






Model with rank: 1
Mean validation score: 0.119 (std: 0.204)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.119 (std: 0.204)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': False}

Model with rank: 3
Mean validation score: 0.119 (std: 0.204)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 3
Mean validation score: 0.119 (std: 0.204)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

RMSE on test data :  317598.596795071
Score with test data 0.14377016397352882
new data






Model with rank: 1
Mean validation score: 0.149 (std: 0.236)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.149 (std: 0.236)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

Model with rank: 3
Mean validation score: 0.149



RMSE on test data :  235099.18398144678
Score with test data 0.17418970352538388
new data






Model with rank: 1
Mean validation score: 0.124 (std: 0.082)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.124 (std: 0.082)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

Model with rank: 3
Mean validation score: 0.124 (std: 0.082)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 3
Mean validation score: 0.124 (std: 0.082)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': False}

RMSE on test data :  1168145.742326366
Score with test data 0.26089266682966117
new data










Model with rank: 1
Mean validation score: -0.179 (std: 0.379)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: -0.179 (std: 0.379)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': False}

Model with rank: 3
Mean validation score: -0.179 (std: 0.379)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 3
Mean validation score: -0.179 (std: 0.379)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

RMSE on test data :  444967000.7948783
Score with test data -0.06957818812980476
new data






Model with rank: 1
Mean validation score: 0.052 (std: 0.036)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.052 (std: 0.036)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

Model with rank: 3
Mean validation score: 0.052 (std: 0.036)
Parameters: {'normalize': False, 'fi



Model with rank: 1
Mean validation score: -0.069 (std: 0.142)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: -0.069 (std: 0.142)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': False}

Model with rank: 3
Mean validation score: -0.069 (std: 0.142)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 3
Mean validation score: -0.069 (std: 0.142)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

RMSE on test data :  50.79056761174663
Score with test data -0.40150572880095536
new data






Model with rank: 1
Mean validation score: -0.008 (std: 0.045)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: -0.008 (std: 0.045)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': False}

Model with rank: 3
Mean validation score: -0.008 (std: 0.045)
Parameters: {'normalize': True,



In [314]:
# ---------------> Run for Gaussian Process regressor
j = 0
for i in y_train.T:
    param = GP(x_train, i)
    reg_gp = GaussianProcessRegressor().set_params(**param)
    reg_gp.fit(x_train, i)
    prediction = reg_gp.predict(x_test)

    rmse = mean_squared_error(y_test[:,j], prediction)
    print("RMSE on test data : ", rmse)

    print("Score with test data",reg_gp.score(x_test, y_test[:,j]))
    print('new data\n\n\n\n\n\n')
    j+=1


Model with rank: 1
Mean validation score: -14.197 (std: 8.986)
Parameters: {'normalize_y': False, 'copy_X_train': True}

Model with rank: 1
Mean validation score: -14.197 (std: 8.986)
Parameters: {'normalize_y': False, 'copy_X_train': False}

Model with rank: 3
Mean validation score: -14.243 (std: 9.032)
Parameters: {'normalize_y': True, 'copy_X_train': True}

Model with rank: 3
Mean validation score: -14.243 (std: 9.032)
Parameters: {'normalize_y': True, 'copy_X_train': False}

RMSE on test data :  7020004231.517602
Score with test data -12.070832914772284
new data






Model with rank: 1
Mean validation score: -8.822 (std: 6.643)
Parameters: {'normalize_y': False, 'copy_X_train': True}

Model with rank: 1
Mean validation score: -8.822 (std: 6.643)
Parameters: {'normalize_y': False, 'copy_X_train': False}

Model with rank: 3
Mean validation score: -8.837 (std: 6.680)
Parameters: {'normalize_y': True, 'copy_X_train': True}

Model with rank: 3
Mean validation score: -8.837 (std: 6.680)

In [315]:
# ---------------> Run for Neural Net regressor
j = 0
for i in y_train_nn.T:
    param = NN(x_train, i)
    reg_nn = MLPRegressor().set_params(**param)
    reg_nn.fit(x_train, i)
    prediction = reg_nn.predict(x_test)

    rmse = mean_squared_error(y_test_nn[:,j], prediction)
    print("RMSE on test data : ", rmse)

    print("Score with test data",reg_nn.score(x_test, y_test_nn[:,j]))
    print('\n\n\n\n\n\n')
    j+=1


Model with rank: 1
Mean validation score: 0.010 (std: 0.023)
Parameters: {'solver': 'adam', 'shuffle': True, 'learning_rate': 'invscaling', 'hidden_layer_sizes': 12, 'alpha': 0.9334986308285207, 'activation': 'logistic'}

Model with rank: 2
Mean validation score: 0.008 (std: 0.055)
Parameters: {'solver': 'adam', 'shuffle': True, 'learning_rate': 'invscaling', 'hidden_layer_sizes': 198, 'alpha': 0.1974228227098902, 'activation': 'identity'}

Model with rank: 3
Mean validation score: 0.006 (std: 0.049)
Parameters: {'solver': 'sgd', 'shuffle': False, 'learning_rate': 'adaptive', 'hidden_layer_sizes': 73, 'alpha': 0.5407697900045038, 'activation': 'tanh'}

RMSE on test data :  1.000791616181798
Score with test data 0.045186996019682346







Model with rank: 1
Mean validation score: -0.011 (std: 0.067)
Parameters: {'solver': 'adam', 'shuffle': True, 'learning_rate': 'constant', 'hidden_layer_sizes': 137, 'alpha': 0.8793447434794246, 'activation': 'logistic'}

Model with rank: 2
Mean valid

Model with rank: 1
Mean validation score: 0.011 (std: 0.027)
Parameters: {'solver': 'adam', 'shuffle': True, 'learning_rate': 'invscaling', 'hidden_layer_sizes': 15, 'alpha': 0.9679528262015413, 'activation': 'logistic'}

Model with rank: 2
Mean validation score: 0.006 (std: 0.039)
Parameters: {'solver': 'adam', 'shuffle': False, 'learning_rate': 'constant', 'hidden_layer_sizes': 159, 'alpha': 0.7698670360141316, 'activation': 'logistic'}

Model with rank: 3
Mean validation score: 0.003 (std: 0.048)
Parameters: {'solver': 'adam', 'shuffle': False, 'learning_rate': 'invscaling', 'hidden_layer_sizes': 136, 'alpha': 0.5200468122752874, 'activation': 'tanh'}

RMSE on test data :  0.26515294650622623
Score with test data 0.05071908126896163







