In [7]:
import numpy as np
from scipy.io import arff
from io import StringIO
import sklearn
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
import pandas as pd


class reg4:
    def read(self, a):
      f = open(a, "r")
      c = StringIO(f.read())
      return np.loadtxt(c, delimiter=',')

    def report(self, results, n_top=3):
        print("\n\n\n\n\n")
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            j = 0
            for candidate in candidates:
                print("Model with rank: {0}".format(i))
                print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                      results['mean_test_score'][candidate],
                      results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")
                if j > 1:
                    break
                j+=1


    def randomCV(self, clf, X, y, param_grid, n_iter, cv):
        random_search = RandomizedSearchCV(clf, param_distributions = param_grid,
              n_iter = n_iter, cv = cv, iid = False, n_jobs = -1)
        random_search.fit(X, y)
        self.report(random_search.cv_results_)
        return random_search.best_params_

    def Rf(self, x, y):
      clf = RandomForestRegressor()
      param_grid = {
            "n_estimators" : np.arange(2,50),
            "max_depth" : np.arange(1,6),

        "criterion" : ['mse', 'mae'],
        "min_samples_split" : np.random.random_sample((100,)),      
        "min_samples_split" : np.linspace(0.01,1, num = 1000),
        "min_samples_leaf" : np.linspace(0.01,0.5, num = 100),
        "bootstrap" : [True, False],
        "warm_start" : [True, False]
      }
      return self.randomCV(clf, x, y, param_grid, 100, 6)  


    def Dt(self, x, y):
      clf = DecisionTreeRegressor()
      param_grid = {
          "max_depth" : np.arange(1,6),
          "min_samples_split" : np.linspace(0.01,0.5, num = 1000),
          "min_samples_leaf" : np.linspace(0.01,0.5, num = 1000),
          "criterion" : ['mse', 'mae', 'friedman_mse'],
          "splitter" : ['best', 'random'],
      }
      return self.randomCV(clf, x, y, param_grid, 400, 6)  

    def Svr(self, x, y):
      clf = svm.SVR()
      param_grid = {
          "kernel" : ['poly', 'rbf', 'linear', 'sigmoid'],
          "gamma" : ['scale', 'auto'],
          "shrinking" : [True, False]
      }
      return self.randomCV(clf, x, y, param_grid, 15, 6)

    def Ada(self, x, y):
      clf = AdaBoostRegressor()
      param_grid = {
          "n_estimators" : np.arange(1,100),
          "loss" : ['linear', 'square', 'exponential'],
          # "learning_rate" : np.arange(1,)
      }
      return self.randomCV(clf, x, y, param_grid, 250, 6)

    def GP(self, x, y):
      clf = GaussianProcessRegressor()
      param_grid = {
    #       "kernel" : ['RBF', 'WhiteKernel'],
          "normalize_y" : [True, False],
          "copy_X_train" : [True, False],
          "alpha" : np.linspace(0, 5, 100),

      }
      return self.randomCV(clf, x, y, param_grid, 400, 6)

    def LR(self, x, y):
      clf = LinearRegression()
      param_grid = {
          "fit_intercept" : [True, False],
          "normalize" : [True, False],
          "copy_X" : [True, False],
      }
      return self.randomCV(clf, x, y, param_grid, 25, 6)

    def NN(self, x, y):
      clf = MLPRegressor()
      param_grid = {
          "hidden_layer_sizes" : np.arange(1,200),
          "activation" : ['identity', 'logistic', 'tanh', 'relu'],
          "solver" : ['lbfgs', 'sgd', 'adam'],
          "learning_rate" : ['constant', 'invscaling', 'adaptive'],
          "shuffle" : [True, False],
      }
      return self.randomCV(clf, x, y, param_grid, 30, 6)

    def start(self):
        data = self.read("4train.txt")       # read data
        data = data[:,1:]                                 # remove id

        # np.random.shuffle(data)                           # shuffle for fairness

        y = data[:,-2:-1]                                 # separating prediction var
        x = data[:,:data.shape[1] -2]

        # temp = data[:,:data.shape[1] -2]

        # print(temp.shape)

        # x = temp[:,:3].mean(axis = 1)
        # # print(x.shape)

        # x = np.column_stack((x, temp[:,3:13].mean(axis = 1)))
        # x = np.column_stack((x, temp[:,13:17].mean(axis = 1)))
        # x = np.column_stack((x, temp[:,17:26].mean(axis = 1)))

        # print(x.shape)
        x_train, x_test = np.split(x, [940])              # separating test data
        y_train, y_test = np.split(y, [940])

        sc = SelectKBest(f_regression, k=20).fit(x_train, y_train)
        x_train = sc.transform(x_train)
        x_test = sc.transform(x_test)


        # print(x_test.shape)


        scaler = StandardScaler()                         # scaling features
        scaler.fit(x_train)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)

        # scaler.fit(y_train)
        # y_train = scaler.transform(y_train)
        # y_test = scaler.transform(y_test)
        
        # ---------------> 
        param = self.Dt(x_train, y_train)
        reg_tree = DecisionTreeRegressor().set_params(**param)
        reg_tree.fit(x_train, y_train)
        prediction = reg_tree.predict(x_test)

        rmse = mean_squared_error(y_test, prediction)
        print("RMSE on test data : ", rmse)

        print("Score with test data",reg_tree.score(x_test, y_test))
        
        
        # --------------->
        param = self.Rf(x_train, y_train)
        reg_rf = RandomForestRegressor().set_params(**param)
        reg_rf.fit(x_train, y_train)
        prediction = reg_rf.predict(x_test)

        rmse = mean_squared_error(y_test, prediction)
        print("RMSE on test data : ", rmse)

        print("Score with test data",reg_tree.score(x_test, y_test))
        
        
        # --------------->
        param = self.Ada(x_train, y_train)
        reg_ada = AdaBoostRegressor().set_params(**param)
        reg_ada.fit(x_train, y_train)
        prediction = reg_ada.predict(x_test)

        rmse = mean_squared_error(y_test, prediction)
        print("RMSE on test data : ", rmse)

        print("Score with test data",reg_ada.score(x_test, y_test))

        
        # --------------->
        param = self.GP(x_train, y_train)
        reg_gp = GaussianProcessRegressor().set_params(**param)
        reg_gp.fit(x_train, y_train)
        prediction = reg_gp.predict(x_test)

        rmse = mean_squared_error(y_test, prediction)
        print("RMSE on test data : ", rmse)

        print("Score with test data",reg_gp.score(x_test, y_test))
        
        
        # --------------->
        param = self.LR(x_train, y_train)
        reg_lr = LinearRegression().set_params(**param)
        reg_lr.fit(x_train, y_train)
        prediction = reg_lr.predict(x_test)

        rmse = mean_squared_error(y_test, prediction)
        print("RMSE on test data : ", rmse)

        print("Score with test data",reg_lr.score(x_test, y_test))

        
        # # --------------->
        param = self.NN(x_train, y_train)
        reg_nn = MLPRegressor().set_params(**param)
        reg_nn.fit(x_train, y_train)

        prediction = reg_nn.predict(x_test)

        rmse = mean_squared_error(y_test, prediction)

        print("RMSE on test data : ", rmse)
        print("Score with test data",reg_nn.score(x_test, y_test))
        
        
        # ---------------> Support vector regression
        param = self.Svr(x_train, y_train)
        reg_nn = svm.SVR().set_params(**param)
        reg_nn.fit(x_train, y_train)

        prediction = reg_nn.predict(x_test)

        rmse = mean_squared_error(y_test, prediction)

        print("RMSE on test data : ", rmse)
        print("Score with test data",reg_nn.score(x_test, y_test))
        
        
class reg5:
    def read(self, a):
        b = pd.read_csv(a, delimiter = ';')
        le = preprocessing.LabelEncoder()
        b['Type'] = le.fit_transform(b['Type'])
        b = b.astype(np.float64)
        return b.to_numpy()

    def report(self, results, n_top=3):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            j = 0
            for candidate in candidates:
                print("Model with rank: {0}".format(i))
                print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                      results['mean_test_score'][candidate],
                      results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")
                if j > 1:
                    break
                j+=1



    def randomCV(self, clf, X, y, param_grid, n_iter, cv):
        random_search = RandomizedSearchCV(clf, param_distributions = param_grid,
              n_iter = n_iter, cv = cv, iid = False)
        random_search.fit(X, y)
        self.report(random_search.cv_results_)
        return random_search.best_params_

    def Rf(self, x, y):
      clf = RandomForestRegressor()
      param_grid = {
            "n_estimators" : np.arange(2,50),
            "max_depth" : np.arange(1,6),

        # "criterion" : ['mse', 'mae'],
        # "min_samples_split" : np.random.random_sample((100,)),      
        # "min_samples_split" : np.linspace(0.01,1, num = 1000),
        # "min_samples_leaf" : np.linspace(0.01,0.5, num = 100),
        # "bootstrap" : [True, False],
        # "warm_start" : [True, False]
      }
      return self.randomCV(clf, x, y, param_grid, 50, 6)  


    def Dt(self, x, y):
      clf = DecisionTreeRegressor()
      param_grid = {
          "max_depth" : np.arange(1,6),
          "min_samples_split" : np.linspace(0.01,0.5, num = 1000),
          "min_samples_leaf" : np.linspace(0.01,0.5, num = 1000),

          # "criterion" : ['mse', 'mae', 'friedman_mse'],
          # "splitter" : ['best', 'random'],
      }
      return self.randomCV(clf, x, y, param_grid, 400, 6)  

    def Svr(self, x, y):
      clf = svm.SVR()
      param_grid = {
          "kernel" : ['poly', 'rbf', 'linear', 'sigmoid'],
          "gamma" : ['scale', 'auto'],
          "shrinking" : [True, False]
      }
      return self.randomCV(clf, x, y, param_grid, 4, 6)

    def Ada(self, x, y):
      clf = AdaBoostRegressor()
      param_grid = {
          "n_estimators" : np.arange(1,100),
          "loss" : ['linear', 'square', 'exponential'],
          # "learning_rate" : np.arange(1,)
      }
      return self.randomCV(clf, x, y, param_grid, 30, 6)

    def GP(self, x, y):
      clf = GaussianProcessRegressor()
      param_grid = {
    #       "kernel" : [RBF, WhiteKernel],
          "normalize_y" : [True, False],
          "copy_X_train" : [True, False],
    #       "alpha" : np.linspace(0, 3, 100),

      }
      return self.randomCV(clf, x, y, param_grid, 4, 6)

    def LR(self, x, y):
      clf = LinearRegression()
      param_grid = {
          "fit_intercept" : [True, False],
          "normalize" : [True, False],
          "copy_X" : [True, False],
      }
      return self.randomCV(clf, x, y, param_grid, 25, 6)

    def NN(self, x, y):
      clf = MLPRegressor()
      param_grid = {
          "hidden_layer_sizes" : np.arange(1,200),
          "activation" : ['identity', 'logistic', 'tanh', 'relu'],
          "solver" : ['lbfgs', 'sgd', 'adam'],
          "learning_rate" : ['constant', 'invscaling', 'adaptive'],
          "shuffle" : [True, False],
          "alpha" : np.random.uniform(0.000001, 1, 1000)
      }
      return self.randomCV(clf, x, y, param_grid, 30, 6)

    def start(self):
                # ---------------> Run for data preprocessing

        data = self.read("5facebook.csv")

        np.random.shuffle(data)

        y = data[:,-12:]
        x = data[:,:6]

        x_test, x_train = np.split(x, [75])
        y_test, y_train = np.split(y, [75])

        scaler = StandardScaler()                         # scaling features
        scaler.fit(x_train)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)

        scaler.fit(y_train)
        y_train_nn = scaler.transform(y_train)
        y_test_nn = scaler.transform(y_test)


        # print(y_train.shape, x_train.shape)

        # np.where(np.isnan(y_train))

        # y_train = np.nan_to_num(y_train)

        col_mean = np.nanmean(y_train, axis=0)
        inds = np.where(np.isnan(y_train))
        y_train[inds] = np.take(col_mean, inds[1])

        col_mean = np.nanmean(y_test, axis=0)
        inds = np.where(np.isnan(y_test))
        y_test[inds] = np.take(col_mean, inds[1])

        col_mean = np.nanmean(x_train, axis=0)
        inds = np.where(np.isnan(x_train))
        x_train[inds] = np.take(col_mean, inds[1])

        col_mean = np.nanmean(x_test, axis=0)
        inds = np.where(np.isnan(x_test))
        x_test[inds] = np.take(col_mean, inds[1])

        col_mean = np.nanmean(y_train_nn, axis=0)
        inds = np.where(np.isnan(y_train_nn))
        y_train_nn[inds] = np.take(col_mean, inds[1])

        col_mean = np.nanmean(y_test_nn, axis=0)
        inds = np.where(np.isnan(y_test_nn))
        y_test_nn[inds] = np.take(col_mean, inds[1])
        # ---------------> Run for Decision Tree regresssor
        j = 0
        for i in y_train.T:
            param = self.Dt(x_train, i)
            reg_tree = DecisionTreeRegressor().set_params(**param)
            reg_tree.fit(x_train, i)
            prediction = reg_tree.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_tree.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1

                    # ---------------> Run for Random Forest regressor
        j = 0
        for i in y_train.T:
            param = self.Rf(x_train, i)
            reg_rf = RandomForestRegressor().set_params(**param)
            reg_rf.fit(x_train, i)
            prediction = reg_rf.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_rf.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1

        # ---------------> Run for Support Vector regressor
        j = 0
        for i in y_train.T:
            param = self.Svr(x_train, i)
            reg_svr = svm.SVR().set_params(**param)
            reg_svr.fit(x_train, i)
            prediction = reg_svr.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_svr.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1

                    # ---------------> Run for Adaboost regressor
        j = 0
        for i in y_train.T:
            param = self.Ada(x_train, i)
            reg_ada = AdaBoostRegressor().set_params(**param)
            reg_ada.fit(x_train, i)
            prediction = reg_ada.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_ada.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1
        # ---------------> Run for Linear regressor
        j = 0
        for i in y_train.T:
            param = self.LR(x_train, i)
            reg_lr = LinearRegression().set_params(**param)
            reg_lr.fit(x_train, i)
            prediction = reg_lr.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_lr.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1

            # ---------------> Run for Neural Net regressor
        j = 0
        for i in y_train_nn.T:
            param = self.NN(x_train, i)
            reg_nn = MLPRegressor().set_params(**param)
            reg_nn.fit(x_train, i)
            prediction = reg_nn.predict(x_test)

            rmse = mean_squared_error(y_test_nn[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_nn.score(x_test, y_test_nn[:,j]))
            print('\n\n\n\n\n\n')
            j+=1
        # ---------------> Run for Gaussian Process regressor
        j = 0
        for i in y_train.T:
            param = self.GP(x_train, i)
            reg_gp = GaussianProcessRegressor().set_params(**param)
            reg_gp.fit(x_train, i)
            prediction = reg_gp.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_gp.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1

class reg9:
    def read(self, a):
        b = pd.read_csv(a, delimiter = ',')
        b = b.astype(np.float64)
        return b.to_numpy()
    def report(self, results, n_top=3):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            j = 0
            for candidate in candidates:
                print("Model with rank: {0}".format(i))
                print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                      results['mean_test_score'][candidate],
                      results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")
                if j > 1:
                    break
                j+=1



    def randomCV(self, clf, X, y, param_grid, n_iter, cv):
        random_search = RandomizedSearchCV(clf, param_distributions = param_grid,
              n_iter = n_iter, cv = cv, iid = False)
        random_search.fit(X, y)
        self.report(random_search.cv_results_)
        return self.random_search.best_params_

    def Rf(self, x, y):
      clf = RandomForestRegressor()
      param_grid = {
            "n_estimators" : np.arange(2,50),
            "max_depth" : np.arange(1,6),

        # "criterion" : ['mse', 'mae'],
        # "min_samples_split" : np.random.random_sample((100,)),      
        # "min_samples_split" : np.linspace(0.01,1, num = 1000),
        # "min_samples_leaf" : np.linspace(0.01,0.5, num = 100),
        # "bootstrap" : [True, False],
        # "warm_start" : [True, False]
      }
      return self.randomCV(clf, x, y, param_grid, 50, 6)  


    def Dt(self, x, y):
      clf = DecisionTreeRegressor()
      param_grid = {
          "max_depth" : np.arange(1,6),
    #       "min_samples_split" : np.linspace(0.01,0.5, num = 1000),
    #       "min_samples_leaf" : np.linspace(0.01,0.5, num = 1000),
          "criterion" : ['mse', 'mae', 'friedman_mse'],
          "splitter" : ['best', 'random'],
      }
      return self.randomCV(clf, x, y, param_grid, 5, 4)  

    def Svr(self, x, y):
      clf = svm.SVR()
      param_grid = {
          "kernel" : ['poly', 'rbf', 'linear', 'sigmoid'],
          "gamma" : ['scale', 'auto'],
          "shrinking" : [True, False]
      }
      return self.randomCV(clf, x, y, param_grid, 4, 6)

    def Ada(self, x, y):
      clf = AdaBoostRegressor()
      param_grid = {
          "n_estimators" : np.arange(1,100),
          "loss" : ['linear', 'square', 'exponential'],
          # "learning_rate" : np.arange(1,)
      }
      return self.randomCV(clf, x, y, param_grid, 30, 6)

    def GP(self, x, y):
      clf = GaussianProcessRegressor()
      param_grid = {
    #       "kernel" : [RBF, WhiteKernel],
          "normalize_y" : [True, False],
          "copy_X_train" : [True, False],
          "alpha" : np.linspace(0, 3, 100),

      }
      return self.randomCV(clf, x, y, param_grid, 4, 6)

    def LR(self, x, y):
      clf = LinearRegression()
      param_grid = {
          "fit_intercept" : [True, False],
          "normalize" : [True, False],
          "copy_X" : [True, False],
      }
      return self.randomCV(clf, x, y, param_grid, 25, 6)

    def NN(self, x, y):
      clf = MLPRegressor()
      param_grid = {
          "hidden_layer_sizes" : np.arange(1,20),
          "activation" : ['identity', 'logistic', 'tanh', 'relu'],
          "solver" : ['lbfgs', 'sgd', 'adam'],
          "learning_rate" : ['constant', 'invscaling', 'adaptive'],
          "shuffle" : [True, False],
      }
      return self.randomCV(clf, x, y, param_grid, 30, 6)

    def start(self):
        data = self.read("segm.csv")
        np.random.shuffle(data)

        y = data[:,-4:]
        x = data[:,:13]

        x_test, x_train = np.split(x, [36240])
        y_test, y_train = np.split(y, [36240])

        scaler = StandardScaler()                         # scaling features
        scaler.fit(x_train)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)

        scaler.fit(y_train)
        y_train_nn = scaler.transform(y_train)
        y_test_nn = scaler.transform(y_test)


        # print(y_train.shape, x_train.shape)

        # np.where(np.isnan(y_train))

        # y_train = np.nan_to_num(y_train)

        col_mean = np.nanmean(y_train, axis=0)
        inds = np.where(np.isnan(y_train))
        y_train[inds] = np.take(col_mean, inds[1])

        col_mean = np.nanmean(y_test, axis=0)
        inds = np.where(np.isnan(y_test))
        y_test[inds] = np.take(col_mean, inds[1])

        col_mean = np.nanmean(x_train, axis=0)
        inds = np.where(np.isnan(x_train))
        x_train[inds] = np.take(col_mean, inds[1])

        col_mean = np.nanmean(x_test, axis=0)
        inds = np.where(np.isnan(x_test))
        x_test[inds] = np.take(col_mean, inds[1])

        col_mean = np.nanmean(y_train_nn, axis=0)
        inds = np.where(np.isnan(y_train_nn))
        y_train_nn[inds] = np.take(col_mean, inds[1])

        col_mean = np.nanmean(y_test_nn, axis=0)
        inds = np.where(np.isnan(y_test_nn))
        y_test_nn[inds] = np.take(col_mean, inds[1])

        x_train = x_train[:20000]
        y_train = y_train[:20000]
        y_train_nn = y_train_nn[:20000]

        x_test = x_test[:3000]
        y_test = y_test[:3000]
        y_test_nn = y_test_nn[:3000]

        # ---------------> Run for Decision Tree regressor
        j = 0
        for i in y_train.T:
            param = self.Dt(x_train, i)
            reg_tree = DecisionTreeRegressor().set_params(**param)
            reg_tree.fit(x_train, i)
            prediction = reg_tree.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_tree.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1
        j = 0
        for i in y_train.T:
            param = self.Rf(x_train, i)
            reg_rf = RandomForestRegressor().set_params(**param)
            reg_rf.fit(x_train, i)
            prediction = reg_rf.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_rf.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1
        j = 0
        for i in y_train.T:
            param = self.Svr(x_train, i)
            reg_svr = svm.SVR().set_params(**param)
            reg_svr.fit(x_train, i)
            prediction = reg_svr.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_svr.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1
        j = 0
        for i in y_train.T:
            param = self.Ada(x_train, i)
            reg_ada = AdaBoostRegressor().set_params(**param)
            reg_ada.fit(x_train, i)
            prediction = reg_ada.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_ada.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1
        j = 0
        for i in y_train.T:
            param = self.LR(x_train, i)
            reg_lr = LinearRegression().set_params(**param)
            reg_lr.fit(x_train, i)
            prediction = reg_lr.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_lr.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1

            ########### does not converge ############
    #     j = 0
    #     for i in y_train.T:
    #         param = self.GP(x_train, i)
    #         reg_gp = GaussianProcessRegressor().set_params(**param)
    #         reg_gp.fit(x_train, i)
    #         prediction = reg_gp.predict(x_test)

    #         rmse = mean_squared_error(y_test[:,j], prediction)
    #         print("RMSE on test data : ", rmse)

    #         print("Score with test data",reg_gp.score(x_test, y_test[:,j]))
    #         print('new data\n\n\n\n\n\n')
    #         j+=1

        # # --------------->
        for i in y_train_nn.T:
            print(x_train.shape, i.shape)
            param = self.NN(x_train, i)
            reg_nn = MLPRegressor().set_params(**param)
            reg_nn.fit(x_train, i)

            prediction = reg_nn.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)

            print("RMSE on test data : ", rmse)
            print("Score with test data",reg_nn.score(x_test, y_test[:,j]))
            j+=1        

In [13]:
import numpy as np
from scipy.io import arff
from io import StringIO
import sklearn
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2
import math

class reg10:
    def read(self, a):
        with open(a) as f:
            cols = f.readline().rstrip('\n').split(',')
            X = np.loadtxt(a, delimiter=',', usecols=range(2, len(cols)), skiprows=1, dtype=np.uint8)
            y = np.loadtxt(a, delimiter=',', usecols=[1], skiprows=1)
        if a == 'ACT2.csv':
            np.save('ACT2.npy', np.column_stack((X, y)))
        else:
            np.save('ACT4.npy', np.column_stack((X, y)))
        return np.column_stack((X, y))        
    def report(self, results, n_top=3):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            j = 0
            for candidate in candidates:
                print("Model with rank: {0}".format(i))
                print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                      results['mean_test_score'][candidate],
                      results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")
                if j > 1:
                    break
                j+=1



    def randomCV(self, clf, X, y, param_grid, n_iter, cv):
        random_search = RandomizedSearchCV(clf, param_distributions = param_grid,
              n_iter = n_iter, cv = cv, iid = False, n_jobs = -1)
        random_search.fit(X, y)
        self.report(random_search.cv_results_)
        return random_search.best_params_

    def Rf(self, x, y):
      clf = RandomForestRegressor()
      param_grid = {
            "n_estimators" : np.arange(2,50),
            "max_depth" : np.arange(1,6),

        # "criterion" : ['mse', 'mae'],
        # "min_samples_split" : np.random.random_sample((100,)),      
        # "min_samples_split" : np.linspace(0.01,1, num = 1000),
        # "min_samples_leaf" : np.linspace(0.01,0.5, num = 100),
        # "bootstrap" : [True, False],
        # "warm_start" : [True, False]
      }
      return self.randomCV(clf, x, y, param_grid, 5, 6)  


    def Dt(self, x, y):
      clf = DecisionTreeRegressor()
      param_grid = {
          "max_depth" : np.arange(1,6),
    #       "min_samples_split" : np.linspace(0.01,0.5, num = 1000),
    #       "min_samples_leaf" : np.linspace(0.01,0.5, num = 1000),
          "criterion" : ['mse', 'mae', 'friedman_mse'],
          "splitter" : ['best', 'random'],
      }
      return self.randomCV(clf, x, y, param_grid, 10, 6)  

    def Svr(self, x, y):
      clf = svm.SVR()
      param_grid = {
          "kernel" : ['poly', 'rbf', 'linear', 'sigmoid'],
          "gamma" : ['scale', 'auto'],
          "shrinking" : [True, False]
      }
      return self.randomCV(clf, x, y, param_grid, 4, 6)

    def Ada(self, x, y):
      clf = AdaBoostRegressor()
      param_grid = {
          "n_estimators" : np.arange(1,100),
          "loss" : ['linear', 'square', 'exponential'],
          # "learning_rate" : np.arange(1,)
      }
      return self.randomCV(clf, x, y, param_grid, 8, 6)

    def GP(self, x, y):
      clf = GaussianProcessRegressor()
      param_grid = {
    #       "kernel" : [RBF, WhiteKernel],
          "normalize_y" : [True, False],
          "copy_X_train" : [True, False],
          "alpha" : np.linspace(0, 3, 100),

      }
      return self.randomCV(clf, x, y, param_grid, 4, 6)

    def LR(self, x, y):
      clf = LinearRegression()
      param_grid = {
          "fit_intercept" : [True, False],
          "normalize" : [True, False],
          "copy_X" : [True, False],
      }
      return self.randomCV(clf, x, y, param_grid, 8, 6)

    def NN(self, x, y):
      clf = MLPRegressor()
      param_grid = {
          "hidden_layer_sizes" : np.arange(1,20),
          "activation" : ['identity', 'logistic', 'tanh', 'relu'],
          "solver" : ['lbfgs', 'sgd', 'adam'],
          "learning_rate" : ['constant', 'invscaling', 'adaptive'],
          "shuffle" : [True, False],
      }
      return self.randomCV(clf, x, y, param_grid, 8, 6)

    def start(self):
        data = self.read("ACT2.csv")
        data1 = self.read("ACT4.csv")
        
#         frames = [data1, data]
#         dataC = pd.concat(frames)
        
#         dataC = dataC.to_numpy()

        # le = preprocessing.LabelEncoder()
        # j = 0
        # for i in dataC.T:
        #   if type(i[0]) != np.int or np.float64 or np.int32:
        #     i = le.fit_transform(i)

        dataC = dataC.astype(np.float64)
        col_mean = np.nanmean(dataC, axis=0)
        inds = np.where(np.isnan(dataC))
        dataC[inds] = np.take(col_mean, inds[1])

        np.random.shuffle(dataC)

        y = dataC[:,:1]
        x = dataC[:,1:]

        p = pd.DataFrame(x)

        selector = VarianceThreshold(p.var(axis = 1).mean() * 0.5)

        p = selector.fit_transform(p)

        x = p
        scaler = StandardScaler()                         # scaling features
        scaler.fit(x)
        x = scaler.transform(x)

        pca = PCA()
        pca.fit_transform(x_train)
        print(len(pca.explained_variance_ratio_[pca.explained_variance_ratio_ > 0.001]))


        if x.shape[1] > int(x.shape[1] * 0.5):
          x = SelectKBest(k=int(x.shape[1] * 0.5), score_func = f_regression).fit_transform(x, y)

        print(x.shape)

        pca = PCA(n_components= 52)
        x = pca.fit_transform(x)
        # x_train = pca.transform(x_train)
        # x_test = pca.transform(x_test)


        x_test, x_train = np.split(x, [1843])
        y_test, y_train = np.split(y, [1843])


        # scaler.fit(y_train)
        # y_train_nn = scaler.transform(y_train)
        # y_test_nn = scaler.transform(y_test)


        # print(y_train.shape, x_train.shape)

        # np.where(np.isnan(y_train))

        # y_train = np.nan_to_num(y_train)





        # col_mean = np.nanmean(y_train, axis=0)
        # inds = np.where(np.isnan(y_train))
        # y_train[inds] = np.take(col_mean, inds[1])

        # col_mean = np.nanmean(y_test, axis=0)
        # inds = np.where(np.isnan(y_test))
        # y_test[inds] = np.take(col_mean, inds[1])

        # col_mean = np.nanmean(x_train, axis=0)
        # inds = np.where(np.isnan(x_train))
        # x_train[inds] = np.take(col_mean, inds[1])

        # col_mean = np.nanmean(x_test, axis=0)
        # inds = np.where(np.isnan(x_test))
        # x_test[inds] = np.take(col_mean, inds[1])

        # col_mean = np.nanmean(y_train_nn, axis=0)
        # inds = np.where(np.isnan(y_train_nn))
        # y_train_nn[inds] = np.take(col_mean, inds[1])

        # col_mean = np.nanmean(y_test_nn, axis=0)
        # inds = np.where(np.isnan(y_test_nn))
        # y_test_nn[inds] = np.take(col_mean, inds[1])

        # ---------------> Run for Decision Tree regressor
        j = 0
        # for i in y_train.T:
        param = self.Dt(x_train, y_train)
        reg_tree = DecisionTreeRegressor().set_params(**param)
        reg_tree.fit(x_train, y_train)
        prediction = reg_tree.predict(x_test)

        rmse = mean_squared_error(y_test, prediction)
        print("RMSE on test data : ", rmse)

        print("Score with test data",reg_tree.score(x_test, y_test))
        print('new data\n\n\n\n\n\n')
        j+=1
        j = 0
        for i in y_train.T:
            param = self.Rf(x_train, i)
            reg_rf = RandomForestRegressor().set_params(**param)
            reg_rf.fit(x_train, i)
            prediction = reg_rf.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_rf.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1

        j = 0
        for i in y_train.T:
            param = self.Svr(x_train, i)
            reg_svr = svm.SVR().set_params(**param)
            reg_svr.fit(x_train, i)
            prediction = reg_svr.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_svr.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1

        j = 0
        for i in y_train.T:
            param = self.Ada(x_train, i)
            reg_ada = AdaBoostRegressor().set_params(**param)
            reg_ada.fit(x_train, i)
            prediction = reg_ada.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_ada.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1
        j = 0
        for i in y_train.T:
            param = self.LR(x_train, i)
            reg_lr = LinearRegression().set_params(**param)
            reg_lr.fit(x_train, i)
            prediction = reg_lr.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_lr.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1
        j = 0
        ########## may not converge #############
        for i in y_train.T:
            param = self.GP(x_train, i)
            reg_gp = GaussianProcessRegressor().set_params(**param)
            reg_gp.fit(x_train, i)
            prediction = reg_gp.predict(x_test)

            rmse = mean_squared_error(y_test[:,j], prediction)
            print("RMSE on test data : ", rmse)

            print("Score with test data",reg_gp.score(x_test, y_test[:,j]))
            print('new data\n\n\n\n\n\n')
            j+=1


In [14]:
c = reg10()
c.start()

FileNotFoundError: [Errno 2] No such file or directory: 'ACT2.csv'

In [19]:
cl = reg4()
cl.start()

cl1 = reg5()
cl1.start()

cl2 = reg9()
cl2.start()

Model with rank: 1
Mean validation score: -0.073 (std: 0.207)
Parameters: {'min_samples_split': 0.44947947947947947, 'min_samples_leaf': 0.014414414414414415, 'max_depth': 5}

Model with rank: 2
Mean validation score: -0.080 (std: 0.220)
Parameters: {'min_samples_split': 0.49656656656656656, 'min_samples_leaf': 0.014904904904904905, 'max_depth': 5}

Model with rank: 3
Mean validation score: -0.083 (std: 0.170)
Parameters: {'min_samples_split': 0.2621121121121121, 'min_samples_leaf': 0.499019019019019, 'max_depth': 4}

Model with rank: 3
Mean validation score: -0.083 (std: 0.170)
Parameters: {'min_samples_split': 0.13997997997998, 'min_samples_leaf': 0.5, 'max_depth': 1}

Model with rank: 3
Mean validation score: -0.083 (std: 0.170)
Parameters: {'min_samples_split': 0.12575575575575576, 'min_samples_leaf': 0.499019019019019, 'max_depth': 4}

RMSE on test data :  382754582.20620346
Score with test data 0.03560677049618999
new data






Model with rank: 1
Mean validation score: -0.129 (s

Model with rank: 1
Mean validation score: -0.084 (std: 0.244)
Parameters: {'n_estimators': 43, 'max_depth': 2}

Model with rank: 2
Mean validation score: -0.091 (std: 0.279)
Parameters: {'n_estimators': 47, 'max_depth': 1}

Model with rank: 3
Mean validation score: -0.092 (std: 0.249)
Parameters: {'n_estimators': 32, 'max_depth': 2}

RMSE on test data :  403147049.39608264
Score with test data -0.01577434472767192
new data






Model with rank: 1
Mean validation score: -0.205 (std: 0.396)
Parameters: {'n_estimators': 27, 'max_depth': 1}

Model with rank: 2
Mean validation score: -0.205 (std: 0.466)
Parameters: {'n_estimators': 15, 'max_depth': 1}

Model with rank: 3
Mean validation score: -0.207 (std: 0.408)
Parameters: {'n_estimators': 44, 'max_depth': 1}

RMSE on test data :  1578906638.3940132
Score with test data -0.09703833745678159
new data






Model with rank: 1
Mean validation score: 0.199 (std: 0.111)
Parameters: {'n_estimators': 27, 'max_depth': 2}

Model with rank: 2
Mean

Model with rank: 1
Mean validation score: -0.033 (std: 0.049)
Parameters: {'shrinking': False, 'kernel': 'linear', 'gamma': 'scale'}

Model with rank: 1
Mean validation score: -0.033 (std: 0.049)
Parameters: {'shrinking': True, 'kernel': 'linear', 'gamma': 'auto'}

Model with rank: 3
Mean validation score: -0.092 (std: 0.049)
Parameters: {'shrinking': True, 'kernel': 'poly', 'gamma': 'scale'}

RMSE on test data :  264906.30987689673
Score with test data -0.029342157916430796
new data






Model with rank: 1
Mean validation score: -0.083 (std: 0.035)
Parameters: {'shrinking': False, 'kernel': 'linear', 'gamma': 'scale'}

Model with rank: 1
Mean validation score: -0.083 (std: 0.035)
Parameters: {'shrinking': False, 'kernel': 'linear', 'gamma': 'auto'}

Model with rank: 3
Mean validation score: -0.083 (std: 0.035)
Parameters: {'shrinking': True, 'kernel': 'linear', 'gamma': 'auto'}

RMSE on test data :  104.1488601489832
Score with test data -0.1261901104934513
new data






Model with 



Model with rank: 1
Mean validation score: 0.208 (std: 0.152)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.208 (std: 0.152)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

Model with rank: 3
Mean validation score: 0.208 (std: 0.152)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 3
Mean validation score: 0.208 (std: 0.152)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': False}

RMSE on test data :  1658040.601921504
Score with test data 0.08684107818979403
new data






Model with rank: 1
Mean validation score: 0.081 (std: 0.098)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.081 (std: 0.098)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': False}

Model with rank: 3
Mean validation score: 0.081 (std: 0.098)
Parameters: {'normalize': True, 'fit_in



Model with rank: 1
Mean validation score: 0.154 (std: 0.179)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: 0.154 (std: 0.179)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': False}

Model with rank: 3
Mean validation score: 0.154 (std: 0.179)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 3
Mean validation score: 0.154 (std: 0.179)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

RMSE on test data :  180986.64221432214
Score with test data 0.2967431355730539
new data






Model with rank: 1
Mean validation score: -0.066 (std: 0.066)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: -0.066 (std: 0.066)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: -0.066 (std: 0.066)
Parameters: {'normalize': True, 'fit_i



Model with rank: 1
Mean validation score: -0.020 (std: 0.052)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': True}

Model with rank: 1
Mean validation score: -0.020 (std: 0.052)
Parameters: {'normalize': False, 'fit_intercept': True, 'copy_X': False}

Model with rank: 3
Mean validation score: -0.020 (std: 0.052)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': True}

Model with rank: 3
Mean validation score: -0.020 (std: 0.052)
Parameters: {'normalize': True, 'fit_intercept': True, 'copy_X': False}

RMSE on test data :  58679.24818607845
Score with test data 0.005494905620035184
new data






Model with rank: 1
Mean validation score: -0.073 (std: 0.145)
Parameters: {'solver': 'sgd', 'shuffle': False, 'learning_rate': 'constant', 'hidden_layer_sizes': 41, 'alpha': 0.4941026728606091, 'activation': 'logistic'}

Model with rank: 2
Mean validation score: -0.074 (std: 0.186)
Parameters: {'solver': 'adam', 'shuffle': False, 'learning_rate': 'constant', 'hi

Model with rank: 1
Mean validation score: -0.001 (std: 0.034)
Parameters: {'solver': 'adam', 'shuffle': False, 'learning_rate': 'invscaling', 'hidden_layer_sizes': 66, 'alpha': 0.7940114449657545, 'activation': 'logistic'}

Model with rank: 2
Mean validation score: -0.002 (std: 0.035)
Parameters: {'solver': 'adam', 'shuffle': False, 'learning_rate': 'adaptive', 'hidden_layer_sizes': 107, 'alpha': 0.6729853037777666, 'activation': 'logistic'}

Model with rank: 3
Mean validation score: -0.002 (std: 0.036)
Parameters: {'solver': 'adam', 'shuffle': False, 'learning_rate': 'adaptive', 'hidden_layer_sizes': 131, 'alpha': 0.43094650040671617, 'activation': 'logistic'}

RMSE on test data :  0.35520960127327167
Score with test data 0.04092005149213528







Model with rank: 1
Mean validation score: -32.059 (std: 41.846)
Parameters: {'normalize_y': True, 'copy_X_train': True}

Model with rank: 1
Mean validation score: -32.059 (std: 41.846)
Parameters: {'normalize_y': True, 'copy_X_train': False

In [None]:
import numpy as np
from pandas import read_csv
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
np.set_printoptions(precision=3, suppress=True) 
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)


class Aquatic_toxicity:
    
    def __init__(self):
        return
    
    def read_data(self):
        # reading data 
        data = np.loadtxt('qsar_aquatic_toxicity.csv', delimiter=';')
        return data
    
    def preprocessing(self, data):               
        # splitting data
        X = data[:,:-1]
        y = data[:,-1]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)        
        #preprocessing using standard scaler
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        return X_train, y_train, X_test, y_test
    
    def dim_Reduction(self, X_train, X_test):
        # Dimensionality Reduction using PCA from 123 dims to 20 dims
        pca = PCA(n_components = 20)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
        return X_train, X_test
        
    def cv_SVR(self, X, y):
        #scorer = make_scorer(neg_mean_squared_error)
        C_grid = [0.1, 1, 10]
        gamma_grid = np.logspace(-2, 1, 4)[0:3]
        svm = sklearn.svm.SVR(kernel='rbf')
        param_grid = { 'C' : C_grid, 'gamma' : gamma_grid, 'kernel' : ['rbf', 'sigmoid',  'linear']}
        gridcv = sklearn.model_selection.GridSearchCV(svm, param_grid, n_jobs=-1, verbose=1, cv=3)
        #, scoring = 'neg_mean_squared_error'
        gridcv.fit(X_train, y_train)
        print("best parameters:", gridcv.best_params_)
        print("%.1f%% neg mean squared error on validation sets (average)" % (gridcv.best_score_*100))
        return gridcv.best_params_
    
    def cv_DTR(self, X, y):
        dt = DecisionTreeRegressor()
        param_grid = {
            "min_samples_split" : np.random.random_sample((100,)),
            "min_samples_leaf" : np.arange(1,6),
            'max_depth': range(1, 20),
            'criterion' : ['mse', 'mae', 'friedman_mse'],
            'splitter' : ['best', 'random'],
        }
        return Aquatic_toxicity.randomCV(dt, X, y, param_grid, 100, 6)
        
    def cv_RandomForest(self, X, y):
        rf = RandomForestRegressor()
        param_grid = {
            "n_estimators" : [10*x for x in np.arange(1,50)],
            "min_samples_split" : np.random.random_sample((100,)),
            "min_samples_leaf" : np.arange(1,6),
            'max_depth': range(1, 20),
        }
        return Aquatic_toxicity.randomCV(rf, X, y, param_grid, 30, 6)
        
    def cv_adaBoost(self, X, y):
        #scorer = make_scorer(precision_score)
        ada_boost = AdaBoostRegressor(n_estimators=50, learning_rate=1)
        param_grid = {'n_estimators': range(1, 50), 'learning_rate': [0.1, 0.5, 1]}
        gridcv = sklearn.model_selection.GridSearchCV(ada_boost, param_grid, verbose=1, cv=3, n_jobs=-1)
                                                      #, scoring='explained_variance')
        gridcv.fit(X, y)
        print("best parameters:", gridcv.best_params_)
        print("%.1f%% validation on validation sets (average)" % (gridcv.best_score_))
        return gridcv.best_params_
    
    def cv_linReg(self, X, y):
        lr = LinearRegression()
        param_grid = {
            "fit_intercept" : [True, False],
        }
        return Aquatic_toxicity.randomCV(lr, X, y, param_grid, 50, 6)
        
    def cv_GP(self, X, y):
        clf = GaussianProcessRegressor()
        param_grid = {
            
        "normalize_y" : [True, False],
        "copy_X_train" : [True, False],
        "alpha" : np.linspace(0, 5, 100),
        }
        return Aquatic_toxicity.randomCV(clf, X, y, param_grid, 25, 6)
    
    def cv_NNRegressor(self, X, y):
        nn = sklearn.neural_network.MLPRegressor(hidden_layer_sizes=(50,),
                                           solver='sgd', batch_size=100, max_iter=10,
                                           learning_rate_init=.01, momentum=0.9, alpha=0.05,
                                           verbose=False, random_state=0)

        param_grid ={
                    'hidden_layer_sizes' : range(2,100),
                    "activation" : ['identity', 'logistic', 'tanh', 'relu']
                    }
        return Aquatic_toxicity.randomCV(nn, X, y, param_grid, 100, 6)
        
    def randomCV(clf, X, y, param_grid, n_iter, cv):
        #scorer = make_scorer(precision_score)
        random_search = RandomizedSearchCV(clf, param_distributions = param_grid, n_iter = n_iter, cv = cv, iid = False)
        #scoring = "explained_variance"
        random_search.fit(X, y)
        #print(random_search.cv_results_)
        Aquatic_toxicity.report(random_search.cv_results_)
        return random_search.best_params_
    
    def report(results, n_top=1):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            k = 0
            for candidate in candidates:                
                print("Model with rank: {0}".format(i))
                print("Variance on validation data: {0:.3f} (std: {1:.3f})".format(
                      results['mean_test_score'][candidate],
                      results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")
                k += 1
                if k == 3:
                    break
                
    def predict(self, model, X_test, y_test):
        predict = model.predict(X_test)
        predict[predict<0] =0
        rmse = mean_squared_error(y_test, predict)
        print("MSE on test data : ", rmse)
        
if __name__ == "__main__":
    obj = Aquatic_toxicity()
    data = obj.read_data()
    X_train, y_train, X_test, y_test = obj.preprocessing(data)
    # X_train, X_test = obj.dim_Reduction(X_train, X_test)
    print('---------SVR--------')
    model = obj.cv_SVR(X_train, y_train)
    reg = sklearn.svm.SVR().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------DTR--------')
    model = obj.cv_DTR(X_train, y_train)
    reg = sklearn.tree.DecisionTreeRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Random Forrest Regressor--------')
    # taking more than 3 mins 
    model = obj.cv_RandomForest(X_train, y_train)
    reg = sklearn.ensemble.RandomForestRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Adaboost Regressor--------')
    model = obj.cv_adaBoost(X_train, y_train)
    reg = sklearn.ensemble.AdaBoostRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Gaussian Process Regressor--------')
    model = obj.cv_GP(X_train, y_train)
    reg = sklearn.gaussian_process.GaussianProcessRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Linear Regressor--------')
    model = obj.cv_linReg(X_train, y_train)
    reg = LinearRegression().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------NN Regressor--------')
    model = obj.cv_NNRegressor(X_train, y_train)
    reg = MLPRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
            

In [None]:
import numpy as np
from pandas import read_csv
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
np.set_printoptions(precision=3, suppress=True) 
from datetime import datetime
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

class Bike_Sharing:
    
    def __init__(self):
        return
    
    # reading data 
    def read_data(self):        
       #Dropping not predictive attributes : instant
        data = np.loadtxt('hour.csv', delimiter=',', skiprows=1, usecols=(range(2,17)))
        data2 = np.loadtxt('hour.csv', delimiter=',', skiprows=1, usecols=(1), dtype='|S10').astype(str)
        
        # Preprocessing step: extracting week number from date attribute
        for i in range(data2.shape[0]):
            data2[i] = datetime.date(datetime.strptime(data2[i], '%Y-%m-%d')).isocalendar()[1]
        data2 = data2.astype(int)        
        data = np.column_stack((data2.reshape(-1,1), data))
        return data
    
    def preprocessing(self, data):               
        # splitting data
        X = data[:,:-1]
        y = data[:,-1]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)        
        #preprocessing using standard scaler
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        return X_train, y_train, X_test, y_test
    
        
    def cv_SVR(self, X, y):
        #scorer = make_scorer(neg_mean_squared_error)
        C_grid = [0.1, 1, 10]
        gamma_grid = np.logspace(-2, 1, 4)[0:3]
        svm = sklearn.svm.SVR(kernel='rbf')
        param_grid = { 'C' : C_grid, 'gamma' : gamma_grid, 'kernel' : ['rbf', 'sigmoid',  'linear']}
        gridcv = sklearn.model_selection.GridSearchCV(svm, param_grid, n_jobs=-1, verbose=1, cv=3)
        #, scoring = 'neg_mean_squared_error'
        gridcv.fit(X_train, y_train)
        print("best parameters:", gridcv.best_params_)
        print("%.1f%% neg mean squared error on validation sets (average)" % (gridcv.best_score_*100))
        return gridcv.best_params_
    
    def cv_DTR(self, X, y):
        dt = DecisionTreeRegressor()
        param_grid = {
            "min_samples_split" : np.random.random_sample((100,)),
            "min_samples_leaf" : np.arange(1,6),
            'max_depth': range(1, 20),
            'criterion' : ['mse', 'mae', 'friedman_mse'],
            'splitter' : ['best', 'random'],
        }
        return Bike_Sharing.randomCV(dt, X, y, param_grid, 50, 6)
        
    def cv_RandomForest(self, X, y):
        rf = RandomForestRegressor()
        param_grid = {
            #"n_estimators" : [10*x for x in np.arange(1,25)],
            "min_samples_split" : np.random.random_sample((100,)),
            "min_samples_leaf" : np.arange(1,6),
            'max_depth': range(1, 20),
        }
        return Bike_Sharing.randomCV(rf, X, y, param_grid, 40, 6)
        
    def cv_adaBoost(self, X, y):
        #scorer = make_scorer(precision_score)
        ada_boost = AdaBoostRegressor(n_estimators=50, learning_rate=1)
        param_grid = {'n_estimators': range(1, 50), 'learning_rate': [0.1, 0.5, 1]}
        gridcv = sklearn.model_selection.GridSearchCV(ada_boost, param_grid, verbose=1, cv=3, n_jobs=-1)
                                                      #, scoring='explained_variance')
        gridcv.fit(X, y)
        print("best parameters:", gridcv.best_params_)
        print("%.1f%% validation on validation sets (average)" % (gridcv.best_score_))
        return gridcv.best_params_
    
    def cv_linReg(self, X, y):
        lr = LinearRegression()
        param_grid = {
            "fit_intercept" : [True, False],
        }
        return Bike_Sharing.randomCV(lr, X, y, param_grid, 40, 6)
        
    def cv_GP(self, X, y):
        clf = GaussianProcessRegressor()
        param_grid = {
            
        "normalize_y" : [True, False],
        "copy_X_train" : [True, False],
        "alpha" : np.linspace(0, 5, 100),
        }
        return Bike_Sharing.randomCV(clf, X, y, param_grid, 10, 6)
    
    def cv_NNRegressor(self, X, y):
        nn = sklearn.neural_network.MLPRegressor()

        param_grid ={
                    'hidden_layer_sizes' : range(2,100),
                    "activation" : ['identity', 'logistic', 'tanh', 'relu']
                    }
        return Bike_Sharing.randomCV(nn, X, y, param_grid, 50, 5)
        
    def randomCV(clf, X, y, param_grid, n_iter, cv):
        #scorer = make_scorer(precision_score)
        random_search = RandomizedSearchCV(clf, param_distributions = param_grid, n_iter = n_iter, cv = cv, iid = False, 
                                           verbose=1, n_jobs=-1, scoring='explained_variance')
        #scoring = "explained_variance"
        random_search.fit(X, y)
        #print(random_search.cv_results_)
        Bike_Sharing.report(random_search.cv_results_)
        return random_search.best_params_
    
    def report(results, n_top=1):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            k = 0
            for candidate in candidates:                
                print("Model with rank: {0}".format(i))
                print("Variance on validation data: {0:.3f} (std: {1:.3f})".format(
                      results['mean_test_score'][candidate],
                      results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")
                k += 1
                if k == 3:
                    break
                
    def predict(self, model, X_test, y_test):
        predict = model.predict(X_test)
        predict[predict<0] =0
        rmse = mean_squared_error(y_test, predict)
        print("MSE on test data : ", rmse)
        
if __name__ == "__main__":
    obj = Bike_Sharing()
    data = obj.read_data()
    X_train, y_train, X_test, y_test = obj.preprocessing(data)
    print('---------SVR--------')
    model = obj.cv_SVR(X_train, y_train)
    reg = sklearn.svm.SVR().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------DTR--------')
    model = obj.cv_DTR(X_train, y_train)
    reg = sklearn.tree.DecisionTreeRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Random Forrest Regressor--------') 
    model = obj.cv_RandomForest(X_train, y_train)
    reg = sklearn.ensemble.RandomForestRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Adaboost Regressor--------')
    model = obj.cv_adaBoost(X_train, y_train)
    reg = sklearn.ensemble.AdaBoostRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Gaussian Process Regressor--------')
    model = obj.cv_GP(X_train, y_train)
    reg = sklearn.gaussian_process.GaussianProcessRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Linear Regressor--------')
    model = obj.cv_linReg(X_train, y_train)
    reg = LinearRegression().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------NN Regressor--------')
    model = obj.cv_NNRegressor(X_train, y_train)
    reg = MLPRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)

In [5]:
import numpy as np
from pandas import read_csv
import sklearn
from io import StringIO
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
np.set_printoptions(precision=3, suppress=True) 
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

class Concrete:
    
    def __init__(self):
        return
    
    # reading data 
    def read_data(self):        
        #Dropping not predictive attributes : instant
        data = np.loadtxt('compresive_strength_concrete.csv',  skiprows=1, delimiter=',')
        return data
    
    def preprocessing(self, data):               
        # splitting data
        X = data[:,:-1]
        y = data[:,-1]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)        
        #preprocessing using standard scaler
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        return X_train, y_train, X_test, y_test
        
    def cv_SVR(self, X, y):
        #scorer = make_scorer(neg_mean_squared_error)
        C_grid = [0.1, 1, 10]
        gamma_grid = np.logspace(-2, 1, 4)[0:3]
        svm = sklearn.svm.SVR(kernel='rbf')
        param_grid = { 'C' : C_grid, 'gamma' : gamma_grid, 'kernel' : ['rbf', 'sigmoid',  'linear']}
        gridcv = sklearn.model_selection.GridSearchCV(svm, param_grid, n_jobs=-1, verbose=1, cv=3)
        #, scoring = 'neg_mean_squared_error'
        gridcv.fit(X_train, y_train)
        print("best parameters:", gridcv.best_params_)
        print("%.1f%%  on validation sets (average)" % (gridcv.best_score_*100))
        return gridcv.best_params_
    
    def cv_DTR(self, X, y):
        dt = DecisionTreeRegressor()
        param_grid = {
            "min_samples_split" : np.random.random_sample((100,)),
            "min_samples_leaf" : np.arange(1,6),
            'max_depth': range(1, 20),
            'criterion' : ['mse', 'mae', 'friedman_mse'],
            'splitter' : ['best', 'random'],
        }
        return Concrete.randomCV(dt, X, y, param_grid, 50, 6)
        
    def cv_RandomForest(self, X, y):
        rf = RandomForestRegressor()
        param_grid = {
            #"n_estimators" : [10*x for x in np.arange(1,25)],
            "min_samples_split" : np.random.random_sample((100,)),
            "min_samples_leaf" : np.arange(1,6),
            'max_depth': range(1, 20),
        }
        return Concrete.randomCV(rf, X, y, param_grid, 40, 6)
    
    def cv_GP(self, X, y):
        clf = GaussianProcessRegressor()
        param_grid = {
            
        "normalize_y" : [True, False],
        "copy_X_train" : [True, False],
        "alpha" : np.linspace(0, 5, 100),
        }
        return Concrete.randomCV(clf, X, y, param_grid, 25, 6)
        
    def cv_adaBoost(self, X, y):
        #scorer = make_scorer(precision_score)
        ada_boost = AdaBoostRegressor(n_estimators=50, learning_rate=1)
        param_grid = {'n_estimators': range(1, 50), 'learning_rate': [0.1, 0.5, 1]}
        gridcv = sklearn.model_selection.GridSearchCV(ada_boost, param_grid, verbose=1, cv=3, n_jobs=-1)
                                                      #, scoring='explained_variance')
        gridcv.fit(X, y)
        print("best parameters:", gridcv.best_params_)
        print("%.1f%% validation on validation sets (average)" % (gridcv.best_score_))
        return gridcv.best_params_
    
    def cv_linReg(self, X, y):
        lr = LinearRegression()
        param_grid = {
            "fit_intercept" : [True, False],
        }
        return Concrete.randomCV(lr, X, y, param_grid, 40, 6)
        
   
    
    def cv_NNRegressor(self, X, y):
        nn = sklearn.neural_network.MLPRegressor()

        param_grid ={
                    'hidden_layer_sizes' : range(2,100),
                    "activation" : ['identity', 'logistic', 'tanh', 'relu']
                    }
        return Concrete.randomCV(nn, X, y, param_grid, 100, 6)
        
    def randomCV(clf, X, y, param_grid, n_iter, cv):
        #scorer = make_scorer(precision_score)
        random_search = RandomizedSearchCV(clf, param_distributions = param_grid, n_iter = n_iter, cv = cv, iid = False, 
                                           verbose=1, n_jobs=-1)
        #scoring = "explained_variance"
        random_search.fit(X, y)
        #print(random_search.cv_results_)
        Concrete.report(random_search.cv_results_)
        return random_search.best_params_
    
    def report(results, n_top=1):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            k = 0
            for candidate in candidates:                
                print("Model with rank: {0}".format(i))
                print("Variance on validation data: {0:.3f} (std: {1:.3f})".format(
                      results['mean_test_score'][candidate],
                      results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")
                k += 1
                if k == 3:
                    break
                
    def predict(self, model, X_test, y_test):
        predict = model.predict(X_test)
        predict[predict<0] =0
        rmse = mean_squared_error(y_test, predict)
        print("MSE on test data : ", rmse)
        
if __name__ == "__main__":
    obj = Concrete()
    data = obj.read_data()
    X_train, y_train, X_test, y_test = obj.preprocessing(data)
    #print(X_train)
    #print(y_train)
    print('---------SVR--------')
    model = obj.cv_SVR(X_train, y_train)
    reg = sklearn.svm.SVR().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------DTR--------')
    model = obj.cv_DTR(X_train, y_train)
    reg = sklearn.tree.DecisionTreeRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Random Forrest Regressor--------') 
    model = obj.cv_RandomForest(X_train, y_train)
    reg = sklearn.ensemble.RandomForestRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Adaboost Regressor--------')
    model = obj.cv_adaBoost(X_train, y_train)
    reg = sklearn.ensemble.AdaBoostRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Gaussian Process Regressor--------')
    model = obj.cv_GP(X_train, y_train)
    reg = sklearn.gaussian_process.GaussianProcessRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Linear Regressor--------')
    model = obj.cv_linReg(X_train, y_train)
    reg = LinearRegression().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------NN Regressor--------')
    model = obj.cv_NNRegressor(X_train, y_train)
    reg = MLPRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)

OSError: compresive_strength_concrete.csv not found.

In [None]:
import numpy as np
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from io import StringIO
import scipy
import scipy.stats               # For reciprocal distribution
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer
from sklearn import svm
import sklearn.gaussian_process 
import sklearn.tree        # For DecisionTreeClassifier class
import sklearn.ensemble    # For RandomForestClassifier class
import sklearn.linear_model # For Logistic Classifier
#from sklearn.neighbors import LSHForest
import sklearn.naive_bayes #For Naive Bayes
import sklearn.neural_network #For MLP classifier
import csv
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)  # Ignore sklearn deprecation warnings
warnings.filterwarnings("ignore", category=FutureWarning)       # Ignore sklearn deprecation warnings
np.set_printoptions(precision=20, suppress=True)




class student_performance:
    def __init__(self):
        pass
    
    def preprocess_data(self,file):
        f = open(file,"r")
        c = StringIO(f.read())
        print("READING TRAIN DATA")
       
        #Read data for One hot encoding
        X_string_OH = np.char.strip(np.genfromtxt(c,dtype='str',delimiter = ';',usecols = (0,1,3,5,8,9,10,11),skip_header=1))
        f = open(file,"r")
        c = StringIO(f.read())
        
        #Read Data for ordinal encoding
        X_string_OE = np.char.strip(np.genfromtxt(c,dtype='str',delimiter = ';',usecols = (4,15,16,17,18,19,20,21,22),skip_header=1))
        X_float = np.loadtxt(file,delimiter = ";",usecols = (2,6,7,12,13,14,23,24,25,26,27,28,29,30,31,32), dtype = np.float,skiprows=1).astype(int)
        print(X_string_OE.shape)
        print(X_string_OH.shape)
        print(X_float.shape)
        X_soh = self.preprocess_data_OH(X_string_OH)
        X_soe = self.preprocess_data_OE(X_string_OE)
        X = np.column_stack((X_soh,X_soe,X_float[:,:-1]))
        y = X_float[:,-1]
        return(X,y)
    
    def preprocess_data_OH(self,X):
        X=X.tolist()
        encoder = preprocessing.OneHotEncoder(sparse = False)
        print(encoder.fit(X))
        X = encoder.transform(X)
        print(X[0])
        return(X)
    
    def preprocess_data_OE(self,X):
        X=X.tolist()
        encoder = preprocessing.OrdinalEncoder()
        encoder.fit(X)
        X = encoder.transform(X)
        print(X[5])
        return(X)
    
    def scale_data_1(self,X):
        scaler = preprocessing.MinMaxScaler((0,1)).fit(X)
        return(scaler)
    
    def scale_data_5(self,X):
 
        scaler = preprocessing.MinMaxScaler((0,5)).fit(X)
        return(scaler)
    
    def random_CV(self,clf,X,y,param_grid,n_iter,cv):
 
        print("Starting search")
       
        random_search = model_selection.RandomizedSearchCV(clf, param_distributions = param_grid,n_iter = n_iter, cv = cv,
                                           iid = False,verbose=1,n_jobs = 4)
        random_search.fit(X, y)
        print("best parameters:", random_search.best_params_)
        print("%.1f%% accuracy on validation sets (average)" % (random_search.best_score_*100))
        return random_search.best_params_
    
    def SVR_reg(self,X,y):
        print("SVR classifier called")
        svr_clf = svm.SVR()
        param_dist = {
            'C'     : scipy.stats.reciprocal(1.0, 1000.),
            'kernel': ['rbf','poly','linear'],
            'degree': [2],
            'gamma' : scipy.stats.reciprocal(0.01, 10.)
        }
        return self.random_CV(svr_clf,X,y,param_dist,15,3)
    
    def DTR_reg(self,X,y):
        print("Decision Tree Regressor called")
        dt_reg = sklearn.tree.DecisionTreeRegressor()
        param_dist  ={
            "splitter" : ['best', 'random'],
            "max_depth" :[None,100,150,200,250,300],
            "max_features": ["sqrt","log2",None]
        }
        return self.random_CV(dt_reg,X,y,param_dist,10,3)
    
    def RF_reg(self,X,y):
        print("Random Forest Regressor Called")
        rf_reg = sklearn.ensemble.RandomForestRegressor()
        param_dist = {
            "n_estimators" : [10,25,50,75,100,125,150,175,200],
            "max_depth" :[None,100,150,200,250,300],
            "max_features": ["sqrt","log2",None]
        }
        return self.random_CV(rf_reg,X,y,param_dist,12,3)
    
    def ADB_reg(self,X,y):
        print("Ada Boost Regressor called")
        adb_reg = sklearn.ensemble.AdaBoostRegressor()
        param_dist = {
            "n_estimators" : [50,75,100,150,200,250,300,400,500],
            "loss" : ['linear', 'square', 'exponential'],
            "learning_rate" : [0.5,0.75,1,1.5]
        }
        return self.random_CV(adb_reg,X,y,param_dist,10,3)
    
    def GP_reg (self,X,y):
        print("Gaussian Process Regression called")
        
        gp_reg = sklearn.gaussian_process.GaussianProcessRegressor()
        param_dist = {
            'alpha' : scipy.stats.reciprocal(np.exp(-12), np.exp(-1)),
            "normalize_y" : [False, True]
        }
        return self.random_CV(gp_reg,X,y,param_dist,10,3)
    
    def LR_reg(self,X,y):
        print("Linear Regression called")
        
        lr_reg = sklearn.linear_model.LinearRegression()
        param_dist = {
            'fit_intercept' : [True, False],
            'normalize': [True,False]
        }
        return self.random_CV(lr_reg,X,y,param_dist,4,3)
        
    def MLP_reg(self,X,y):
        print("Neural Network regressor Called")
        mlp_reg = sklearn.neural_network.MLPRegressor()
        param_dist = {
            "hidden_layer_sizes" : [(50,), (100,),(50,50),(100,50),(100,100),(50,50,50)],
            "activation" :['tanh', 'relu','logistic','identity'],
            "solver" : ['lbfgs', 'sgd'], #smaller Data set, no adam
            "alpha" : scipy.stats.reciprocal(0.00005,0.1),
            "learning_rate" : ['constant','invscaling'],
            "max_iter" : [100,200,300,500,700]
        }
        return self.random_CV(mlp_reg,X,y,param_dist,15,3)
    
    def train_reg(self,reg,params,X,y):
        reg.set_params(**params)
        reg.fit(X,y)
        print("Complete Training Accuracy")
        print(reg.score(X,y))
        return reg
    
    def start(self):
        print("******Regression of Student Performance Data Set Begins ******")
        file = 'student-por.csv'
        (X,y)=self.preprocess_data(file)
        
        
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
        
        print("Calculating Standard Scaler using training data")
        scaler_1 = self.scale_data_1(X_train[:,:-2])  #Scale all the first 30 attributes to get a val range of 0,1
        scaler_2 = self.scale_data_5(X_train[:,X_train.shape[1]-2:X_train.shape[1]]) #Scale the imp last 2 attributes for a val range 0,5, to show its importance
        
        X_train_1 = scaler_1.transform(X_train[:,:-2])
        X_test_1 = scaler_1.transform(X_test[:,:-2])
        
        X_train_2 = scaler_2.transform(X_train[:,X_train.shape[1]-2:X_train.shape[1]])
        X_test_2 = scaler_2.transform(X_test[:,X_train.shape[1]-2:X_train.shape[1]])

        X_train = np.column_stack((X_train_1,X_train_2))
        X_test = np.column_stack((X_test_1,X_test_2))
        
        print("**Training of Student_performance starts**")
        
        print("__Training SVR__")
        
        svr_reg = self.train_reg(svm.SVR(),self.SVR_reg(X_train,y_train),X_train,y_train)
        
        print("__Training DTR__")
        dt_reg = self.train_reg(sklearn.tree.DecisionTreeRegressor(),self.DTR_reg(X_train,y_train),X_train,y_train)
        
        print("__Training Random Forest Reg__")
        rf_reg = self.train_reg(sklearn.ensemble.RandomForestRegressor(),self.RF_reg(X_train,y_train),X_train,y_train)
        
        print("__Training AdaBoost Reg__")
        adb_reg = self.train_reg(sklearn.ensemble.AdaBoostRegressor(),self.ADB_reg(X_train,y_train),X_train,y_train)
        
        print("__Training Gaussian proc Reg__")
        gp_reg = self.train_reg(sklearn.gaussian_process.GaussianProcessRegressor(),self.GP_reg(X_train,y_train),X_train,y_train)
        
        print("__Training Linear Reg__")
        lin_reg = self.train_reg(sklearn.linear_model.LinearRegression(),self.LR_reg(X_train,y_train),X_train,y_train)
        
        print("__Training Neural Net Reg__")
        nn_reg = self.train_reg(sklearn.neural_network.MLPRegressor(),self.MLP_reg(X_train,y_train),X_train,y_train)
        
        print("**Test Data Prediction Begins*")
        
        
        print("Testing SVM Regressionr")
        print(svr_reg.score(X_test,y_test))
        
        print("Testing Decision Trees")
        print(dt_reg.score(X_test,y_test))
        
        print("Testing Random Forests")
        print(rf_reg.score(X_test,y_test))
        
        print("Testing Adaboost")
        print(adb_reg.score(X_test,y_test))
        
        
        print("Testing Gaussian Process reg")
        print(gp_reg.score(X_test,y_test))
        
        print("Testing Linear Regression")
        print(lin_reg.score(X_test,y_test))
        
        print("Testing Neural Network/MLP")
        print(nn_reg.score(X_test,y_test))
        
        return
    
obj = student_performance()
obj.start()
        


In [None]:
import numpy as np
import numpy as np
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from io import StringIO
import scipy
import scipy.stats               # For reciprocal distribution
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer
from sklearn import svm
import sklearn.gaussian_process 
import sklearn.tree        # For DecisionTreeClassifier class
import sklearn.ensemble    # For RandomForestClassifier class
import sklearn.linear_model # For Logistic Classifier
#from sklearn.neighbors import LSHForest
import sklearn.naive_bayes #For Naive Bayes
import sklearn.neural_network #For MLP classifier
import csv
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)  # Ignore sklearn deprecation warnings
warnings.filterwarnings("ignore", category=FutureWarning)       # Ignore sklearn deprecation warnings
np.set_printoptions(precision=20, suppress=True)

class wine:
    
    def __init__(self):
        pass
    
    def SFS(self,X,y): #Feature selection as some features are useless
        size = 11
        cols = [0,1,2,3,4,5,6,7,8,9,10]
        min_score = 0
        useless_col = 20
        useless_cols = []
        rf_reg = sklearn.ensemble.RandomForestRegressor()
        while(len(cols)> 8):
            min_score = 0
            X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)
            for i in cols:
                if i  in useless_cols:
                    continue   
                X_sfs = np.column_stack((X_train[:,0:i],X_train[:,(i+1):len(cols)]))
                    
                rf_reg.fit(X_sfs,y_train);
                score = rf_reg.score(np.column_stack((X_test[:,0:i],X_test[:,(i+1):len(cols)])),y_test)
                if(score > min_score):
                    min_score = score
                    print(score)
                    print("removing %d" % i)
                    useless_col = i
            useless_cols.append(useless_col)
            #cols.pop()
            print("deleting")
            print(useless_col)
            np.delete(X,useless_col,1)
        return useless_cols

    
    def scale_data(self,X):
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)
        return(X,scaler)
    def random_CV(self,clf,X,y,param_grid,n_iter,cv):
 
        print("Starting search")
       
        random_search = model_selection.RandomizedSearchCV(clf, param_distributions = param_grid,n_iter = n_iter, cv = cv,
                                           iid = False,verbose=1,n_jobs = 4)
        random_search.fit(X, y)
        print("best parameters:", random_search.best_params_)
        print("%.1f%% accuracy on validation sets (average)" % (random_search.best_score_*100))
        return random_search.best_params_
    
    def SVR_reg(self,X,y):
        print("SVR classifier called")
        svr_clf = svm.SVR()
        param_dist = {
            'C'     : scipy.stats.reciprocal(1.0, 1000.),
            'kernel': ['rbf','linear'],
            'degree': [2],
            'gamma' : scipy.stats.reciprocal(0.01, 10.)
        }
        return self.random_CV(svr_clf,X,y,param_dist,5,3)
    
    def DTR_reg(self,X,y):
        print("Decision Tree Regressor called")
        dt_reg = sklearn.tree.DecisionTreeRegressor()
        param_dist  ={
            "splitter" : ['best', 'random'],
            "max_depth" :[None,100,150,200,250,300],
            "max_features": ["sqrt","log2",None]
        }
        return self.random_CV(dt_reg,X,y,param_dist,10,3)
    
    def RF_reg(self,X,y):
        print("Random Forest Regressor Called")
        rf_reg = sklearn.ensemble.RandomForestRegressor()
        param_dist = {
            "n_estimators" : [10,25,50,75,100,125,150,175,200],
            "max_depth" :[None,100,150,200,250,300],
            "max_features": ["sqrt","log2",None]
        }
        return self.random_CV(rf_reg,X,y,param_dist,12,3)
    
    def ADB_reg(self,X,y):
        print("Ada Boost Regressor called")
        adb_reg = sklearn.ensemble.AdaBoostRegressor()
        param_dist = {
            "n_estimators" : [50,75,100,150,200,250,300,400,500],
            "loss" : ['linear', 'square', 'exponential'],
            "learning_rate" : [0.5,0.75,1,1.5]
        }
        return self.random_CV(adb_reg,X,y,param_dist,10,3)
    
    def GP_reg (self,X,y):
        print("Gaussian Process Regression called")
        
        gp_reg = sklearn.gaussian_process.GaussianProcessRegressor()
        param_dist = {
            'alpha' : scipy.stats.reciprocal(np.exp(-12), np.exp(-1)),
            "normalize_y" : [False, True]
        }
        return self.random_CV(gp_reg,X,y,param_dist,10,3)
    
    def LR_reg(self,X,y):
        print("Linear Regression called")
        
        lr_reg = sklearn.linear_model.LinearRegression()
        param_dist = {
            'fit_intercept' : [True, False],
            'normalize': [True,False]
        }
        return self.random_CV(lr_reg,X,y,param_dist,4,3)
        
    def MLP_reg(self,X,y):
        print("Neural Network regressor Called")
        mlp_reg = sklearn.neural_network.MLPRegressor()
        param_dist = {
            "hidden_layer_sizes" : [(50,), (100,),(50,50),(100,50),(100,100),(50,50,50)],
            "activation" :['tanh', 'relu','logistic','identity'],
            "solver" : ['lbfgs', 'sgd'], #smaller Data set, no adam
            "alpha" : scipy.stats.reciprocal(0.00005,0.1),
            "learning_rate" : ['constant','invscaling'],
            "max_iter" : [100,200,300,500,700]
        }
        return self.random_CV(mlp_reg,X,y,param_dist,15,3)
    
    def train_reg(self,reg,params,X,y):
        reg.set_params(**params)
        reg.fit(X,y)
        print("Complete Training Accuracy")
        print(reg.score(X,y))
        return reg  
    
    def start(self):
        file = 'winequality-white.csv'
        
        X_float = np.loadtxt(file,delimiter = ";", dtype = np.float,skiprows=1)
        
        print(X_float[0])
        
        X,y = X_float[:,:-1],X_float[:,-1]
        
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size =0.2)
        X_train,scaler = self.scale_data(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        
        print("Calinng SFS")
     
        print("**Training of Student_performance starts**")
        
        print("__Training SVR__")
        
        #svr_reg = self.train_reg(svm.SVR(),self.SVR_reg(X_train,y_train),X_train,y_train)
        
        print("__Training DTR__")
        dt_reg = self.train_reg(sklearn.tree.DecisionTreeRegressor(),self.DTR_reg(X_train,y_train),X_train,y_train)
        
        print("__Training Random Forest Reg__")
        rf_reg = self.train_reg(sklearn.ensemble.RandomForestRegressor(),self.RF_reg(X_train,y_train),X_train,y_train)
        
        print("__Training AdaBoost Reg__")
        adb_reg = self.train_reg(sklearn.ensemble.AdaBoostRegressor(),self.ADB_reg(X_train,y_train),X_train,y_train)
        
        print("__Training Gaussian proc Reg__")
        gp_reg = self.train_reg(sklearn.gaussian_process.GaussianProcessRegressor(),self.GP_reg(X_train,y_train),X_train,y_train)
        
        print("__Training Linear Reg__")
        lin_reg = self.train_reg(sklearn.linear_model.LinearRegression(),self.LR_reg(X_train,y_train),X_train,y_train)
        
      #  print("__Training Neural Net Reg__")
       # nn_reg = self.train_reg(sklearn.neural_network.MLPRegressor(),self.MLP_reg(X_train.astype(float),y_train),X_train.astype(float),y_train)
        
        print("**Test Data Prediction Begins*")
        
        
        #print("Testing SVM Regressionr")
        #print(svr_reg.score(X_test,y_test))
        
        print("Testing Decision Trees")
        print(dt_reg.score(X_test,y_test))
        
        print("Testing Random Forests")
        print(rf_reg.score(X_test,y_test))
        
        print("Testing Adaboost")
        print(adb_reg.score(X_test,y_test))
        
        
        print("Testing Gaussian Process reg")
        print(gp_reg.score(X_test,y_test))
        
        print("Testing Linear Regression")
        print(lin_reg.score(X_test,y_test))
        
        #print("Testing Neural Network/MLP")
       # print(nn_reg.score(X_test,y_test))
        
        
        return
    
obj = wine()
obj.start()
        