In [5]:
import numpy as np
from pandas import read_csv
import sklearn
from io import StringIO
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
np.set_printoptions(precision=3, suppress=True) 
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [27]:
data = np.loadtxt('compresive_strength_concrete.csv',  skiprows=1, delimiter=',')
data

array([[540.  ,   0.  ,   0.  , ..., 676.  ,  28.  ,  79.99],
       [540.  ,   0.  ,   0.  , ..., 676.  ,  28.  ,  61.89],
       [332.5 , 142.5 ,   0.  , ..., 594.  , 270.  ,  40.27],
       ...,
       [148.5 , 139.4 , 108.6 , ..., 780.  ,  28.  ,  23.7 ],
       [159.1 , 186.7 ,   0.  , ..., 788.9 ,  28.  ,  32.77],
       [260.9 , 100.5 ,  78.3 , ..., 761.5 ,  28.  ,  32.4 ]])

In [10]:
class Concrete:
    
    def __init__(self):
        return
    
    # reading data 
    def read_data(self):        
        #Dropping not predictive attributes : instant
        data = np.loadtxt('compresive_strength_concrete.csv',  skiprows=1, delimiter=',')
        return data
    
    def preprocessing(self, data):               
        # splitting data
        X = data[:,:-1]
        y = data[:,-1]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)        
        #preprocessing using standard scaler
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        return X_train, y_train, X_test, y_test
        
    def cv_SVR(self, X, y):
        #scorer = make_scorer(neg_mean_squared_error)
        C_grid = [0.1, 1, 10]
        gamma_grid = np.logspace(-2, 1, 4)[0:3]
        svm = sklearn.svm.SVR(kernel='rbf')
        param_grid = { 'C' : C_grid, 'gamma' : gamma_grid, 'kernel' : ['rbf', 'sigmoid',  'linear']}
        gridcv = sklearn.model_selection.GridSearchCV(svm, param_grid, n_jobs=-1, verbose=1, cv=3)
        #, scoring = 'neg_mean_squared_error'
        gridcv.fit(X_train, y_train)
        print("best parameters:", gridcv.best_params_)
        print("%.1f%%  on validation sets (average)" % (gridcv.best_score_*100))
        return gridcv.best_params_
    
    def cv_DTR(self, X, y):
        dt = DecisionTreeRegressor()
        param_grid = {
            "min_samples_split" : np.random.random_sample((100,)),
            "min_samples_leaf" : np.arange(1,6),
            'max_depth': range(1, 20),
            'criterion' : ['mse', 'mae', 'friedman_mse'],
            'splitter' : ['best', 'random'],
        }
        return Concrete.randomCV(dt, X, y, param_grid, 50, 6)
        
    def cv_RandomForest(self, X, y):
        rf = RandomForestRegressor()
        param_grid = {
            #"n_estimators" : [10*x for x in np.arange(1,25)],
            "min_samples_split" : np.random.random_sample((100,)),
            "min_samples_leaf" : np.arange(1,6),
            'max_depth': range(1, 20),
        }
        return Concrete.randomCV(rf, X, y, param_grid, 40, 6)
    
    def cv_GP(self, X, y):
        clf = GaussianProcessRegressor()
        param_grid = {
            
        "normalize_y" : [True, False],
        "copy_X_train" : [True, False],
        "alpha" : np.linspace(0, 5, 100),
        }
        return Concrete.randomCV(clf, X, y, param_grid, 25, 6)
        
    def cv_adaBoost(self, X, y):
        #scorer = make_scorer(precision_score)
        ada_boost = AdaBoostRegressor(n_estimators=50, learning_rate=1)
        param_grid = {'n_estimators': range(1, 50), 'learning_rate': [0.1, 0.5, 1]}
        gridcv = sklearn.model_selection.GridSearchCV(ada_boost, param_grid, verbose=1, cv=3, n_jobs=-1)
                                                      #, scoring='explained_variance')
        gridcv.fit(X, y)
        print("best parameters:", gridcv.best_params_)
        print("%.1f%% validation on validation sets (average)" % (gridcv.best_score_))
        return gridcv.best_params_
    
    def cv_linReg(self, X, y):
        lr = LinearRegression()
        param_grid = {
            "fit_intercept" : [True, False],
        }
        return Concrete.randomCV(lr, X, y, param_grid, 40, 6)
        
   
    
    def cv_NNRegressor(self, X, y):
        nn = sklearn.neural_network.MLPRegressor()

        param_grid ={
                    'hidden_layer_sizes' : range(2,100),
                    "activation" : ['identity', 'logistic', 'tanh', 'relu']
                    }
        return Concrete.randomCV(nn, X, y, param_grid, 100, 6)
        
    def randomCV(clf, X, y, param_grid, n_iter, cv):
        #scorer = make_scorer(precision_score)
        random_search = RandomizedSearchCV(clf, param_distributions = param_grid, n_iter = n_iter, cv = cv, iid = False, 
                                           verbose=1, n_jobs=-1)
        #scoring = "explained_variance"
        random_search.fit(X, y)
        #print(random_search.cv_results_)
        Concrete.report(random_search.cv_results_)
        return random_search.best_params_
    
    def report(results, n_top=1):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            k = 0
            for candidate in candidates:                
                print("Model with rank: {0}".format(i))
                print("Variance on validation data: {0:.3f} (std: {1:.3f})".format(
                      results['mean_test_score'][candidate],
                      results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")
                k += 1
                if k == 3:
                    break
                
    def predict(self, model, X_test, y_test):
        predict = model.predict(X_test)
        predict[predict<0] =0
        rmse = mean_squared_error(y_test, predict)
        print("MSE on test data : ", rmse)

In [31]:
X_train

array([[ 1.919, -0.881, -0.846, ..., -0.46 , -0.644, -0.288],
       [ 0.915, -0.881, -0.846, ...,  0.852, -0.187, -0.288],
       [ 0.232,  0.704, -0.846, ..., -0.981, -0.625, -0.288],
       ...,
       [-1.294, -0.881,  1.868, ..., -0.37 ,  0.874, -0.288],
       [-0.38 ,  3.192, -0.846, ..., -0.388, -1.343, -0.288],
       [-0.835,  2.383, -0.846, ..., -0.54 , -0.704,  0.716]])

In [11]:
if __name__ == "__main__":
    obj = Concrete()
    data = obj.read_data()
    X_train, y_train, X_test, y_test = obj.preprocessing(data)
    #print(X_train)
    #print(y_train)
    print('---------SVR--------')
    model = obj.cv_SVR(X_train, y_train)
    reg = sklearn.svm.SVR().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------DTR--------')
    model = obj.cv_DTR(X_train, y_train)
    reg = sklearn.tree.DecisionTreeRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Random Forrest Regressor--------') 
    model = obj.cv_RandomForest(X_train, y_train)
    reg = sklearn.ensemble.RandomForestRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Adaboost Regressor--------')
    model = obj.cv_adaBoost(X_train, y_train)
    reg = sklearn.ensemble.AdaBoostRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Gaussian Process Regressor--------')
    model = obj.cv_GP(X_train, y_train)
    reg = sklearn.gaussian_process.GaussianProcessRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Linear Regressor--------')
    model = obj.cv_linReg(X_train, y_train)
    reg = LinearRegression().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------NN Regressor--------')
    model = obj.cv_NNRegressor(X_train, y_train)
    reg = MLPRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)

---------SVR--------
Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:    6.5s finished


best parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
78.5%  on validation sets (average)
MSE on test data :  46.61412620783457
---------DTR--------
Fitting 6 folds for each of 50 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Model with rank: 1
Variance on validation data: 0.770 (std: 0.028)
Parameters: {'splitter': 'best', 'min_samples_split': 0.029864188153626903, 'min_samples_leaf': 3, 'max_depth': 19, 'criterion': 'mae'}

MSE on test data :  63.28207220873787
---------Random Forrest Regressor--------
Fitting 6 folds for each of 40 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done 100 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Model with rank: 1
Variance on validation data: 0.821 (std: 0.015)
Parameters: {'min_samples_split': 0.04158547816075708, 'min_samples_leaf': 2, 'max_depth': 17}

MSE on test data :  43.18212179887168
---------Adaboost Regressor--------
Fitting 3 folds for each of 147 candidates, totalling 441 fits


[Parallel(n_jobs=-1)]: Done 261 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 426 out of 441 | elapsed:    8.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 441 out of 441 | elapsed:    8.4s finished


best parameters: {'learning_rate': 1, 'n_estimators': 48}
0.8% validation on validation sets (average)
MSE on test data :  54.250622587752765
---------Gaussian Process Regressor--------
Fitting 6 folds for each of 25 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    5.3s finished


Model with rank: 1
Variance on validation data: 0.829 (std: 0.022)
Parameters: {'normalize_y': True, 'copy_X_train': False, 'alpha': 0.35353535353535354}

MSE on test data :  37.86086474231978
---------Linear Regressor--------
Fitting 6 folds for each of 2 candidates, totalling 12 fits
Model with rank: 1
Variance on validation data: 0.599 (std: 0.030)
Parameters: {'fit_intercept': True}

MSE on test data :  95.61717380589968
---------NN Regressor--------
Fitting 6 folds for each of 100 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  3.0min finished


Model with rank: 1
Variance on validation data: 0.688 (std: 0.022)
Parameters: {'hidden_layer_sizes': 97, 'activation': 'tanh'}

MSE on test data :  63.869742902766546
