In [9]:
import numpy as np
from pandas import read_csv
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
np.set_printoptions(precision=3, suppress=True) 
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [10]:
#data = read_csv('qsar_aquatic_toxicity.csv', delimiter=';', skiprows=0)
data = np.loadtxt('qsar_aquatic_toxicity.csv', delimiter=';')
X = data[:,:-1]
y = data[:,-1]
data[0]

array([0.   , 0.   , 0.   , 2.419, 1.225, 0.667, 0.   , 0.   , 3.74 ])

In [7]:
class Aquatic_toxicity:
    
    def __init__(self):
        return
    
    def read_data(self):
        # reading data 
        data = np.loadtxt('qsar_aquatic_toxicity.csv', delimiter=';')
        return data
    
    def preprocessing(self, data):               
        # splitting data
        X = data[:,:-1]
        y = data[:,-1]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)        
        #preprocessing using standard scaler
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        return X_train, y_train, X_test, y_test
    
    def dim_Reduction(self, X_train, X_test):
        # Dimensionality Reduction using PCA from 123 dims to 20 dims
        pca = PCA(n_components = 20)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
        return X_train, X_test
        
    def cv_SVR(self, X, y):
        #scorer = make_scorer(neg_mean_squared_error)
        C_grid = [0.1, 1, 10]
        gamma_grid = np.logspace(-2, 1, 4)[0:3]
        svm = sklearn.svm.SVR(kernel='rbf')
        param_grid = { 'C' : C_grid, 'gamma' : gamma_grid, 'kernel' : ['rbf', 'sigmoid',  'linear']}
        gridcv = sklearn.model_selection.GridSearchCV(svm, param_grid, n_jobs=-1, verbose=1, cv=3)
        #, scoring = 'neg_mean_squared_error'
        gridcv.fit(X_train, y_train)
        print("best parameters:", gridcv.best_params_)
        print("%.1f%% neg mean squared error on validation sets (average)" % (gridcv.best_score_*100))
        return gridcv.best_params_
    
    def cv_DTR(self, X, y):
        dt = DecisionTreeRegressor()
        param_grid = {
            "min_samples_split" : np.random.random_sample((100,)),
            "min_samples_leaf" : np.arange(1,6),
            'max_depth': range(1, 20),
            'criterion' : ['mse', 'mae', 'friedman_mse'],
            'splitter' : ['best', 'random'],
        }
        return Aquatic_toxicity.randomCV(dt, X, y, param_grid, 100, 6)
        
    def cv_RandomForest(self, X, y):
        rf = RandomForestRegressor()
        param_grid = {
            "n_estimators" : [10*x for x in np.arange(1,50)],
            "min_samples_split" : np.random.random_sample((100,)),
            "min_samples_leaf" : np.arange(1,6),
            'max_depth': range(1, 20),
        }
        return Aquatic_toxicity.randomCV(rf, X, y, param_grid, 30, 6)
        
    def cv_adaBoost(self, X, y):
        #scorer = make_scorer(precision_score)
        ada_boost = AdaBoostRegressor(n_estimators=50, learning_rate=1)
        param_grid = {'n_estimators': range(1, 50), 'learning_rate': [0.1, 0.5, 1]}
        gridcv = sklearn.model_selection.GridSearchCV(ada_boost, param_grid, verbose=1, cv=3, n_jobs=-1)
                                                      #, scoring='explained_variance')
        gridcv.fit(X, y)
        print("best parameters:", gridcv.best_params_)
        print("%.1f%% validation on validation sets (average)" % (gridcv.best_score_))
        return gridcv.best_params_
    
    def cv_linReg(self, X, y):
        lr = LinearRegression()
        param_grid = {
            "fit_intercept" : [True, False],
        }
        return Aquatic_toxicity.randomCV(lr, X, y, param_grid, 50, 6)
        
    def cv_GP(self, X, y):
        clf = GaussianProcessRegressor()
        param_grid = {
            
        "normalize_y" : [True, False],
        "copy_X_train" : [True, False],
        "alpha" : np.linspace(0, 5, 100),
        }
        return Aquatic_toxicity.randomCV(clf, X, y, param_grid, 25, 6)
    
    def cv_NNRegressor(self, X, y):
        nn = sklearn.neural_network.MLPRegressor(hidden_layer_sizes=(50,),
                                           solver='sgd', batch_size=100, max_iter=10,
                                           learning_rate_init=.01, momentum=0.9, alpha=0.05,
                                           verbose=False, random_state=0)

        param_grid ={
                    'hidden_layer_sizes' : range(2,100),
                    "activation" : ['identity', 'logistic', 'tanh', 'relu']
                    }
        return Aquatic_toxicity.randomCV(nn, X, y, param_grid, 100, 6)
        
    def randomCV(clf, X, y, param_grid, n_iter, cv):
        #scorer = make_scorer(precision_score)
        random_search = RandomizedSearchCV(clf, param_distributions = param_grid, n_iter = n_iter, cv = cv, iid = False)
        #scoring = "explained_variance"
        random_search.fit(X, y)
        #print(random_search.cv_results_)
        Aquatic_toxicity.report(random_search.cv_results_)
        return random_search.best_params_
    
    def report(results, n_top=1):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            k = 0
            for candidate in candidates:                
                print("Model with rank: {0}".format(i))
                print("Variance on validation data: {0:.3f} (std: {1:.3f})".format(
                      results['mean_test_score'][candidate],
                      results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")
                k += 1
                if k == 3:
                    break
                
    def predict(self, model, X_test, y_test):
        predict = model.predict(X_test)
        predict[predict<0] =0
        rmse = mean_squared_error(y_test, predict)
        print("MSE on test data : ", rmse)

In [28]:
X_train

array([[ 1.258, -0.652, -0.571, ..., -0.18 , -0.711, -0.415],
       [ 1.276,  1.149,  0.007, ..., -0.556,  0.67 , -0.415],
       [-1.016, -0.843, -0.571, ..., -0.825, -0.711, -0.415],
       ...,
       [-1.016, -0.843, -0.571, ...,  0.857, -0.711, -0.415],
       [ 0.373,  0.647,  0.585, ...,  0.857, -0.021,  0.799],
       [-0.602, -0.253,  0.007, ..., -0.1  , -0.711, -0.415]])

In [30]:
if __name__ == "__main__":
    obj = Aquatic_toxicity()
    data = obj.read_data()
    X_train, y_train, X_test, y_test = obj.preprocessing(data)
    print('---------DTR--------')
    model = obj.cv_DTR(X_train, y_train)
    reg = sklearn.tree.DecisionTreeRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)

---------DTR--------
Model with rank: 1
Variance on validation data: 0.402 (std: 0.080)
Parameters: {'splitter': 'random', 'min_samples_split': 0.06846205825429486, 'min_samples_leaf': 3, 'max_depth': 16, 'criterion': 'friedman_mse'}

MSE on test data :  2.0598038388339965


In [8]:
if __name__ == "__main__":
    obj = Aquatic_toxicity()
    data = obj.read_data()
    X_train, y_train, X_test, y_test = obj.preprocessing(data)
    # X_train, X_test = obj.dim_Reduction(X_train, X_test)
    print('---------SVR--------')
    model = obj.cv_SVR(X_train, y_train)
    reg = sklearn.svm.SVR().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------DTR--------')
    model = obj.cv_DTR(X_train, y_train)
    reg = sklearn.tree.DecisionTreeRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Random Forrest Regressor--------')
    # taking more than 3 mins 
    model = obj.cv_RandomForest(X_train, y_train)
    reg = sklearn.ensemble.RandomForestRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Adaboost Regressor--------')
    model = obj.cv_adaBoost(X_train, y_train)
    reg = sklearn.ensemble.AdaBoostRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Gaussian Process Regressor--------')
    model = obj.cv_GP(X_train, y_train)
    reg = sklearn.gaussian_process.GaussianProcessRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Linear Regressor--------')
    model = obj.cv_linReg(X_train, y_train)
    reg = LinearRegression().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------NN Regressor--------')
    model = obj.cv_NNRegressor(X_train, y_train)
    reg = MLPRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    

---------SVR--------
Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:    0.5s finished


best parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
52.1% neg mean squared error on validation sets (average)
MSE on test data :  1.395885346501551
---------DTR--------
Model with rank: 1
Variance on validation data: 0.392 (std: 0.041)
Parameters: {'splitter': 'random', 'min_samples_split': 0.09884540354706373, 'min_samples_leaf': 5, 'max_depth': 14, 'criterion': 'mse'}

MSE on test data :  2.2268043378391127
---------Random Forrest Regressor--------
Model with rank: 1
Variance on validation data: 0.482 (std: 0.050)
Parameters: {'n_estimators': 180, 'min_samples_split': 0.04761171036789935, 'min_samples_leaf': 3, 'max_depth': 17}

MSE on test data :  1.7227859276276762
---------Adaboost Regressor--------
Fitting 3 folds for each of 147 candidates, totalling 441 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 426 out of 441 | elapsed:    5.7s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 441 out of 441 | elapsed:    5.9s finished


best parameters: {'learning_rate': 1, 'n_estimators': 18}
0.5% validation on validation sets (average)
MSE on test data :  1.6474053608017445
---------Gaussian Process Regressor--------
Model with rank: 1
Variance on validation data: 0.527 (std: 0.057)
Parameters: {'normalize_y': True, 'copy_X_train': False, 'alpha': 1.1111111111111112}

MSE on test data :  1.4958514587168728
---------Linear Regressor--------




Model with rank: 1
Variance on validation data: 0.448 (std: 0.051)
Parameters: {'fit_intercept': True}

MSE on test data :  1.4197727187578515
---------NN Regressor--------
Model with rank: 1
Variance on validation data: 0.474 (std: 0.053)
Parameters: {'hidden_layer_sizes': 35, 'activation': 'tanh'}

MSE on test data :  1.4026524538929086
