In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import os
import warnings
warnings.filterwarnings('ignore')

In [42]:
theFile='../data/processed/train_mq_fish_toxicity.csv'

In [43]:
datadf = pd.read_csv(theFile)
print("Data Shape")
print(datadf.shape)
print("Data Values")
datadf.head()   

Data Shape
(726, 7)
Data Values


Unnamed: 0,CIC0,SM1_Dz(Z),GATS1i,NdsCH,NdssC,MLOGP,responseLC50
0,3.459,1.47,1.483,0,2,0.491,4.486
1,3.833,0.405,1.522,0,1,3.492,2.092
2,4.717,0.223,1.288,1,1,4.804,6.384
3,3.609,0.811,1.208,0,0,3.291,4.38
4,3.103,0.134,1.016,0,0,2.193,3.21


In [44]:
numeric_feat = [ "CIC0","SM1_Dz(Z)","GATS1i","MLOGP","responseLC50" ]
category_feat = ["NdsCH","NdssC"]

In [45]:
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()

In [46]:
dfi2 = datadf.copy(deep = True)

In [47]:
numeric_feat.remove('responseLC50')

In [48]:
dfi2[numeric_feat].head()

Unnamed: 0,CIC0,SM1_Dz(Z),GATS1i,MLOGP
0,3.459,1.47,1.483,0.491
1,3.833,0.405,1.522,3.492
2,4.717,0.223,1.288,4.804
3,3.609,0.811,1.208,3.291
4,3.103,0.134,1.016,2.193


In [49]:
dfi2[category_feat].head()

Unnamed: 0,NdsCH,NdssC
0,0,2
1,0,1
2,1,1
3,0,0
4,0,0


In [50]:
#one hot encoding
datadfv2 = pd.get_dummies(dfi2, columns = category_feat, drop_first = True)

In [51]:
datadfv2.head()

Unnamed: 0,CIC0,SM1_Dz(Z),GATS1i,MLOGP,responseLC50,NdsCH_1,NdsCH_2,NdsCH_3,NdsCH_4,NdssC_1,NdssC_2,NdssC_3,NdssC_4,NdssC_6
0,3.459,1.47,1.483,0.491,4.486,0,0,0,0,0,1,0,0,0
1,3.833,0.405,1.522,3.492,2.092,0,0,0,0,1,0,0,0,0
2,4.717,0.223,1.288,4.804,6.384,1,0,0,0,1,0,0,0,0
3,3.609,0.811,1.208,3.291,4.38,0,0,0,0,0,0,0,0,0
4,3.103,0.134,1.016,2.193,3.21,0,0,0,0,0,0,0,0,0


In [52]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [53]:
datadfv2[numeric_feat] = scaler.fit_transform(datadfv2[numeric_feat])

In [54]:
datadfv2.head(5)

Unnamed: 0,CIC0,SM1_Dz(Z),GATS1i,MLOGP,responseLC50,NdsCH_1,NdsCH_2,NdsCH_3,NdsCH_4,NdssC_1,NdssC_2,NdssC_3,NdssC_4,NdssC_6
0,0.760991,1.940217,0.482268,-1.148189,4.486,0,0,0,0,0,1,0,0,0
1,1.257042,-0.554364,0.579053,0.980847,2.092,0,0,0,0,1,0,0,0,0
2,2.429527,-0.980668,-0.001654,1.911635,6.384,1,0,0,0,1,0,0,0,0
3,0.959942,0.396622,-0.200187,0.838249,4.38,0,0,0,0,0,0,0,0,0
4,0.288813,-1.189136,-0.676665,0.059281,3.21,0,0,0,0,0,0,0,0,0


In [55]:
from sklearn.model_selection import train_test_split

In [56]:
X = datadfv2.drop(['responseLC50'], axis = 1)
y = datadfv2[['responseLC50']]

In [59]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics

In [114]:
from sklearn.ensemble import RandomForestRegressor
random_forest = RandomForestRegressor(n_estimators=25, random_state = 5,max_depth=15,max_features=1)

In [115]:
from sklearn.model_selection import RepeatedKFold 
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(random_forest, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

Mean MAE: 0.644 (0.082)


In [102]:
import xgboost as xgb
from sklearn.model_selection import RepeatedKFold 

In [110]:
model = xgb.XGBRegressor(n_estimators=1000, max_depth=700, eta=0.1, subsample=0.7, colsample_bytree=0.8)

In [111]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [112]:
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

Mean MAE: 0.656 (0.075)


In [116]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

In [117]:
GBR = GradientBoostingRegressor()

In [118]:
parameters = {'learning_rate': [0.01,0.02,0.03,0.04],
                  'subsample'    : [0.9, 0.5, 0.2, 0.1],
                  'n_estimators' : [100,500,1000, 1500],
                  'max_depth'    : [4,6,8,10]
                 }

In [119]:
grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters, cv = cv, n_jobs=-1)
grid_GBR.fit(X, y)

In [120]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingRegressor(learning_rate=0.01, max_depth=10, n_estimators=500,
                          subsample=0.2)

 The best score across ALL searched params:
 0.6136181690548421

 The best parameters across ALL searched params:
 {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 500, 'subsample': 0.2}
