In [2]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor as XGBR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from mpl_toolkits.mplot3d import Axes3D 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import log_loss
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import shap
import seaborn as sns
import numpy as np
import pandas as pd
import os

In [2]:
X = pd.read_excel('std_x0.xlsx')
y = pd.read_excel('std_y.xlsx')['Cell viability']

Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=60)

rfc = GradientBoostingRegressor(random_state=10)
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
regressor = rfc.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.44613507126237 TEST: 0.523312992930695


In [3]:
for i in range(500):
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=i)
    rfc = GradientBoostingRegressor()
    CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    regressor = rfc.fit(Xtrain, Ytrain)
    score_test = regressor.score(Xtest,Ytest)
#     if CV < CV_score and test < score_test:
#         CV = CV_score
#         test = score_test
    if CV_score>0.5 and score_test>0.55:
        print("5cv:",CV_score,"TEST:",score_test,"random_state:",i)

In [3]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=179)

rfc = GradientBoostingRegressor()
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
regressor = rfc.fit(Xtrain, Ytrain)
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))


score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)
print("rmse_5CV",rmse)

5cv: 0.5059605968581462 TEST: 0.5832641280079494
rmse_5CV 16.7106367504351


In [4]:
score_5cv_all = []
for i in range(1, 200, 1):
    rfc =GradientBoostingRegressor(random_state=i)
    score_5cv =cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
random_state_5cv = range(1, 200,1)[score_5cv_all.index(max(score_5cv_all))]

print("Best_5cv score：{}".format(score_max_5cv),
      "random_5cv:{}".format(random_state_5cv))


score_5cv_all = []
for i in range(1, 300, 1):
    rfc = GradientBoostingRegressor(n_estimators=i
                                , random_state=random_state_5cv)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
n_est_5cv = range(1,300,1)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "n_est_5cv:{}".format(n_est_5cv))


score_5cv_all = []
for i in range(1, 100, 1):
    rfc = GradientBoostingRegressor(n_estimators=n_est_5cv
                                ,random_state=random_state_5cv
                                ,max_depth=i)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
max_depth_5cv = range(1,100,1)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "max_depth_5cv:{}".format(max_depth_5cv))


score_5cv_all = []
for i in range(1,Xtrain.shape[1]+1):
    rfc = GradientBoostingRegressor(n_estimators=n_est_5cv
                                ,random_state=random_state_5cv
                                ,max_depth=max_depth_5cv
                                ,max_features=i)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
max_features_5cv = range(1, Xtrain.shape[1]+1)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "max_features_5cv:{}".format(max_features_5cv))


rfc = GradientBoostingRegressor(n_estimators=n_est_5cv
                                ,random_state=random_state_5cv
                                ,max_depth=max_features_5cv
                                , max_features=max_features_5cv
                                )

CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
mae = MAE(Ytrain,CV_predictions)
print("r2_5cv:",CV_score,"rmse_5CV",rmse,"mae_5CV",mae)
expvspred_5cv = {'Exp': Ytrain, 'Pred':CV_predictions}
pd.DataFrame(expvspred_5cv).to_excel('GBDT_5fcv_pred.xlsx')

regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
test_mae = MAE(Ytest,test_predictions)
score_test = regressor.score(Xtest,Ytest)
rmse = np.sqrt(mean_squared_error(Ytest,test_predictions))
print("r2_test:",score_test,"rmse_test",rmse,"mae_test",test_mae)
expvspred_test = {'Exp':Ytest,'Pred':test_predictions}
pd.DataFrame(expvspred_test).to_excel('GBDT_test_pred.xlsx')

Best_5cv score：0.7590838846956267 random_5cv:151
Best_5cv score：0.7615717290272597 n_est_5cv:75
Best_5cv score：0.7636534255919231 max_depth_5cv:4
Best_5cv score：0.7636534255919231 max_features_5cv:9
r2_5cv: 0.7604663703829436 rmse_5CV 0.419875004847796 mae_5CV 0.2752905919756645
r2_test: 0.8625962456219246 rmse_test 0.3388679117995989 mae_test 0.23925464460267917


## Nanoplastic

In [5]:
Xtrain = pd.read_excel("Xtrain.xlsx",index_col=0)
Xtest = pd.read_excel("Xtest.xlsx",index_col=0)
Ytrain = pd.read_excel("Ytrain.xlsx")['Cell viability']
Ytest = pd.read_excel("Ytest.xlsx")['Cell viability']

In [6]:
score_5cv_all = []
for i in range(1, 200, 1):
    rfc =GradientBoostingRegressor(random_state=i)
    score_5cv =cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
random_state_5cv = range(1, 200,1)[score_5cv_all.index(max(score_5cv_all))]

print("Best_5cv score：{}".format(score_max_5cv),
      "random_5cv:{}".format(random_state_5cv))


score_5cv_all = []
for i in range(1, 300, 1):
    rfc = GradientBoostingRegressor(n_estimators=i
                                , random_state=random_state_5cv)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
n_est_5cv = range(1,300,1)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "n_est_5cv:{}".format(n_est_5cv))


score_5cv_all = []
for i in range(1, 100, 1):
    rfc = GradientBoostingRegressor(n_estimators=n_est_5cv
                                ,random_state=random_state_5cv
                                ,max_depth=i)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
max_depth_5cv = range(1,100,1)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "max_depth_5cv:{}".format(max_depth_5cv))


score_5cv_all = []
for i in range(1,Xtrain.shape[1]+1):
    rfc = GradientBoostingRegressor(n_estimators=n_est_5cv
                                ,random_state=random_state_5cv
                                ,max_depth=max_depth_5cv
                                ,max_features=i)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
max_features_5cv = range(1, Xtrain.shape[1]+1)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "max_features_5cv:{}".format(max_features_5cv))


rfc = GradientBoostingRegressor(n_estimators=n_est_5cv
                                ,random_state=random_state_5cv
                                ,max_depth=max_features_5cv
                                , max_features=max_features_5cv
                                )

CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
mae = MAE(Ytrain,CV_predictions)
print("r2_5cv:",CV_score,"rmse_5CV",rmse,"mae_5CV",mae)
expvspred_5cv = {'Exp': Ytrain, 'Pred':CV_predictions}
pd.DataFrame(expvspred_5cv).to_excel('GBDT_5fcv_pred.xlsx')

regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
test_mae = MAE(Ytest,test_predictions)
score_test = regressor.score(Xtest,Ytest)
rmse = np.sqrt(mean_squared_error(Ytest,test_predictions))
print("r2_test:",score_test,"rmse_test",rmse,"mae_test",test_mae)
expvspred_test = {'Exp':Ytest,'Pred':test_predictions}
pd.DataFrame(expvspred_test).to_excel('GBDT_test_pred.xlsx')

Best_5cv score：0.7387295556193527 random_5cv:87
Best_5cv score：0.7387295556193527 n_est_5cv:100
Best_5cv score：0.7387295556193527 max_depth_5cv:3
Best_5cv score：0.7387295556193527 max_features_5cv:9
r2_5cv: 0.7177353849676017 rmse_5CV 1.0048036541943046 mae_5CV 0.7116046195323731
r2_test: 0.8429290838263872 rmse_test 0.7970635955098595 mae_test 0.5728904956259273


In [8]:

rfc = GradientBoostingRegressor(n_estimators=299
                                ,random_state=145
                                ,max_depth=6
                                )

In [9]:
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
mae = MAE(Ytrain,CV_predictions)
print("r2_5cv:",CV_score,"rmse_5CV",rmse,"mae_5CV",mae)
expvspred_5cv = {'Exp': Ytrain, 'Pred':CV_predictions}
pd.DataFrame(expvspred_5cv).to_excel('GBDT_5fcv_pred.xlsx')

r2_5cv: 0.7154187139333976 rmse_5CV 12.745684548395445 mae_5CV 7.922475429138053


In [10]:
regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
test_mae = MAE(Ytest,test_predictions)
score_test = regressor.score(Xtest,Ytest)
rmse = np.sqrt(mean_squared_error(Ytest,test_predictions))
print("r2_test:",score_test,"rmse_test",rmse,"mae_test",test_mae)
expvspred_test = {'Exp':Ytest,'Pred':test_predictions}
pd.DataFrame(expvspred_test).to_excel('GBDT_test_pred.xlsx')

r2_test: 0.7821613336829869 rmse_test 11.19834972432879 mae_test 7.388434216078038
