In [1]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor as XGBR
from interpret.glassbox import ExplainableBoostingRegressor
#, LinearRegression, RegressionTree
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from mpl_toolkits.mplot3d import Axes3D 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import log_loss
from sklearn.model_selection import learning_curve
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import shap
import seaborn as sns
import numpy as np
import pandas as pd
import os

In [2]:
# 定义一个函数来执行 Y Scrambling 并评估模型
def perform_y_scrambling(model, X_train, X_val, Y_train, Y_val, num_iterations=10):
   original_score = model.fit(X_train, Y_train).score(X_val, Y_val)
   print("Original Model Score:", original_score)

   scores = []
   for i in range(num_iterations):
      Y_train_scrambled = np.random.permutation(Y_train)  # Shuffle the target variable
      model.fit(X_train, Y_train_scrambled)  # Retrain the model on scrambled data
      score = model.score(X_val, Y_val)  # Evaluate the model on validation set
      scores.append(score)

   avg_score = np.mean(scores)
   print("Y Scrambling Average Score:", avg_score)

   score_difference = original_score - avg_score
   print("Score Difference:", score_difference)

In [131]:
# 导入数据(anti)
data = pd.read_excel('./data_anti.xlsx')
data = data.loc[:,["material","concentration(umol/ml)","shape","diameter(nm)","temperature(℃)","time(hr)","bacteria","ZOI(mm)"]] # ZOI
#data = data.loc[:,["material","concentration(umol/ml)","shape","diameter(nm)","temperature(℃)","time(hr)","bacteria","rate(%)"]] # rate
#data = data.loc[:,["material","shape","diameter(nm)","temperature(℃)","time(hr)","bacteria","MIC(ug/ml)"]] # MIC
#data = data.loc[:,["material","shape","diameter(nm)","temperature(℃)","time(hr)","bacteria","MBC(ug/ml)"]] # MBC
#data = data.loc[:,["material","concentration(ug/ml)","shape","temperature(℃)","time(hr)","bacteria","ZOI(mm)"]] # ZOI—shape-无size
#data = data.loc[:,["material","concentration(umol/ml)","shape","diameter(nm)","temperature(℃)","time(hr)","bacteria","Density(cfu/ml)","ZOI(mm)"]] # ZOI-Density
data = data[data["shape"].isin(["spherical"])]
data=data.dropna(axis=0)
data.index = range(data.shape[0])#删除有缺失值的行之后恢复索引

# 独热编码
x_onehot = data.loc[:,["material","shape","bacteria"]]
ohe = OneHotEncoder(categories='auto').fit(x_onehot)#auto让模型自行查看有哪些类别
result = ohe.transform(x_onehot).toarray()
result = pd.DataFrame(result, columns=list(ohe.get_feature_names_out()))
data = pd.concat([data,result],axis=1)
data.drop(["material","shape","bacteria"],axis=1,inplace=True)
X = data.drop(["ZOI(mm)"],axis=1)
y=data["ZOI(mm)"]
#y=np.log(y)

#对特征进行数据标准化
X = StandardScaler().fit_transform(X)

In [9]:
# 导入数据（shape）
data = pd.read_excel('./data_anti.xlsx')
data = data.loc[:,["material","concentration(umol/ml)","shape","temperature(℃)","time(hr)","bacteria","ZOI(mm)"]]
data=data.dropna(axis=0)
data.index = range(data.shape[0])#删除有缺失值的行之后恢复索引

# 独热编码
x_onehot = data.loc[:,["material","shape","bacteria"]]
ohe = OneHotEncoder(categories='auto').fit(x_onehot)#auto让模型自行查看有哪些类别
result = ohe.transform(x_onehot).toarray()
result = pd.DataFrame(result, columns=list(ohe.get_feature_names_out()))
data = pd.concat([data,result],axis=1)
data.drop(["material","shape","bacteria"],axis=1,inplace=True)
X = data.drop(["ZOI(mm)"],axis=1)
Columns = X.columns
y=data["ZOI(mm)"]

#对特征进行数据标准化
X_std = StandardScaler().fit_transform(X)
X = pd.DataFrame(X_std)
X.columns = Columns

In [17]:
# 导入数据(cell viability)
data = pd.read_excel('./data_Cell_Viability.xlsx')
data = data.loc[:,["Material","Concentration(umol/ml)","Diameter(nm)","Cell_Type","Cell_Source","Cell_Tissue"
                   ,"Cell_Morphology","Cell Line_P Cell","Time(hr)","Test","Cell_Viability(%)"]]
# data = data[data["shape"].isin(["spherical"])]
data=data.dropna(axis=0)
data.index = range(data.shape[0])#删除有缺失值的行之后恢复索引

# 独热编码
x_onehot = data.loc[:,["Material","Cell_Type","Cell_Source","Cell_Tissue","Cell_Morphology","Cell Line_P Cell","Test"]]
ohe = OneHotEncoder(categories='auto').fit(x_onehot)#auto让模型自行查看有哪些类别
result = ohe.transform(x_onehot).toarray()
result = pd.DataFrame(result, columns=list(ohe.get_feature_names_out()))
data = pd.concat([data,result],axis=1)
data.drop(["Material","Cell_Type","Cell_Source","Cell_Tissue","Cell_Morphology","Cell Line_P Cell","Test"],axis=1,inplace=True)
X = data.drop(["Cell_Viability(%)"],axis=1)
y=data["Cell_Viability(%)"]

#对特征进行数据标准化
X = StandardScaler().fit_transform(X)


# DecisionTreeRegressor

In [18]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=60)

rfc = DecisionTreeRegressor(random_state=60)
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
regressor = rfc.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.5730186806653625 TEST: 0.7184342122409468


In [22]:
for i in range(500):
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=i)
    rfc = DecisionTreeRegressor(random_state=60)
    CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    regressor = rfc.fit(Xtrain, Ytrain)
    score_test = regressor.score(Xtest,Ytest)
#     if CV < CV_score and test < score_test:
#         CV = CV_score
#         test = score_test
    if CV_score>0.63 and score_test>0.63:
        print("5cv:",CV_score,"TEST:",score_test,"random_state:",i)

5cv: 0.669486934434606 TEST: 0.6410587598534746 random_state: 274
5cv: 0.6323982551959595 TEST: 0.6448464613862462 random_state: 417
5cv: 0.6472965202393649 TEST: 0.6428198152717066 random_state: 420


In [33]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=417)

rfc = DecisionTreeRegressor(random_state=60)
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
regressor = rfc.fit(Xtrain, Ytrain)
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))

score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.6323982551959595 TEST: 0.6448464613862462


In [34]:
score_5cv_all = []
for i in range(0, 200, 1):
    rfc =DecisionTreeRegressor(random_state=i)
    score_5cv =cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
random_state_5cv = range(0, 200)[score_5cv_all.index(max(score_5cv_all))]

print("Best_5cv score：{}".format(score_max_5cv),
      "random_5cv:{}".format(random_state_5cv))


score_5cv_all = []
for i in range(1, 300, 1):
    rfc = DecisionTreeRegressor(random_state=random_state_5cv
                                ,max_depth=i)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
max_depth_5cv = range(1,300)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "max_depth_5cv:{}".format(max_depth_5cv))


score_5cv_all = []
for i in range(1,X.shape[1]+1):
    rfc = DecisionTreeRegressor(random_state=random_state_5cv
                                ,max_depth=max_depth_5cv
                                ,max_features=i)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
max_features_5cv = range(1, X.shape[1]+1)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "max_features_5cv:{}".format(max_features_5cv))

Best_5cv score：0.6393079021659165 random_5cv:10
Best_5cv score：0.6393079021659165 max_depth_5cv:43
Best_5cv score：0.6393079021659165 max_features_5cv:177


In [36]:
rfc = DecisionTreeRegressor(random_state=random_state_5cv
                            ,max_depth=max_depth_5cv
                            ,max_features=max_features_5cv)

CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
print("r2_5cv:",CV_score,"rmse_5CV",rmse)

regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest,Ytest)
rmse = np.sqrt(mean_squared_error(Ytest,test_predictions))
print("r2_test:",score_test,"rmse_test",rmse)

r2_5cv: 0.6393079021659165 rmse_5CV 19.15017263015997
r2_test: 0.6370692555689275 rmse_test 18.74550767055514


In [37]:
# 执行Y-Scrambling
perform_y_scrambling(rfc, Xtrain, Xtest, Ytrain, Ytest, num_iterations=10)

Original Model Score: 0.6370692555689275
Y Scrambling Average Score: -1.2412528138330434
Score Difference: 1.8783220694019709


# GBDT

In [16]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=60)

rfc = GradientBoostingRegressor(random_state=10)
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
regressor = rfc.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.4009328208275399 TEST: 0.33467350945441376


In [21]:
for i in range(500):
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=i)
    rfc = GradientBoostingRegressor(random_state=60)
    CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    regressor = rfc.fit(Xtrain, Ytrain)
    score_test = regressor.score(Xtest,Ytest)
#     if CV < CV_score and test < score_test:
#         CV = CV_score
#         test = score_test
    if CV_score>0.39 and score_test>0.39:
        print("5cv:",CV_score,"TEST:",score_test,"random_state:",i)

5cv: 0.39504474345831353 TEST: 0.3964659021646513 random_state: 290
5cv: 0.39721416182303015 TEST: 0.3977864376676816 random_state: 296
5cv: 0.3961655631986966 TEST: 0.41742656297727576 random_state: 384


In [12]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=384)

rfc = GradientBoostingRegressor(random_state=60)
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
regressor = rfc.fit(Xtrain, Ytrain)
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))

score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)
print("rmse_5CV",rmse)

5cv: 0.3961655631986966 TEST: 0.41742656297727576
rmse_5CV 5.900849811315124


In [13]:
score_5cv_all = []
for i in range(0, 200, 1):
    rfc =GradientBoostingRegressor(random_state=i)
    score_5cv =cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
random_state_5cv = range(0, 200)[score_5cv_all.index(max(score_5cv_all))]

print("Best_5cv score：{}".format(score_max_5cv),
      "random_5cv:{}".format(random_state_5cv))


score_5cv_all = []
for i in range(1, 400, 1):
    rfc = GradientBoostingRegressor(n_estimators=i
                                , random_state=random_state_5cv)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
n_est_5cv = range(1,400)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "n_est_5cv:{}".format(n_est_5cv))


score_5cv_all = []
for i in range(1, 300, 1):
    rfc = GradientBoostingRegressor(n_estimators=n_est_5cv
                                ,random_state=random_state_5cv
                                ,max_depth=i)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
max_depth_5cv = range(1,300)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "max_depth_5cv:{}".format(max_depth_5cv))


score_5cv_all = []
for i in range(1,X.shape[1]+1):
    rfc = GradientBoostingRegressor(n_estimators=n_est_5cv
                                ,random_state=random_state_5cv
                                ,max_depth=max_depth_5cv
                                ,max_features=i)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
max_features_5cv = range(1, X.shape[1]+1)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "max_features_5cv:{}".format(max_features_5cv))

Best_5cv score：0.3966816697131931 random_5cv:181
Best_5cv score：0.5175881838705316 n_est_5cv:398
Best_5cv score：0.5490509970799138 max_depth_5cv:5
Best_5cv score：0.570193080795311 max_features_5cv:53


In [14]:
rfc = GradientBoostingRegressor(n_estimators=n_est_5cv
                                ,random_state=random_state_5cv
                                ,max_depth=max_depth_5cv
                                ,max_features=max_features_5cv)

CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
print("r2_5cv:",CV_score,"rmse_5CV",rmse)

regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest,Ytest)
rmse = np.sqrt(mean_squared_error(Ytest,test_predictions))
print("r2_test:",score_test,"rmse_test",rmse)

r2_5cv: 0.570193080795311 rmse_5CV 4.963273891915994
r2_test: 0.6022880645755647 rmse_test 4.62709518189357


In [25]:
# 执行Y-Scrambling
perform_y_scrambling(rfc, Xtrain, Xtest, Ytrain, Ytest, num_iterations=10)

Original Model Score: 0.6022880645755647
Y Scrambling Average Score: -0.3296364393061249
Score Difference: 0.9319245038816897


# RF

In [45]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=60)

rfc = RandomForestRegressor(random_state=60)
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
regressor = rfc.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.7499116941965088 TEST: 0.8187416222746775


In [49]:
for i in range(500):
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=i)
    rfc = RandomForestRegressor(random_state=60)
    CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    regressor = rfc.fit(Xtrain, Ytrain)
    score_test = regressor.score(Xtest,Ytest)
#     if CV < CV_score and test < score_test:
#         CV = CV_score
#         test = score_test
    if CV_score>0.76 and score_test>0.76:
        print("5cv:",CV_score,"TEST:",score_test,"random_state:",i)

5cv: 0.7629512796453077 TEST: 0.7632891180103841 random_state: 0
5cv: 0.7642267482018694 TEST: 0.8037475712987346 random_state: 1
5cv: 0.7613585792593967 TEST: 0.7837469090437171 random_state: 14
5cv: 0.7617132008074494 TEST: 0.8258982260580247 random_state: 28
5cv: 0.7608221414510631 TEST: 0.7849820198005792 random_state: 33
5cv: 0.7607054364686913 TEST: 0.7777822408818275 random_state: 36
5cv: 0.7600576262242604 TEST: 0.7930742472380755 random_state: 41
5cv: 0.7666925049211479 TEST: 0.7682971140484884 random_state: 47
5cv: 0.7603197169031192 TEST: 0.7736019436905303 random_state: 48
5cv: 0.7602160072403059 TEST: 0.7976440024458226 random_state: 57
5cv: 0.7642165581037795 TEST: 0.7815132854905553 random_state: 61
5cv: 0.7610061573950347 TEST: 0.7859530719875533 random_state: 63
5cv: 0.7723105115167421 TEST: 0.7618236966491785 random_state: 76
5cv: 0.7609029757569863 TEST: 0.7706243474237167 random_state: 81
5cv: 0.7610671224442708 TEST: 0.7663728781062513 random_state: 89
5cv: 0.76878

KeyboardInterrupt: 

In [50]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=28)

rfc = RandomForestRegressor(random_state=60)
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
regressor = rfc.fit(Xtrain, Ytrain)
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))

score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)
print("rmse_5CV",rmse)

5cv: 0.7617132008074494 TEST: 0.8258982260580247
rmse_5CV 15.513545173744722


In [51]:
score_5cv_all = []
for i in range(0, 200, 1):
    rfc =RandomForestRegressor(random_state=i)
    score_5cv =cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
random_state_5cv = range(0, 200)[score_5cv_all.index(max(score_5cv_all))]

print("Best_5cv score：{}".format(score_max_5cv),
      "random_5cv:{}".format(random_state_5cv))


score_5cv_all = []
for i in range(1, 400, 1):
    rfc = RandomForestRegressor(n_estimators=i
                                , random_state=random_state_5cv)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
n_est_5cv = range(1,400)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "n_est_5cv:{}".format(n_est_5cv))


score_5cv_all = []
for i in range(1, 300, 1):
    rfc = RandomForestRegressor(n_estimators=n_est_5cv
                                ,random_state=random_state_5cv
                                ,max_depth=i)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
max_depth_5cv = range(1,300)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "max_depth_5cv:{}".format(max_depth_5cv))


score_5cv_all = []
for i in range(1,X.shape[1]+1):
    rfc = RandomForestRegressor(n_estimators=n_est_5cv
                                ,random_state=random_state_5cv
                                ,max_depth=max_depth_5cv
                                ,max_features=i)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
max_features_5cv = range(1, X.shape[1]+1)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "max_features_5cv:{}".format(max_features_5cv))

Best_5cv score：0.7691301478966664 random_5cv:147
Best_5cv score：0.7694243280528342 n_est_5cv:104
Best_5cv score：0.7698454059634405 max_depth_5cv:37
Best_5cv score：0.7759940322027663 max_features_5cv:64


In [52]:
rfc = RandomForestRegressor(n_estimators=n_est_5cv
                            ,random_state=random_state_5cv
                            ,max_depth=max_depth_5cv
                            ,max_features=max_features_5cv)

CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
print("r2_5cv:",CV_score,"rmse_5CV",rmse)

regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest,Ytest)
rmse = np.sqrt(mean_squared_error(Ytest,test_predictions))
print("r2_test:",score_test,"rmse_test",rmse)

r2_5cv: 0.7759940322027663 rmse_5CV 15.039248638171767
r2_test: 0.8195731105192405 rmse_test 13.337415704496912


In [53]:
# 执行Y-Scrambling
perform_y_scrambling(rfc, Xtrain, Xtest, Ytrain, Ytest, num_iterations=10)

Original Model Score: 0.8195731105192405
Y Scrambling Average Score: -0.1977061461504869
Score Difference: 1.0172792566697275


# XGBoost

In [54]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=60)
XGB = XGBR(random_state=60)
CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
regressor = XGB.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.7837497580879802 TEST: 0.8334361302582743


In [55]:
for i in range(500):
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=i)
    XGB = XGBR(random_state=60)
    CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    regressor = XGB.fit(Xtrain, Ytrain)
    score_test = regressor.score(Xtest,Ytest)
#     if CV < CV_score and test < score_test:
#         CV = CV_score
#         test = score_test
    if CV_score>0.8 and score_test>0.8:
        print("5cv:",CV_score,"TEST:",score_test,"random_state:",i)

5cv: 0.8153910675056721 TEST: 0.8032773294088013 random_state: 80
5cv: 0.8026573856648401 TEST: 0.8136876611493065 random_state: 89
5cv: 0.8012860981959211 TEST: 0.8253207291656163 random_state: 231
5cv: 0.8007017001800862 TEST: 0.8007699197382421 random_state: 258
5cv: 0.8025533771935007 TEST: 0.8361167864571495 random_state: 295
5cv: 0.8028324357694894 TEST: 0.819946770392592 random_state: 415
5cv: 0.8056434978148967 TEST: 0.8189091372397921 random_state: 420
5cv: 0.80158551984947 TEST: 0.8128405166452626 random_state: 434
5cv: 0.8006059276550104 TEST: 0.8330559057818929 random_state: 442
5cv: 0.8007112769300783 TEST: 0.8097953602804931 random_state: 457
5cv: 0.8009335835494602 TEST: 0.8018041867341114 random_state: 470


In [56]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=295)

XGB = XGBR(random_state=60)
CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
regressor = XGB.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.8025533771935007 TEST: 0.8361167864571495


In [57]:
score_5cv_all = []
for i in range(0, 200, 1):
    XGB =XGBR(random_state=i)
    score_5cv =cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass
score_max_5cv = max(score_5cv_all)
random_state_5cv = range(0, 200)[score_5cv_all.index(max(score_5cv_all))]

print("Best_5cv score：{}".format(score_max_5cv),
      "random_state_5cv:{}".format(random_state_5cv))


score_5cv_all = []
for i in np.arange(0.01, 0.5, 0.01):
    XGB = XGBR(learning_rate=i
              ,random_state=random_state_5cv)
    score_5cv = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass
score_max_5cv = max(score_5cv_all)
n_lr_5cv = np.arange(0.01,0.5,0.01)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "n_lr_5cv:{}".format(n_lr_5cv))


score_5cv_all = []
for i in range(1, 400, 1):
    XGB = XGBR(n_estimators=i
               ,learning_rate=n_lr_5cv
               ,random_state=random_state_5cv)
    score_5cv = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass
score_max_5cv = max(score_5cv_all)
n_est_5cv = range(1,400)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "n_est_5cv:{}".format(n_est_5cv))


score_5cv_all = []
for i in range(1, 300, 1):
    XGB = XGBR(n_estimators=n_est_5cv
               ,learning_rate=n_lr_5cv
               ,random_state=random_state_5cv
               ,max_depth=i)
    score_5cv = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass
score_max_5cv = max(score_5cv_all)
max_depth_5cv = range(1,300)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "max_depth_5cv:{}".format(max_depth_5cv))


score_5cv_all = []
for i in np.arange(0,5,0.05):
    XGB = XGBR(n_estimators=n_est_5cv
               ,learning_rate=n_lr_5cv
               ,random_state=random_state_5cv
               ,max_depth=max_depth_5cv
               ,gamma=i)
    score_5cv = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass
score_max_5cv = max(score_5cv_all)
max_gamma_5cv =  np.arange(0,5,0.05)[score_5cv_all.index(score_max_5cv)]
print("Best_5cv score：{}".format(score_max_5cv),
      "max_gamma_5cv:{}".format(max_gamma_5cv))


score_5cv_all = []
for i in np.arange(0,5,0.05):
    XGB = XGBR(n_estimators=n_est_5cv,
               learning_rate=n_lr_5cv
               ,random_state=random_state_5cv
               ,max_depth=max_depth_5cv
               ,gamma=max_gamma_5cv 
               ,alpha=i)
    score_5cv = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass
score_max_5cv = max(score_5cv_all)
max_alpha_5cv =  np.arange(0,5,0.05)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "max_alpha_5cv:{}".format(max_alpha_5cv))

Best_5cv score：0.8025533771935007 random_state_5cv:0
Best_5cv score：0.8102184166089602 n_lr_5cv:0.45
Best_5cv score：0.8221263748978883 n_est_5cv:306
Best_5cv score：0.8221263748978883 max_depth_5cv:6
Best_5cv score：0.8253411940834257 max_gamma_5cv:0.15000000000000002
Best_5cv score：0.825672539889554 max_alpha_5cv:0.05


In [58]:
XGB = XGBR(learning_rate=n_lr_5cv
           ,n_estimators=n_est_5cv
           ,random_state=random_state_5cv
           ,max_depth=max_depth_5cv
           ,gamma =max_gamma_5cv
           ,alpha = max_alpha_5cv)

CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(XGB, Xtrain, Ytrain, cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
print("r2_5cv:",CV_score,"rmse_5CV",rmse)

regressor = XGB.fit(Xtrain, Ytrain)
train_predictions = regressor.predict(Xtrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest,Ytest)
rmse = np.sqrt(mean_squared_error(Ytest,test_predictions))
print("r2_test:",score_test,"rmse_test",rmse)

r2_5cv: 0.825672539889554 rmse_5CV 13.42224788228552
r2_test: 0.8458090544066695 rmse_test 11.742712096216213


In [59]:
# 执行Y-Scrambling
perform_y_scrambling(XGB, Xtrain, Xtest, Ytrain, Ytest, num_iterations=10)

Original Model Score: 0.8458090544066695
Y Scrambling Average Score: -0.8511195698144913
Score Difference: 1.6969286242211608


# Explainable Boosting Machine（EBM）

In [60]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=12)
ebr = ExplainableBoostingRegressor(random_state=6)
CV_score = cross_val_score(ebr, Xtrain, Ytrain, cv=5).mean()
regressor = ebr.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.6124718650463287 TEST: 0.6308991760367249


In [62]:
for i in range(100):
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=i)
    ebr =  ExplainableBoostingRegressor(random_state=6)
    CV_score = cross_val_score(ebr, Xtrain, Ytrain, cv=5).mean()
    regressor = ebr.fit(Xtrain, Ytrain)
    score_test = regressor.score(Xtest,Ytest)
#     if CV < CV_score and test < score_test:
#         CV = CV_score
#         test = score_test
    if CV_score>0.63 and score_test>0.63:
        print("5cv:",CV_score,"TEST:",score_test,"random_state:",i)

5cv: 0.634317304976675 TEST: 0.6740435412553222 random_state: 5
5cv: 0.6374500184064864 TEST: 0.6456899193763934 random_state: 8
5cv: 0.631532548177364 TEST: 0.674037549110335 random_state: 9
5cv: 0.6360429577459036 TEST: 0.6613000029089929 random_state: 13
5cv: 0.6364666695738812 TEST: 0.6337277951041937 random_state: 14
5cv: 0.6334030906752446 TEST: 0.6634280639245795 random_state: 21
5cv: 0.6547397923340574 TEST: 0.6313180758365271 random_state: 22
5cv: 0.6361223581584087 TEST: 0.6691278055024961 random_state: 24
5cv: 0.6317899678466932 TEST: 0.6885324463647535 random_state: 28
5cv: 0.6328707082491747 TEST: 0.6772970051374081 random_state: 32
5cv: 0.6436919338023926 TEST: 0.6486483549666199 random_state: 33
5cv: 0.6337799438319013 TEST: 0.6636023177865984 random_state: 36
5cv: 0.6424844581023189 TEST: 0.6654110777886499 random_state: 38
5cv: 0.6307407976476076 TEST: 0.6458244685466057 random_state: 46
5cv: 0.6306788977589584 TEST: 0.6643222306684571 random_state: 49
5cv: 0.632146371

In [63]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=38)
ebr =  ExplainableBoostingRegressor(random_state=6)
CV_score = cross_val_score(ebr, Xtrain, Ytrain, cv=5).mean()
regressor = ebr.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.6424844581023189 TEST: 0.6654110777886499


In [64]:
score_5cv_all = []
for i in range(0, 100, 5):
    ebr =ExplainableBoostingRegressor(random_state=i)
    score_5cv =cross_val_score(ebr, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass
score_max_5cv = max(score_5cv_all)
random_state_5cv = range(0, 100, 5)[score_5cv_all.index(max(score_5cv_all))]

print("Best_5cv score：{}".format(score_max_5cv),
      "random_state_5cv:{}".format(random_state_5cv))

Best_5cv score：0.6483326270729425 random_state_5cv:45


In [65]:
ebr =ExplainableBoostingRegressor(random_state=38)

CV_score = cross_val_score(ebr, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(ebr, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
print("r2_5cv:",CV_score,"rmse_5CV",rmse)

regressor = ebr.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest,Ytest)
rmse = np.sqrt(mean_squared_error(Ytest,test_predictions))
print("r2_test:",score_test,"rmse_test",rmse)

r2_5cv: 0.641444107304211 rmse_5CV 18.98112394571316
r2_test: 0.647067594570337 rmse_test 18.773957108428586


In [66]:
# 执行Y-Scrambling
perform_y_scrambling(ebr, Xtrain, Xtest, Ytrain, Ytest, num_iterations=10)

Original Model Score: 0.647067594570337
Y Scrambling Average Score: 0.0027272477971141294
Score Difference: 0.6443403467732229


# KNN

In [67]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=60)

rfc = KNeighborsRegressor(n_neighbors=5)
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
regressor = rfc.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.4538817889495005 TEST: 0.5016984992177858


In [71]:
for i in range(500):
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=i)
    rfc = KNeighborsRegressor(n_neighbors=5)
    CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    regressor = rfc.fit(Xtrain, Ytrain)
    score_test = regressor.score(Xtest,Ytest)
#     if CV < CV_score and test < score_test:
#         CV = CV_score
#         test = score_test
    if CV_score>0.47 and score_test>0.47:
        print("5cv:",CV_score,"TEST:",score_test,"random_state:",i)

5cv: 0.47168781223873213 TEST: 0.48679914508725297 random_state: 119
5cv: 0.47044219434469065 TEST: 0.478696040682685 random_state: 140
5cv: 0.47665351858294613 TEST: 0.5240553484640588 random_state: 151
5cv: 0.4702287466474318 TEST: 0.4794985656922177 random_state: 321
5cv: 0.4878452956391851 TEST: 0.5103990898531245 random_state: 415
5cv: 0.4766611899937951 TEST: 0.4748143396814557 random_state: 472


In [72]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=415)

rfc = KNeighborsRegressor(n_neighbors=5)
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
regressor = rfc.fit(Xtrain, Ytrain)
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))

score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)
print("rmse_5CV",rmse)

5cv: 0.4878452956391851 TEST: 0.5103990898531245
rmse_5CV 22.714156654683194


In [73]:
score_5cv_all = []
for i in range(1, 11, 1):
    rfc =KNeighborsRegressor(n_neighbors=i)
    score_5cv =cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
n_neighbors_5cv = range(1, 11)[score_5cv_all.index(max(score_5cv_all))]

print("Best_5cv score：{}".format(score_max_5cv),
      "n_neighbors_5cv:{}".format(n_neighbors_5cv))


score_5cv_all = []
for i in ['uniform', 'distance']:
    rfc = KNeighborsRegressor(weights=i
                                , n_neighbors=n_neighbors_5cv)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
weights_5cv = ['uniform', 'distance'][score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "weights_5cv:{}".format(weights_5cv))

score_5cv_all = []
for i in ['brute', 'kd_tree']:
    rfc = KNeighborsRegressor(algorithm=i
                                , weights=weights_5cv
                                , n_neighbors=n_neighbors_5cv)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
algorithm_5cv = ['brute', 'kd_tree'][score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "algorithm_5cv:{}".format(algorithm_5cv))

score_5cv_all = []
for i in range(1, 1000, 1):
    rfc = KNeighborsRegressor(leaf_size=i
                                , algorithm=algorithm_5cv
                                , weights=weights_5cv
                                , n_neighbors=n_neighbors_5cv)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
leaf_size_5cv = range(10, 1000, 1)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "leaf_size_5cv:{}".format(leaf_size_5cv))

Best_5cv score：0.5779360943245891 n_neighbors_5cv:1
Best_5cv score：0.5779360943245891 weights_5cv:uniform
Best_5cv score：0.5779360943245891 algorithm_5cv:brute
Best_5cv score：0.5779360943245891 leaf_size_5cv:10


In [74]:
rfc = KNeighborsRegressor(leaf_size=leaf_size_5cv
                        , algorithm=algorithm_5cv
                        , weights=weights_5cv
                        , n_neighbors=n_neighbors_5cv)

CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
print("r2_5cv:",CV_score,"rmse_5CV",rmse)

regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest,Ytest)
rmse = np.sqrt(mean_squared_error(Ytest,test_predictions))
print("r2_test:",score_test,"rmse_test",rmse)

r2_5cv: 0.5779360943245891 rmse_5CV 20.60467300885505
r2_test: 0.6819023767074557 rmse_test 17.844423285570954


In [75]:
# 执行Y-Scrambling
perform_y_scrambling(rfc, Xtrain, Xtest, Ytrain, Ytest, num_iterations=10)

Original Model Score: 0.6819023767074557
Y Scrambling Average Score: -0.9939485034867385
Score Difference: 1.675850880194194


# SVM

In [76]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=60)
XGB = SVR()
CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
regressor = XGB.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.1331233664981059 TEST: 0.1827175157094384


In [79]:
for i in range(500):
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=i)
    XGB = SVR()
    CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    regressor = XGB.fit(Xtrain, Ytrain)
    score_test = regressor.score(Xtest,Ytest)
#     if CV < CV_score and test < score_test:
#         CV = CV_score
#         test = score_test
    if CV_score>0.14 and score_test>0.14:
        print("5cv:",CV_score,"TEST:",score_test,"random_state:",i)

5cv: 0.14200971638971777 TEST: 0.15114768498101938 random_state: 14
5cv: 0.14192590910629008 TEST: 0.15662990812128186 random_state: 27
5cv: 0.14167188112508194 TEST: 0.16070017004161774 random_state: 50
5cv: 0.14212678339383283 TEST: 0.15504479678421734 random_state: 91
5cv: 0.14764460071484004 TEST: 0.14350840725296843 random_state: 97
5cv: 0.14091325195421428 TEST: 0.1622868750303218 random_state: 156
5cv: 0.14638675696640419 TEST: 0.1480711022669131 random_state: 177
5cv: 0.14025941336105083 TEST: 0.1660545800308355 random_state: 181
5cv: 0.14840219010109224 TEST: 0.1630968876026958 random_state: 211
5cv: 0.14269808275195855 TEST: 0.1595964330002425 random_state: 222
5cv: 0.14316524366737982 TEST: 0.1590038362911005 random_state: 229
5cv: 0.14004368821323515 TEST: 0.19439123310691409 random_state: 265
5cv: 0.14429051893469363 TEST: 0.14919328804858334 random_state: 267
5cv: 0.14315432927836058 TEST: 0.1731936758964585 random_state: 275
5cv: 0.14272938170377972 TEST: 0.1753079699196

In [80]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=499)
XGB = SVR()
CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
regressor = XGB.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.14846351032651153 TEST: 0.18267895138296242


In [81]:
score_5cv_all = []
for i in ['linear', 'rbf',"poly","sigmoid"]:
    XGB =SVR(kernel=i)
    score_5cv =cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass
score_max_5cv = max(score_5cv_all)
kernel_5cv = ['linear', 'rbf',"poly","sigmoid"][score_5cv_all.index(max(score_5cv_all))]

print("Best_5cv score：{}".format(score_max_5cv),
      "kernel_5cv:{}".format(kernel_5cv))

score_5cv_all = []
for i in np.arange(0.1, 10, 0.1):
    XGB = SVR(C=i
               ,kernel=kernel_5cv)
    score_5cv = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass
score_max_5cv = max(score_5cv_all)
C_5cv = np.arange(0.1, 10, 0.1)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "C_5cv:{}".format(C_5cv))

Best_5cv score：0.286265510724063 kernel_5cv:linear
Best_5cv score：0.31138395669825886 C_5cv:0.1


In [82]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=499)
XGB = SVR(C=C_5cv
        ,kernel=kernel_5cv)

CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(XGB, Xtrain, Ytrain, cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
print("r2_5cv:",CV_score,"rmse_5CV",rmse)

regressor = XGB.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest,Ytest)
rmse = np.sqrt(mean_squared_error(Ytest,test_predictions))
print("r2_test:",score_test,"rmse_test",rmse)

r2_5cv: 0.31138395669825886 rmse_5CV 26.549424082648823
r2_test: 0.32657896481156967 rmse_test 25.03401187983252


In [83]:
# 执行Y-Scrambling
perform_y_scrambling(XGB, Xtrain, Xtest, Ytrain, Ytest, num_iterations=10)

Original Model Score: 0.32657896481156967
Y Scrambling Average Score: -0.10970082030556169
Score Difference: 0.4362797851171314


# LR

In [84]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=60)

rfc = Lasso(alpha=0.1)
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
regressor = rfc.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.3401901253973431 TEST: 0.344987287802463


In [87]:
for i in range(500):
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=i)
    rfc = Lasso(alpha=0.1)
    CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    regressor = rfc.fit(Xtrain, Ytrain)
    score_test = regressor.score(Xtest,Ytest)
#     if CV < CV_score and test < score_test:
#         CV = CV_score
#         test = score_test
    if CV_score>0.36 and score_test>0.36:
        print("5cv:",CV_score,"TEST:",score_test,"random_state:",i)

5cv: 0.3607191522003498 TEST: 0.3634255073202721 random_state: 183
5cv: 0.3614334813185254 TEST: 0.3640918905979088 random_state: 423


In [91]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=423)

rfc = Lasso(alpha=0.1)
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
regressor = rfc.fit(Xtrain, Ytrain)
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)
print("rmse_5CV",rmse)

5cv: 0.3614334813185254 TEST: 0.3640918905979088
rmse_5CV 25.405343799572815


In [88]:
score_5cv_all = []
for i in np.arange(0.01, 1, 0.01):
    rfc =Lasso(alpha=i)
    score_5cv =cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
alpha_5cv = np.arange(0.01, 1, 0.01)[score_5cv_all.index(max(score_5cv_all))]
print("Best_5cv score：{}".format(score_max_5cv),
      "alpha_5cv:{}".format(alpha_5cv))

Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 6.007e+02, tolerance: 1.570e+02
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.866e+03, tolerance: 1.553e+02
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.597e+02, tolerance: 1.545e+02
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.902e+02, tolerance: 1.557e+02
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 6.897e+02, tolerance: 1.570e+02
Objective did n

Best_5cv score：0.3618760491333573 alpha_5cv:0.22


In [90]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=423)
rfc =Lasso(alpha=alpha_5cv)

CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
print("r2_5cv:",CV_score,"rmse_5CV",rmse)

regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest,Ytest)
rmse = np.sqrt(mean_squared_error(Ytest,test_predictions))
print("r2_test:",score_test,"rmse_test",rmse)

r2_5cv: 0.3614334813185254 rmse_5CV 25.405343799572815
r2_test: 0.3640918905979088 rmse_test 24.86485680131425


In [92]:
# 执行Y-Scrambling
perform_y_scrambling(rfc, Xtrain, Xtest, Ytrain, Ytest, num_iterations=10)

Original Model Score: 0.3640918905979088
Y Scrambling Average Score: -0.059157345038225694
Score Difference: 0.4232492356361345


# RR

In [93]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=60)
XGB = Ridge(alpha=27)
CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
regressor = XGB.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.3352158155728241 TEST: 0.33542490603665687


In [94]:
for i in range(500):
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=i)
    XGB = Ridge(alpha=27)
    CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    regressor = XGB.fit(Xtrain, Ytrain)
    score_test = regressor.score(Xtest,Ytest)
#     if CV < CV_score and test < score_test:
#         CV = CV_score
#         test = score_test
    if CV_score>0.35 and score_test>0.35:
        print("5cv:",CV_score,"TEST:",score_test,"random_state:",i)

5cv: 0.3509719384627783 TEST: 0.35494230918681535 random_state: 6
5cv: 0.3528749278102713 TEST: 0.3674555743931842 random_state: 22
5cv: 0.351810267204541 TEST: 0.3529353170153605 random_state: 24
5cv: 0.3545914473612277 TEST: 0.3855460728027639 random_state: 74
5cv: 0.3544101207805642 TEST: 0.35957420588694045 random_state: 86
5cv: 0.35392523000807796 TEST: 0.3535583446262225 random_state: 106
5cv: 0.3521760418045193 TEST: 0.353850085875681 random_state: 148
5cv: 0.35741507460253724 TEST: 0.358736164604768 random_state: 183
5cv: 0.3504242358069306 TEST: 0.3821686483268163 random_state: 283
5cv: 0.3506896630922309 TEST: 0.3510147529139287 random_state: 322
5cv: 0.3578574920729859 TEST: 0.37484879501150237 random_state: 349
5cv: 0.35177890820293917 TEST: 0.366744897265334 random_state: 350
5cv: 0.35628010898048446 TEST: 0.35047309303926155 random_state: 365
5cv: 0.3506738481681019 TEST: 0.35547367731737856 random_state: 383
5cv: 0.35043704711919094 TEST: 0.358629236321676 random_state: 

In [95]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=283)

XGB = Ridge(alpha=27)
CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
regressor = XGB.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.3504242358069306 TEST: 0.3821686483268163


In [96]:
score_5cv_all = []
for i in np.arange(1, 50, 1):
    XGB =Ridge(alpha=i)
    score_5cv =cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass
score_max_5cv = max(score_5cv_all)
alpha_5cv = np.arange(1, 50, 1)[score_5cv_all.index(max(score_5cv_all))]

print("Best_5cv score：{}".format(score_max_5cv),
      "alpha_5cv:{}".format(alpha_5cv))

Best_5cv score：0.3521308818263951 alpha_5cv:49


In [97]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=283)

XGB = Ridge(alpha=alpha_5cv)
CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(XGB, Xtrain, Ytrain, cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
print("r2_5cv:",CV_score,"rmse_5CV",rmse)

regressor = XGB.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest,Ytest)
rmse = np.sqrt(mean_squared_error(Ytest,test_predictions))
print("r2_test:",score_test,"rmse_test",rmse)

r2_5cv: 0.3521308818263951 rmse_5CV 25.173295552873277
r2_test: 0.3814845231210475 rmse_test 26.213251091092523


In [98]:
# 执行Y-Scrambling
perform_y_scrambling(XGB, Xtrain, Xtest, Ytrain, Ytest, num_iterations=10)

Original Model Score: 0.3814845231210475
Y Scrambling Average Score: -0.06863342804414801
Score Difference: 0.45011795116519554


# ENR

In [99]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=60)
XGB = ElasticNet(alpha=0.1)
CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
regressor = XGB.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.3435835432660671 TEST: 0.3406260311983924


In [101]:
for i in range(500):
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=i)
    XGB = XGB = ElasticNet(alpha=0.1)
    CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    regressor = XGB.fit(Xtrain, Ytrain)
    score_test = regressor.score(Xtest,Ytest)
#     if CV < CV_score and test < score_test:
#         CV = CV_score
#         test = score_test
    if CV_score>0.36 and score_test>0.36:
        print("5cv:",CV_score,"TEST:",score_test,"random_state:",i)

5cv: 0.360634070406904 TEST: 0.3640291534447416 random_state: 183
5cv: 0.36020160080888763 TEST: 0.3633678251838942 random_state: 365
5cv: 0.36044924671488604 TEST: 0.3667053119596144 random_state: 423


In [102]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=423)

XGB = ElasticNet(alpha=0.1)
CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
regressor = XGB.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.36044924671488604 TEST: 0.3667053119596144


In [103]:
score_5cv_all = []
for i in np.arange(0.01, 1, 0.01):
    XGB =ElasticNet(alpha=i)
    score_5cv =cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass
score_max_5cv = max(score_5cv_all)
alpha_5cv = np.arange(0.01, 1, 0.01)[score_5cv_all.index(max(score_5cv_all))]

print("Best_5cv score：{}".format(score_max_5cv),
      "alpha_5cv:{}".format(alpha_5cv))


score_5cv_all = []
for i in np.arange(0.1, 1, 0.1):
    XGB = ElasticNet(l1_ratio=i
              ,alpha=alpha_5cv)
    score_5cv = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass
score_max_5cv = max(score_5cv_all)
l1_ratio_5cv = np.arange(0.1, 1, 0.1)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "l1_ratio_5cv:{}".format(l1_ratio_5cv))

score_5cv_all = []
for i in np.arange(0.00001, 0.01, 0.00001):
    XGB = ElasticNet(tol=i
                ,l1_ratio=l1_ratio_5cv
                ,alpha=alpha_5cv)
    score_5cv = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass
score_max_5cv = max(score_5cv_all)
tol_5cv = np.arange(0.00001, 0.01, 0.00001)[score_5cv_all.index(score_max_5cv)]

print("Best_5cv score：{}".format(score_max_5cv),
      "tol_5cv:{}".format(tol_5cv))

Best_5cv score：0.360556055641655 alpha_5cv:0.08
Best_5cv score：0.3611513934528829 l1_ratio_5cv:0.9
Best_5cv score：0.3611716318487589 tol_5cv:0.0047


In [104]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=423)

XGB = ElasticNet(tol=tol_5cv
                ,l1_ratio=l1_ratio_5cv
                ,alpha=alpha_5cv)
CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(XGB, Xtrain, Ytrain, cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
print("r2_5cv:",CV_score,"rmse_5CV",rmse)

regressor = XGB.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest,Ytest)
rmse = np.sqrt(mean_squared_error(Ytest,test_predictions))
print("r2_test:",score_test,"rmse_test",rmse)

r2_5cv: 0.3611716318487589 rmse_5CV 25.410201868821172
r2_test: 0.36517682954836117 rmse_test 24.84363646496296


In [105]:
# 执行Y-Scrambling
perform_y_scrambling(XGB, Xtrain, Xtest, Ytrain, Ytest, num_iterations=10)

Original Model Score: 0.36517682954836117
Y Scrambling Average Score: -0.06272586539882401
Score Difference: 0.4279026949471852
