In [9]:
# import the necessary libraries to execute this code
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.model_selection import RandomizedSearchCV as RSCV
import pickle
from lightgbm import LGBMRegressor

# NESTED_CV for the reduced feature model (LGBM)

In [66]:
datafile = "Dataset_15_feat.xlsx"
df = pd.read_excel(datafile)
          
model = LGBMRegressor(random_state=4)
p_grid ={"n_estimators":[100,150,200,250,300,400,500,600],
        'boosting_type': ['gbdt', 'dart', 'goss'],
        'num_leaves':[16,32,64,128,256],
        'learning_rate':[0.1,0.01,0.001,0.0001],
        'min_child_weight': [0.001,0.01,0.1,1.0,10.0],
        'subsample': [0.4,0.6,0.8,1.0],
        'min_child_samples':[2,10,20,40,100],
        'reg_alpha': [0, 0.005, 0.01, 0.015],
        'reg_lambda': [0, 0.005, 0.01, 0.015]}
        
X = df.drop(['Experimental_index','DP_Group','Release'],axis='columns')
stdScale = StandardScaler().fit(X)
X=stdScale.transform(X)
Y = df['Release']
G = df['DP_Group']
E = df['Experimental_index']
T = df['Time']    

In [68]:
NUM_TRIALS = 10

itr_number = [] # create new empty list for itr number 
outer_results = []
inner_results = []
model_params = []
G_test_list = []
y_test_list = []
E_test_list = []
T_test_list = []
pred_list = []

for i in range(NUM_TRIALS): #configure the cross-validation procedure - outer loop (test set) 
    
    cv_outer = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=i) #hold back 20% of the groups for test set
    
    # split data using GSS
    for train_index, test_index in cv_outer.split(X, Y, G):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        G_train, G_test = G[train_index], G[test_index]
        E_train, E_test = E[train_index], E[test_index]
        T_train, T_test = T[train_index], T[test_index]

        # store test set information
        G_test = np.array(G_test) #prevents index from being brought from dataframe
        G_test_list.append(G_test)
        E_test = np.array(E_test) #prevents index from being brought from dataframe
        E_test_list.append(E_test)
        T_test = np.array(T_test) #prevents index from being brought from dataframe
        T_test_list.append(T_test)
        y_test = np.array(y_test) #prevents index from being brought from dataframe
        y_test_list.append(y_test)

        # configure the cross-validation procedure - inner loop (validation set/HP optimization)
        cv_inner = GroupKFold(n_splits=10) #should be 10 fold group split for inner loop

        # define search space
        search = RSCV(model, p_grid, n_iter=100, verbose=0, scoring='neg_mean_absolute_error', n_jobs= 6, cv=cv_inner, refit=True) # should be 100

        # execute search
        result = search.fit(X_train, y_train, groups=G_train)

        # get the best performing model fit on the whole training set
        best_model = result.best_estimator_
        
        # get the score for the best performing model and store
        best_score = abs(result.best_score_)
        inner_results.append(best_score)

        # evaluate model on the hold out dataset
        yhat = np.round(best_model.predict(X_test), 3)

        
        # store drug release predictions
        pred_list.append(yhat)

        # evaluate the model
        acc = mean_absolute_error(y_test, yhat)

        # store the result
        itr_number.append(i+1)
        outer_results.append(acc)
        model_params.append(result.best_params_)

        # report progress at end of each inner loop
        print('\n################################################################\n\nSTATUS REPORT:') 
        print('Iteration '+str(i+1)+' of '+str(NUM_TRIALS)+' completed') 
        print('Test_Score: %.3f, Best_Valid_Score: %.3f, \n\nBest_Model_Params: \n%s' % (acc, best_score, result.best_params_))
        print("\n################################################################\n ")





################################################################

STATUS REPORT:
Iteration 1 of 10 completed
Test_Score: 0.084, Best_Valid_Score: 0.132, 

Best_Model_Params: 
{'subsample': 0.6, 'reg_lambda': 0, 'reg_alpha': 0.005, 'num_leaves': 64, 'n_estimators': 500, 'min_child_weight': 0.1, 'min_child_samples': 2, 'learning_rate': 0.1, 'boosting_type': 'goss'}

################################################################
 





################################################################

STATUS REPORT:
Iteration 2 of 10 completed
Test_Score: 0.124, Best_Valid_Score: 0.126, 

Best_Model_Params: 
{'subsample': 1.0, 'reg_lambda': 0.01, 'reg_alpha': 0.005, 'num_leaves': 16, 'n_estimators': 500, 'min_child_weight': 1.0, 'min_child_samples': 20, 'learning_rate': 0.1, 'boosting_type': 'dart'}

################################################################
 





################################################################

STATUS REPORT:
Iteration 3 of 10 completed
Test_Score: 0.108, Best_Valid_Score: 0.125, 

Best_Model_Params: 
{'subsample': 0.8, 'reg_lambda': 0.005, 'reg_alpha': 0.005, 'num_leaves': 32, 'n_estimators': 600, 'min_child_weight': 0.1, 'min_child_samples': 100, 'learning_rate': 0.1, 'boosting_type': 'dart'}

################################################################
 





################################################################

STATUS REPORT:
Iteration 4 of 10 completed
Test_Score: 0.119, Best_Valid_Score: 0.142, 

Best_Model_Params: 
{'subsample': 0.4, 'reg_lambda': 0.01, 'reg_alpha': 0.01, 'num_leaves': 32, 'n_estimators': 250, 'min_child_weight': 0.1, 'min_child_samples': 10, 'learning_rate': 0.1, 'boosting_type': 'gbdt'}

################################################################
 





################################################################

STATUS REPORT:
Iteration 5 of 10 completed
Test_Score: 0.111, Best_Valid_Score: 0.125, 

Best_Model_Params: 
{'subsample': 0.6, 'reg_lambda': 0.01, 'reg_alpha': 0.005, 'num_leaves': 16, 'n_estimators': 400, 'min_child_weight': 0.001, 'min_child_samples': 40, 'learning_rate': 0.1, 'boosting_type': 'dart'}

################################################################
 





################################################################

STATUS REPORT:
Iteration 6 of 10 completed
Test_Score: 0.113, Best_Valid_Score: 0.129, 

Best_Model_Params: 
{'subsample': 0.6, 'reg_lambda': 0, 'reg_alpha': 0.01, 'num_leaves': 64, 'n_estimators': 100, 'min_child_weight': 0.01, 'min_child_samples': 20, 'learning_rate': 0.1, 'boosting_type': 'gbdt'}

################################################################
 





################################################################

STATUS REPORT:
Iteration 7 of 10 completed
Test_Score: 0.122, Best_Valid_Score: 0.122, 

Best_Model_Params: 
{'subsample': 1.0, 'reg_lambda': 0.015, 'reg_alpha': 0.015, 'num_leaves': 16, 'n_estimators': 300, 'min_child_weight': 1.0, 'min_child_samples': 20, 'learning_rate': 0.1, 'boosting_type': 'dart'}

################################################################
 





################################################################

STATUS REPORT:
Iteration 8 of 10 completed
Test_Score: 0.097, Best_Valid_Score: 0.133, 

Best_Model_Params: 
{'subsample': 0.6, 'reg_lambda': 0.01, 'reg_alpha': 0, 'num_leaves': 16, 'n_estimators': 200, 'min_child_weight': 0.1, 'min_child_samples': 20, 'learning_rate': 0.1, 'boosting_type': 'goss'}

################################################################
 





################################################################

STATUS REPORT:
Iteration 9 of 10 completed
Test_Score: 0.140, Best_Valid_Score: 0.121, 

Best_Model_Params: 
{'subsample': 0.8, 'reg_lambda': 0.015, 'reg_alpha': 0.005, 'num_leaves': 64, 'n_estimators': 150, 'min_child_weight': 10.0, 'min_child_samples': 40, 'learning_rate': 0.1, 'boosting_type': 'goss'}

################################################################
 





################################################################

STATUS REPORT:
Iteration 10 of 10 completed
Test_Score: 0.173, Best_Valid_Score: 0.119, 

Best_Model_Params: 
{'subsample': 0.8, 'reg_lambda': 0.015, 'reg_alpha': 0.015, 'num_leaves': 16, 'n_estimators': 500, 'min_child_weight': 0.001, 'min_child_samples': 40, 'learning_rate': 0.1, 'boosting_type': 'goss'}

################################################################
 


In [69]:
#create dataframe with results of nested CV
list_of_tuples = list(zip(itr_number, inner_results, outer_results, model_params, G_test_list, E_test_list, T_test_list, y_test_list, pred_list))
CV_dataset = pd.DataFrame(list_of_tuples, columns = ['Iter', 'Valid Score', 'Test Score', 'Model Parms', 'DP_Groups', "Experimental Index", "Time", 'Experimental_Release', 'Predicted_Release'])
CV_dataset['Score_difference'] = abs(CV_dataset['Valid Score'] - CV_dataset['Test Score']) #Groupby dataframe model iterations that best fit the data (i.e., validitaion <= test)
CV_dataset.sort_values(by=['Score_difference', 'Test Score'], ascending=True, inplace=True) 
CV_dataset = CV_dataset.reset_index(drop=True) # Reset index of dataframe
CV_dataset.to_pickle("NESTED_CV_RESULTS/LGBM_15_feat.pkl", compression='infer', protocol=5, storage_options=None) # save dataframe as pickle file
CV_dataset.describe()

Unnamed: 0,Iter,Valid Score,Test Score,Score_difference
count,10.0,10.0,10.0,10.0
mean,5.5,0.127273,0.119088,0.022861
std,3.02765,0.006875,0.024174,0.017834
min,1.0,0.118771,0.084377,0.000537
25%,3.25,0.122468,0.108564,0.01457
50%,5.5,0.125436,0.11597,0.018124
75%,7.75,0.130987,0.123421,0.032702
max,10.0,0.141841,0.17277,0.053999


In [71]:
best_model_params = CV_dataset.iloc[0,3] # assign the best model paramaters
LGBM_15 = model.set_params(**best_model_params) # set params from the best model
LGBM_15 = LGBM_15.fit(X, Y)
with open('Trained_models/15_feat_LGBM_model.pkl', 'wb') as file: # Save the Model to pickle file
          pickle.dump(LGBM_15, file)
LGBM_15

LGBMRegressor(boosting_type='dart', min_child_weight=1.0, n_estimators=300,
              num_leaves=16, random_state=4, reg_alpha=0.015, reg_lambda=0.015)