In [2]:
# import the necessary libraries to execute this code
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr
from sklearn.base import clone
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler

# Impact of input features on model performance - LightGBM

In [3]:
# Read the dataset workbook into a DataFrame
datafile = "Dataset_17_feat.xlsx"
df = pd.read_excel(datafile)

# Restore the previously pickled model object.
# NOTE: unpickling can execute arbitrary code - only load model files from a trusted source.
model_path = 'Trained_models/17_feat_LGBM_model.pkl'
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

In [9]:
# Load the pickled nested-CV results file.
# The object is bound to its own name instead of `df`: `df` holds the dataset that the
# feature-preparation cell (df.drop([...]), df['Release'], df['DP_Group']) relies on, and
# rebinding it here would break that cell when the notebook is run from top to bottom.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open('NESTED_CV_RESULTS/17_feat_LGBM.pkl', 'rb') as file:
    nested_cv_results = pickle.load(file)
    

{'subsample': 0.8,
 'reg_lambda': 0,
 'reg_alpha': 0,
 'num_leaves': 16,
 'n_estimators': 250,
 'min_child_weight': 0.01,
 'min_child_samples': 40,
 'learning_rate': 0.1,
 'boosting_type': 'dart'}

In [3]:
# Columns of `df` that are not model inputs (identifier, grouping key and target)
NON_FEATURE_COLUMNS = ['Experimental_index', 'DP_Group', 'Release']

# Unscaled feature table; kept as a DataFrame so the column names can be looked up later
X_features = df.drop(columns=NON_FEATURE_COLUMNS)

# Standardise the features; X is the array returned by the scaler's transform.
# NOTE(review): the scaler is fitted on the complete dataset, i.e. before any train/test split.
stdScale = StandardScaler().fit(X_features)
X = stdScale.transform(X_features)

Y = df['Release']   # prediction target
G = df['DP_Group']  # group labels used for group-wise splitting

In [4]:
# Ward linkage computed on the Spearman correlation between the columns of X
spearman_result = spearmanr(X)
corr = spearman_result.correlation  # Spearman rank-correlation matrix

# Average the matrix with its transpose so it is exactly symmetric, and pin the diagonal to 1
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1)

# Turn correlation into a distance: |corr| = 1 -> distance 0, corr = 0 -> distance 1
distance_matrix = 1 - np.abs(corr)

# squareform converts the square distance matrix to the condensed form passed to ward
condensed_distances = squareform(distance_matrix)
dist_linkage = hierarchy.ward(condensed_distances)

In [5]:
# evaluate_model iterations
# For every linkage-distance threshold the features are grouped into flat clusters, the
# first feature of each cluster is kept, and a fresh copy of the model is trained and
# scored (MAE) on repeated group-wise train/test splits.
N_LINKAGE_STEPS = 17  # thresholds evaluated are n / N_LINKAGE_STEPS for n = 0 .. N_LINKAGE_STEPS - 1
N_REPEATS = 10        # number of random group-shuffle splits per threshold
TEST_SIZE = 0.2       # fraction of the groups held back as test set in every split

MAE_list = []  # mean MAE over the repeated splits, one value per threshold
std_list = []  # standard deviation of the MAE over the repeated splits, one value per threshold
test_test_list = []  # the individual per-split MAE values, one list per threshold
feature_name_list = []  # names of the features used at each threshold
feature_number_list = []  # number of features used at each threshold
linkage_distance_list = []  # Ward's linkage distance threshold of each iteration

for n in range(N_LINKAGE_STEPS):
    linkage_distance = n / N_LINKAGE_STEPS

    # cut the hierarchy at the current distance threshold to get one cluster id per feature
    cluster_ids = hierarchy.fcluster(dist_linkage, linkage_distance, criterion="distance")

    # map every cluster id to the column indices of the features it contains
    cluster_id_to_feature_ids = defaultdict(list)
    for idx, cluster_id in enumerate(cluster_ids):
        cluster_id_to_feature_ids[cluster_id].append(idx)

    # keep only the first feature of every cluster
    selected_features = [members[0] for members in cluster_id_to_feature_ids.values()]
    tested_features = [X_features.columns[feature] for feature in selected_features]

    linkage_distance_list.append(linkage_distance)
    feature_number_list.append(len(tested_features))
    feature_name_list.append(tested_features)

    test_list = []  # MAE of every repeated split for the current feature subset

    for i in range(N_REPEATS):
        # one group-wise split per repeat; the seed changes with i so the repeats differ
        cv_outer = GroupShuffleSplit(n_splits=1, test_size=TEST_SIZE, random_state=i + 1)

        for train_index, test_index in cv_outer.split(X, Y, G):
            X_train, X_test = X[train_index], X[test_index]
            # the splitter yields positional indices, so index the Series by position
            # (.iloc) rather than by label
            y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

            X_train_sel = X_train[:, selected_features]  # training rows, selected feature columns
            X_test_sel = X_test[:, selected_features]  # test rows, selected feature columns

            # fit an unfitted copy with the same hyper-parameters so the loaded
            # `model` object itself is never refitted / overwritten
            clf_sel = clone(model)
            clf_sel.fit(X_train_sel, y_train)
            y_pred = clf_sel.predict(X_test_sel)

            # documented argument order is (y_true, y_pred); each MAE is rounded to 3 decimals
            test_list.append(round(mean_absolute_error(y_test, y_pred), 3))

    mean_mae = np.mean(test_list)
    MAE_list.append(mean_mae)
    std_list.append(np.std(test_list))
    test_test_list.append(test_list)

    print('\n################################################################\n\nSTATUS REPORT:')
    print('Iteration ' + str(n + 1) + ' of ' + str(N_LINKAGE_STEPS) + ' completed')
    print('Test_Score: %.3f' % (mean_mae))
    print("\n################################################################\n ")


################################################################

STATUS REPORT:
Iteration 1 of 17 completed
Test_Score: 0.116

################################################################
 

################################################################

STATUS REPORT:
Iteration 2 of 17 completed
Test_Score: 0.116

################################################################
 

################################################################

STATUS REPORT:
Iteration 3 of 17 completed
Test_Score: 0.142

################################################################
 

################################################################

STATUS REPORT:
Iteration 4 of 17 completed
Test_Score: 0.143

################################################################
 

################################################################

STATUS REPORT:
Iteration 5 of 17 completed
Test_Score: 0.143

################################################################
 

###################

In [11]:
# Collect the per-iteration results into one table (one row per linkage-distance threshold)
result_columns = ['# of Features', 'Feature names', 'MAE', 'std', 'test_values', 'linkage distance']

# one tuple per iteration, in the same order as result_columns
list_of_tuples = [
    row
    for row in zip(
        feature_number_list,
        feature_name_list,
        MAE_list,
        std_list,
        test_test_list,
        linkage_distance_list,
    )
]

results_df = pd.DataFrame(list_of_tuples, columns=result_columns)
results_df

Unnamed: 0,# of Features,Feature names,MAE,std,test_values,linkage distance
0,17,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1158,0.018054,"[0.125, 0.098, 0.086, 0.113, 0.122, 0.118, 0.0...",0.0
1,15,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1155,0.016806,"[0.126, 0.1, 0.086, 0.113, 0.118, 0.12, 0.094,...",0.058824
2,13,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1415,0.017253,"[0.144, 0.126, 0.108, 0.137, 0.154, 0.134, 0.1...",0.117647
3,12,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1433,0.017275,"[0.144, 0.135, 0.106, 0.134, 0.155, 0.138, 0.1...",0.176471
4,12,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1433,0.017275,"[0.144, 0.135, 0.106, 0.134, 0.155, 0.138, 0.1...",0.235294
5,11,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1434,0.018139,"[0.145, 0.131, 0.106, 0.138, 0.155, 0.135, 0.1...",0.294118
6,10,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1429,0.01862,"[0.137, 0.132, 0.106, 0.13, 0.159, 0.137, 0.14...",0.352941
7,10,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1429,0.01862,"[0.137, 0.132, 0.106, 0.13, 0.159, 0.137, 0.14...",0.411765
8,10,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1429,0.01862,"[0.137, 0.132, 0.106, 0.13, 0.159, 0.137, 0.14...",0.470588
9,9,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1392,0.021655,"[0.124, 0.118, 0.109, 0.128, 0.144, 0.134, 0.1...",0.529412


In [12]:
# Write the results table to an Excel workbook in the working directory.
# NOTE(review): the file name says "15" while the dataset and model files loaded in this
# notebook are named "17_feat" - confirm the intended output name.
results_path = "LGBM_15_model_refinement_results.xlsx"
results_df.to_excel(results_path)