In [1]:
# import the necessary libraries to execute this code
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr
from sklearn.base import clone
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler

# Impact of input features on model performance - Random Forest (RF)

In [8]:
# Load the dataset; besides the input features it holds the
# 'Experimental_index', 'DP_Group' and 'Release' columns.
datafile = "Dataset_14_feat.xlsx"
df = pd.read_excel(datafile)

# Load the previously pickled model.
# NOTE: pickle.load can execute arbitrary code -- only open model files from a trusted source.
with open('Trained_models/14_feat_RF_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [9]:
# Columns that are not model inputs: sample identifier, grouping variable and target.
non_feature_columns = ['Experimental_index', 'DP_Group', 'Release']

# Unscaled feature table; kept as a DataFrame so the column names remain available.
X_features = df.drop(columns=non_feature_columns)

# Standardize the features. The scaler is fitted on the whole dataset, before any
# train/test split is made.
stdScale = StandardScaler().fit(X_features)
X = stdScale.transform(X_features)

Y = df['Release']   # target passed to the model
G = df['DP_Group']  # group labels used for group-wise splitting

In [10]:
# Ward_linkage
# Cluster the input features hierarchically, using a distance derived from
# their pairwise Spearman rank correlations.
corr = spearmanr(X).correlation  # Spearman correlation matrix between the columns of X
corr = 0.5 * (corr + corr.T)  # average with the transpose so the matrix is exactly symmetric
corr[np.diag_indices_from(corr)] = 1  # force the diagonal to exactly 1
distance_matrix = 1 - np.absolute(corr)  # |correlation| near 1 -> distance near 0
condensed_distances = squareform(distance_matrix)  # square matrix -> condensed distance vector
dist_linkage = hierarchy.linkage(condensed_distances, method="ward")  # Ward's linkage for hierarchical clustering

In [11]:
# evaluate_model iterations
# For each Ward-linkage distance threshold, keep the first feature of every
# feature cluster, refit the model on repeated group-wise train/test splits and
# record the test MAE.
N_THRESHOLDS = 14  # thresholds swept are n / N_THRESHOLDS for n = 0 .. N_THRESHOLDS - 1
N_REPEATS = 10  # number of train/test splits evaluated per threshold

MAE_list = []  # mean test MAE per threshold
std_list = []  # standard deviation of the test MAE per threshold
test_test_list = []  # individual per-split test MAE values per threshold
feature_name_list = []  # names of the features used at each threshold
feature_number_list = []  # number of features used at each threshold
linkage_distance_list = []  # Ward's linkage distance threshold of each iteration

# The splits depend only on (X, Y, G) and the seed, not on the selected
# features, so they are generated once and reused for every threshold.
# Each split holds back 20% of the drug-polymer groups as the test set.
splits = [
    next(GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=i + 1).split(X, Y, G))
    for i in range(N_REPEATS)
]

for n in range(N_THRESHOLDS):
    linkage_distance = n / N_THRESHOLDS  # distance threshold for this iteration (n/14)

    # cut the dendrogram at the threshold and collect the feature indices of each cluster
    cluster_ids = hierarchy.fcluster(dist_linkage, linkage_distance, criterion="distance")
    cluster_id_to_feature_ids = defaultdict(list)
    for idx, cluster_id in enumerate(cluster_ids):
        cluster_id_to_feature_ids[cluster_id].append(idx)

    # one representative per cluster: the first feature assigned to it
    selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]
    tested_features = [X_features.columns[feature] for feature in selected_features]

    linkage_distance_list.append(linkage_distance)
    feature_number_list.append(len(tested_features))
    feature_name_list.append(tested_features)

    test_list = []  # test MAE of each split for the current feature subset

    for train_index, test_index in splits:
        # the splitter returns positional indices, so the Series is indexed with .iloc
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

        X_train_sel = X[train_index][:, selected_features]  # training rows, selected feature columns
        X_test_sel = X[test_index][:, selected_features]  # test rows, selected feature columns

        # fit an unfitted copy with the same hyper-parameters so the loaded
        # `model` object is not overwritten by the refits
        clf_sel = clone(model)
        clf_sel.fit(X_train_sel, y_train)
        y_pred = clf_sel.predict(X_test_sel)
        test_list.append(round(mean_absolute_error(y_test, y_pred), 3))  # MAE rounded to 3 decimals

    MAE_list.append(np.mean(test_list))
    std_list.append(np.std(test_list))
    test_test_list.append(test_list)

    print('\n################################################################\n\nSTATUS REPORT:')
    print('Iteration '+str(n+1)+' of '+str(N_THRESHOLDS)+' completed')
    print('Test_Score: %.3f' % (np.mean(test_list)))
    print("\n################################################################\n ")


################################################################

STATUS REPORT:
Iteration 1 of 14 completed
Test_Score: 0.160

################################################################
 

################################################################

STATUS REPORT:
Iteration 2 of 14 completed
Test_Score: 0.161

################################################################
 

################################################################

STATUS REPORT:
Iteration 3 of 14 completed
Test_Score: 0.161

################################################################
 

################################################################

STATUS REPORT:
Iteration 4 of 14 completed
Test_Score: 0.161

################################################################
 

################################################################

STATUS REPORT:
Iteration 5 of 14 completed
Test_Score: 0.162

################################################################
 

###################

In [12]:
# results dataframe
# One row per linkage-distance threshold, assembled from the lists filled in the evaluation loop.
result_columns = ['# of Features', 'Feature names', 'MAE', 'std', 'test_values', 'linkage distance']
result_rows = zip(
    feature_number_list,
    feature_name_list,
    MAE_list,
    std_list,
    test_test_list,
    linkage_distance_list,
)
list_of_tuples = list(result_rows)  # one tuple per threshold, in column order
results_df = pd.DataFrame(list_of_tuples, columns=result_columns)
results_df

Unnamed: 0,# of Features,Feature names,MAE,std,test_values,linkage distance
0,14,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1603,0.033191,"[0.151, 0.165, 0.118, 0.153, 0.14, 0.139, 0.17...",0.0
1,13,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1612,0.032012,"[0.15, 0.164, 0.13, 0.152, 0.14, 0.135, 0.172,...",0.071429
2,11,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1614,0.02674,"[0.15, 0.168, 0.133, 0.155, 0.141, 0.141, 0.17...",0.142857
3,11,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1614,0.02674,"[0.15, 0.168, 0.133, 0.155, 0.141, 0.141, 0.17...",0.214286
4,10,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1615,0.027833,"[0.145, 0.167, 0.132, 0.153, 0.146, 0.14, 0.17...",0.285714
5,9,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1688,0.030096,"[0.145, 0.166, 0.147, 0.167, 0.152, 0.143, 0.1...",0.357143
6,9,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.1688,0.030096,"[0.145, 0.166, 0.147, 0.167, 0.152, 0.143, 0.1...",0.428571
7,8,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.175,0.032131,"[0.137, 0.169, 0.157, 0.17, 0.157, 0.157, 0.19...",0.5
8,8,"[LA/GA, Polymer_MW, CL Ratio, Drug_Tm, Drug_Pk...",0.175,0.032131,"[0.137, 0.169, 0.157, 0.17, 0.157, 0.157, 0.19...",0.571429
9,7,"[LA/GA, Polymer_MW, Drug_Tm, Drug_Pka, Drug_Mw...",0.1789,0.031198,"[0.148, 0.171, 0.159, 0.17, 0.173, 0.152, 0.20...",0.642857


In [13]:
# Write the results table to an Excel workbook in the working directory.
results_file = "RF_model_refinement_results.xlsx"
results_df.to_excel(results_file)