In [13]:
import pandas as pd
import numpy as np
import random
from matplotlib.pyplot import pie, axis, show
import seaborn as sns
import missingno as msno
from scipy import stats
import matplotlib.pyplot as plt
import yaml
import sys
import os

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFE, SelectKBest, f_regression, mutual_info_regression
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
import statsmodels.regression.linear_model as sm
from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, CompoundKernel
import sklearn_relief as sr
from skrebate import ReliefF
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import ElasticNet
import lightgbm as ltb
from sklearn.svm import SVR
from scipy.stats import ks_2samp
from tabulate import tabulate

from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.over_sampling import BorderlineSMOTE

from sklearn.multioutput import RegressorChain, MultiOutputRegressor
from sklearn.exceptions import DataConversionWarning

import tensorflow as tf
from tensorflow import keras
import warnings

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../'))) 

from helper import preprocess, get_test_train_data, get_features_kbest, get_features_ref, get_features_ref_multiout,\
    get_features_relieff, get_model_name, cross_val, get_scores, outlier_detect, train_models, pred_all,\
    find_lowest_respponse_value, find_highest_respponse_value, find_closest_to_42, check_aggreement,\
    get_concordant_discordant, print_change_mean, get_perc, percentage_change_original_data, calculate_percentage_change,\
    calculate_percentage_change_othre_responses, calculate_count_diff, calculate_change_diff,\
    drug_class_outlier_remove, plot_scatter_with_CI, plot_scatter

from constants import TRAIN_PATH_WO_LDL_IMPUTATION, TEST_PATH_WO_LDL_IMPUTATION, COMMON_VARIABLE_PATH
warnings.filterwarnings('ignore')

%matplotlib inline

# Mode flag read by run(): when True, run() restricts the test set to the rows
# whose drug_class is 0.25 or 0.375.
is_train_with_all = False

# Shared notebook settings are stored in a YAML file one directory up.
with open('../' + COMMON_VARIABLE_PATH, 'r') as file:
    common_data = yaml.safe_load(file)

response_variable_list = common_data['response_variable_list']
correlated_variables = common_data['correlated_variables']
thresh = common_data['thresh']

keep = []
rem = []

# Columns that run() always adds to the selected feature set.
items = ['drug_class']

# drug_class codes used to split rows into SGLT and DPP strata.
# NOTE(review): 0.375 / 0.25 look like scaled class codes -- confirm against
# the preprocessing step.
sglt_val, dpp_val = (0.375, 0.25) if is_train_with_all else (1, 0)

 
    
def check_distribution(df, df_act, response_variable, predicted_change):
    """Find z-score outliers in the observed and the predicted column.

    A row is an outlier when the absolute z-score of its value (using the
    column mean and the standard deviation as computed by ``np.std``) is
    greater than 3.

    Args:
        df: frame holding the ``predicted_change`` column.
        df_act: frame holding the ``response_variable`` column.
        response_variable: column of ``df_act`` to screen.
        predicted_change: column of ``df`` to screen.

    Returns:
        ``(outliers_act, outliers_pred)``: lists of index labels of the
        outlying rows in ``df_act`` and ``df`` respectively.
    """

    def _outlier_labels(frame, column):
        # Index labels of the rows whose |z-score| in `column` exceeds 3.
        values = frame[column]
        z_scores = (values - np.mean(values)) / np.std(values)
        return frame[z_scores.abs() > 3].index.to_list()

    return (
        _outlier_labels(df_act, response_variable),
        _outlier_labels(df, predicted_change),
    )
    


def drug_class_visualization(df, df_act, response_variable, predicted_change, assigned_drug, baseline_val):
    """Return the model-assigned stratum without its prediction outliers.

    Both frames are narrowed to the five columns of interest, the outliers
    of ``predicted_change`` in ``df`` are located with ``check_distribution``
    and those rows are dropped from the narrowed copy of ``df``.

    Args:
        df: rows assigned to one drug class by the model.
        df_act: rows that actually received that drug class.
        response_variable: name of the observed outcome column.
        predicted_change: name of the predicted outcome column.
        assigned_drug: name of the column holding the assigned drug class.
        baseline_val: name of the baseline measurement column.

    Returns:
        A frame with the columns ``response_variable``, ``predicted_change``,
        ``'drug_class'``, ``assigned_drug`` and ``baseline_val`` of ``df``,
        minus the rows flagged as prediction outliers.

    Raises:
        KeyError: if any of the five columns is missing from ``df`` or
            ``df_act``.
    """
    columns = [response_variable, predicted_change, 'drug_class', assigned_drug, baseline_val]
    predicted_stratum = df[columns]
    # df_act is narrowed as well so a missing column is still reported for it.
    actual_stratum = df_act[columns]

    # Only the prediction outliers matter here: the observed-value outliers
    # belong to df_act, which is not part of the returned frame.
    _, outliers_pred = check_distribution(
        predicted_stratum, actual_stratum, response_variable, predicted_change)

    return predicted_stratum.drop(outliers_pred)

def train_models(models, X_test, Y_test, X_train, Y_train, train, scaler, X_test_original):
    """Fit the models, assign a drug class per test row and print the reports.

    NOTE: this definition shadows the ``train_models`` imported from
    ``helper`` at the top of the notebook.

    Steps:
      1. Every model is passed through ``cross_val``, refitted on the full
         training data and scored with ``get_scores``.
      2. The LAST fitted model predicts, for every test row, the outcomes
         under SGLT and under DPP (``pred_all``); per outcome the preferred
         drug and its predicted value are stored on the row.
      3. The test features are mapped back with ``scaler.inverse_transform``
         and joined with the observed responses and the assignments.
      4. Rows are split into assigned/actual strata, prediction outliers are
         removed, and concordant/discordant groups plus percentage changes
         are reported per outcome.

    Args:
        models: non-empty list of estimators; a model whose name (per
            ``get_model_name``) is ``'Sequential'`` is compiled and fitted
            with Keras-style arguments.
        X_test, Y_test: test features (must contain ``'drug_class'``) and
            test responses.
        X_train, Y_train: training features and responses; ``Y_train`` must
            be a DataFrame with at least four columns whose positions 0..3
            correspond to hba1c, ldl, hdl and bmi.
        train: training frame forwarded to ``cross_val``.
        scaler: fitted scaler providing ``inverse_transform``.
        X_test_original: scaled test frame with all original columns, row
            aligned with ``X_test``.

    Returns:
        The ``model_results`` mapping as filled in by ``get_scores``.

    Raises:
        ValueError: if ``models`` is empty.
    """
    if not models:
        # Without a fitted model the per-row assignment below cannot run.
        raise ValueError('train_models needs at least one model')

    model_results = {}
    model_results_drugs = {}
    for model in models:
        is_keras = str(get_model_name(model)) == 'Sequential'
        if is_keras:
            model.compile(optimizer='adam', loss='mean_squared_error')
        random.seed(42)
        model = cross_val(model, train, X_test, Y_test, X_train, Y_train, response_variable_list)
        random.seed(42)
        if is_keras:
            model.fit(X_train, Y_train, epochs=250, batch_size=16, verbose=0)
        else:
            model.fit(X_train, Y_train)
        _, model_results, model_results_drugs, _ = get_scores(
            model, X_test, Y_test, X_train, Y_train, model_results, model_results_drugs)

    # (column suffix, response column, baseline column, report title).
    # The position in this tuple is the position of the outcome in Y_train.
    outcomes = (
        ('hba1c', 'hba1c_12m', 'hba1c_bl_6m', 'HBA1C'),
        ('ldl', 'ldl_12m', 'ldl', 'LDL'),
        ('hdl', 'hdl_12m', 'hdl', 'HDL'),
        ('bmi', 'bmi_12m', 'bmi', 'BMI'),
    )

    X_test_copy = X_test.copy()
    for suffix, _, _, _ in outcomes:
        X_test_copy['assigned_drug_' + suffix] = np.nan
        X_test_copy['predicted_change_' + suffix] = np.nan

    target_names = list(Y_train.columns)
    # `model` is the last model fitted in the loop above.
    for index, row in X_test.iterrows():
        pred_sglt, pred_dpp = pred_all(model, row, row['drug_class'])

        choices = []
        for j, target in enumerate(target_names):
            # hdl_12m uses the "highest" helper, every other outcome the
            # "lowest" one (presumably higher HDL is the better result --
            # confirm against the helper).
            if target == 'hdl_12m':
                choices.append(find_highest_respponse_value(pred_sglt[j], pred_dpp[j]))
            else:
                choices.append(find_lowest_respponse_value(pred_sglt[j], pred_dpp[j]))

        for position, (suffix, _, _, _) in enumerate(outcomes):
            best_change, best_drug = choices[position]
            X_test_copy.at[index, 'assigned_drug_' + suffix] = best_drug
            X_test_copy.at[index, 'predicted_change_' + suffix] = best_change

    # Undo the feature scaling; the scaled drug_class is replaced further
    # down by the one from the model input.
    data = pd.DataFrame(scaler.inverse_transform(X_test_original),
                        columns=X_test_original.columns)
    data = data.drop(['drug_class'], axis=1)

    # Positional alignment: all frames are re-indexed 0..n-1 before joining.
    scored = X_test_copy.reset_index()
    observed = pd.DataFrame(Y_test).reset_index()

    data[response_variable_list] = observed[response_variable_list]
    for suffix, _, _, _ in outcomes:
        data['assigned_drug_' + suffix] = scored['assigned_drug_' + suffix]
        data['predicted_change_' + suffix] = scored['predicted_change_' + suffix]
    data['drug_class'] = scored['drug_class']

    dpp_strata_actual = data[data['drug_class'] == dpp_val]
    sglt_strata_actual = data[data['drug_class'] == sglt_val]

    # Outlier-free assigned strata per outcome.
    cleaned = []
    for suffix, response, baseline, _ in outcomes:
        assigned_col = 'assigned_drug_' + suffix
        predicted_col = 'predicted_change_' + suffix
        dpp_df = drug_class_visualization(
            data[data[assigned_col] == dpp_val], dpp_strata_actual,
            response, predicted_col, assigned_col, baseline)
        sglt_df = drug_class_visualization(
            data[data[assigned_col] == sglt_val], sglt_strata_actual,
            response, predicted_col, assigned_col, baseline)
        cleaned.append((dpp_df, sglt_df))

    # All concordance computations run before any percentage report so the
    # helpers' printed output keeps its order.
    groups = []
    for (suffix, _, _, _), (dpp_df, sglt_df) in zip(outcomes, cleaned):
        concordant_dpp, discordant_dpp_sglt, concordant_sglt, discordant_sglt_dpp = \
            get_concordant_discordant(dpp_df, sglt_df, data,
                                      dpp_strata_actual, sglt_strata_actual,
                                      variable_name='assigned_drug_' + suffix)
        groups.append((concordant_dpp, discordant_dpp_sglt,
                       concordant_sglt, discordant_sglt_dpp))

    for (_, response, baseline, title), group in zip(outcomes, groups):
        concordant_dpp, discordant_dpp_sglt, concordant_sglt, discordant_sglt_dpp = group
        print('\n -------- Percentage ' + title + '  ---------')
        calculate_percentage_change(concordant_dpp, discordant_dpp_sglt,
                                    concordant_sglt, discordant_sglt_dpp,
                                    response_variable=response, baseline_val=baseline)

    return model_results

#         if hasattr(model, 'feature_importances_'):
#             feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
#             feat_importances.nlargest(20).plot(kind='barh')
#             plt.show()
#         elif hasattr(model, 'coef_'):
#             coef_importances = pd.Series(abs(model.coef_), index=X_train.columns)
#             top_20_features = coef_importances.nlargest(20)
#             top_20_features.plot(kind='barh')
#             plt.show()




In [14]:
def run(algo, i, feats_hc=None):
    """Load the data, select features, drop outliers and evaluate the ensemble.

    Args:
        algo: feature-selection mode. ``'kbest'`` selects features per
            response column with ``get_features_kbest``; ``'relieff'``,
            ``'ref'`` and ``'refMulti'`` use the pre-computed ``feats_hc``.
            Any other value selects only the always-kept columns in
            ``items``.
        i: number of features forwarded to ``get_features_kbest``; only used
            when ``algo == 'kbest'``.
        feats_hc: pre-computed feature names. The list is copied, never
            modified. Defaults to no features.

    Returns:
        The result mapping returned by ``train_models``.
    """
    # Work on a copy so neither the caller's list nor a shared default is
    # mutated when the always-kept columns are appended below.
    feats_hc = [] if feats_hc is None else list(feats_hc)

    df_X_train = pd.read_csv('../../'+TRAIN_PATH_WO_LDL_IMPUTATION, sep=',', decimal='.',
                             encoding='utf-8', engine='python', index_col=0)
    df_X_test = pd.read_csv('../../'+TEST_PATH_WO_LDL_IMPUTATION, sep=',', decimal='.',
                            encoding='utf-8', engine='python', index_col=0)

    X_train_ = preprocess(df_X_train, response_variable_list)
    X_test_ = preprocess(df_X_test, response_variable_list)
    _, X_train, X_test, Y_train, Y_test, _, _, scaler, _ = get_test_train_data(
        X_train_, X_test_, response_variable_list)

    # Train with the whole dataset but test only on the rows whose drug_class
    # is 0.25 or 0.375.
    if is_train_with_all:
        combined_df = pd.concat([X_test, Y_test], axis=1)
        testdf = combined_df[combined_df['drug_class'].isin([0.25, 0.375])]
        # response_variable_list is already a list of column names.
        X_test = testdf.drop(response_variable_list, axis=1)
        Y_test = testdf[response_variable_list]

    # Full-width copy of the test features, later needed to undo the scaling.
    X_test_original = X_test.copy()

    X_train = X_train.drop(['init_year'], axis=1)
    X_test = X_test.drop(['init_year'], axis=1)

    random.seed(42)
    if algo == 'kbest':
        selected_list = []
        for j in range(Y_train.shape[1]):
            selected_list.extend(get_features_kbest(X_train, Y_train.iloc[:, j], i))
    elif algo in ('relieff', 'ref', 'refMulti'):
        selected_list = feats_hc
    else:
        # Unknown modes fall back to the always-kept columns only.
        selected_list = []

    for item in items:
        if item not in selected_list:
            selected_list.append(item)

    # Remove duplicates (np.unique also sorts the names).
    selected_list = np.unique(selected_list)
    print('\n\n')
    print(selected_list.tolist())
    X_train_selected = X_train[selected_list]
    X_test_selected = X_test[selected_list]

    # ---- outlier removal ----
    print('Shape of training data before removing outliers:', np.shape(X_train_selected))
    print('Shape of test data before removing outliers:', np.shape(X_test_selected))

    out_train, out_test = outlier_detect(X_train_selected, Y_train, X_test_selected, Y_test)

    # Features and responses are glued together so both lose the same rows.
    train = X_train_selected.copy()
    train[response_variable_list] = Y_train.values
    test = X_test_selected.copy()
    test[response_variable_list] = Y_test.values

    train = train.drop(out_train, axis=0)
    test = test.drop(out_test, axis=0)

    Y_train = train[response_variable_list]
    X_train_selected = train.drop(response_variable_list, axis=1)
    Y_test = test[response_variable_list]
    X_test_selected = test.drop(response_variable_list, axis=1)

    print('Shape of training data after removing outliers:', np.shape(X_train_selected))
    print('Shape of test data after removing outliers:', np.shape(X_test_selected))

    # Keep the full-width test frame row-aligned with the filtered test set.
    X_test_original = X_test_original.drop(out_test, axis=0)

    # ---- model: voting ensemble wrapped in a regressor chain ----
    rfr = RandomForestRegressor(n_estimators=150, max_depth=10, random_state=123)
    catboost = CatBoostRegressor(iterations=40, learning_rate=0.1, depth=6, verbose=0)
    ltbr = ltb.LGBMRegressor(max_depth=6, learning_rate=0.1, verbose=-1, verbose_eval=False)

    vr = VotingRegressor([('catboost', catboost), ('ltbr', ltbr), ('rfr', rfr)])
    # The chain fits the four responses in column order 0..3.
    wrapper = RegressorChain(vr, order=[0, 1, 2, 3])

    random.seed(42)
    model_results = train_models([wrapper], X_test_selected, Y_test, X_train_selected,
                                 Y_train, train, scaler, X_test_original)
    return model_results

    

In [15]:

# Sweep the 'kbest' selection over 2..10 features and print one score table
# per run.
feature_size = list(range(2, 11))
warnings.filterwarnings('ignore')
for i in feature_size:
    print('\n\n\n===============  ', i, ' features ', '   ===================\n')
    model_results = run('kbest', i)

    # One table row per entry of the mapping returned by run().
    table = [[model, score] for model, score in model_results.items()]
    table_str = tabulate(table, headers=['Model', 'Test R2 Score'], tablefmt='grid')
    print(table_str)

    print('\n\n\n')
        






Shape of data after excluding missing response: (1566, 125)
Shape of full data after selecting date range dates > 21 days (1084, 125)
Shape of data after excluding missing response: (156, 125)
Shape of full data after selecting date range dates > 21 days (102, 125)
X_train shape after imputation:  (1077, 107)

 Shape of the data after oversampling
{3.0, 4.0}
[(3.0, 440), (4.0, 637)]
[(3.0, 637), (4.0, 637)]



['bmi', 'drug_class', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ldl', 'obese', 'trigly']
Shape of training data before removing outliers: (1274, 8)
Shape of test data before removing outliers: (101, 8)
Training set outliers: [640, 834, 238, 1060, 448, 640, 834, 923, 952, 966, 1066, 565, 640, 778, 834]
Testing set outliers: []
Shape of training data after removing outliers: (1263, 8)
Shape of test data after removing outliers: (101, 8)
Cross validation variance 0.0016256645483346528
Cross validation mean score 0.6156694474031493
R2 score Training : 0.7950530047135204
R2 score Tes


KeyboardInterrupt



In [4]:

# Evaluate the hard-coded feature sets passed to run() in 'relieff' mode;
# feature_size[k] is the label printed for feats[k].
feature_size = [2, 3, 4, 5, 7, 9]

feats = [
    ['bmi', 'cvd_comp', 'drug_class', 'hba1c_bl_6m', 'hdl', 'insulin', 'ldl', 'obese', 'sp'],
    ['bmi', 'comb_comp_enn', 'cvd_comp', 'drug_class', 'eGFR', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'insulin', 'ldl', 'obese', 'sp', 'sum_diab_drugs'],
    ['bmi', 'comb_comp_enn', 'concordant_dis', 'cvd_comp', 'drug_class', 'eGFR', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'insulin', 'ldl', 'obese', 'sp', 'sum_diab_drugs', 'trigly'],
    ['MD_RCT_mmol_mol', 'bmi', 'comb_comp_enn', 'concordant_dis', 'cvd_comp', 'drug_class', 'eGFR', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'insulin', 'ldl', 'obese', 'sp', 'sum_diab_drugs', 't2d_dur_y', 'trigly'],
    ['MD_RCT_mmol_mol', 'bmi', 'chd', 'comb_comp_enn', 'concordant_dis', 'cvd_comp', 'drug_class', 'eGFR', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'hyperten', 'ika', 'insulin', 'ldl', 'obese', 'sp', 'sum_diab_drugs', 't2d_dur_y', 'trigly'],
    ['MD_RCT_mmol_mol', 'bmi', 'chd', 'comb_comp_enn', 'concordant_dis', 'cvd_comp', 'dg406', 'drug_class', 'eGFR', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'hyperten', 'ika', 'insulin', 'ldl', 'obese', 'sp', 'sum_diab_drugs', 't2d_dur_y', 'trigly'],
]

warnings.filterwarnings('ignore')
for val, selected in zip(feature_size, feats):
    print('\n\n\n===============  ', val, ' features ', '   ===================\n')
    model_results = run('relieff', val, selected)

    # One table row per entry of the mapping returned by run().
    table = [[model, score] for model, score in model_results.items()]
    table_str = tabulate(table, headers=['Model', 'Test R2 Score'], tablefmt='grid')
    print(table_str)

    print('\n\n\n')





Shape of data after excluding missing response: (1566, 125)
Shape of full data after selecting date range dates > 21 days (1084, 115)
Shape of data after excluding missing response: (156, 125)
Shape of full data after selecting date range dates > 21 days (102, 115)
X_train shape:  (1077, 107)
(1077, 107)
{3.0, 4.0}
[(3.0, 440), (4.0, 637)]
[(3.0, 637), (4.0, 637)]



['bmi', 'cvd_comp', 'drug_class', 'hba1c_bl_6m', 'hdl', 'insulin', 'ldl', 'obese', 'sp']
Shape of training data before removing outliers: (1274, 9)
Shape of test data before removing outliers: (101, 9)
Training set outliers: [1060, 448, 834, 923, 952, 1066, 565, 670, 778, 834, 1058, 1111]
Testing set outliers: []
Shape of training data after removing outliers: (1263, 9)
Shape of test data after removing outliers: (101, 9)
Cross validation variance 0.0006533138749707186
Cross validation mean score 0.6058475391060799
R2 score Training : 0.7904531788928693
R2 score Testing: 0.4683
RMSE: 5.349908
DPP samples  42 49
SGLT sa

(1077, 107)
{3.0, 4.0}
[(3.0, 440), (4.0, 637)]
[(3.0, 637), (4.0, 637)]



['bmi', 'comb_comp_enn', 'concordant_dis', 'cvd_comp', 'drug_class', 'eGFR', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'insulin', 'ldl', 'obese', 'sp', 'sum_diab_drugs', 'trigly']
Shape of training data before removing outliers: (1274, 16)
Shape of test data before removing outliers: (101, 16)
Training set outliers: [340, 521, 640, 762, 972, 238, 252, 1060, 69, 826, 923, 952, 1066, 145, 363, 565, 1058, 1111]
Testing set outliers: []
Shape of training data after removing outliers: (1256, 16)
Shape of test data after removing outliers: (101, 16)
Cross validation variance 0.0002939518635962457
Cross validation mean score 0.6298473176744157
R2 score Training : 0.8320642739037185
R2 score Testing: 0.4651
RMSE: 5.499023
DPP samples  40 49
SGLT samples  61 52


Category    Real value    Predicted value    Count    Percentage of Predicted cases
----------  ------------  -----------------  -------  ------------------

(1077, 107)
{3.0, 4.0}
[(3.0, 440), (4.0, 637)]
[(3.0, 637), (4.0, 637)]



['MD_RCT_mmol_mol', 'bmi', 'chd', 'comb_comp_enn', 'concordant_dis', 'cvd_comp', 'drug_class', 'eGFR', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'hyperten', 'ika', 'insulin', 'ldl', 'obese', 'sp', 'sum_diab_drugs', 't2d_dur_y', 'trigly']
Shape of training data before removing outliers: (1274, 20)
Shape of test data before removing outliers: (101, 20)
Training set outliers: [340, 521, 640, 762, 972, 238, 1060, 69, 315, 826, 923, 952, 1066, 145, 363, 565, 1058, 1111]
Testing set outliers: []
Shape of training data after removing outliers: (1256, 20)
Shape of test data after removing outliers: (101, 20)
Cross validation variance 0.0006281834843839548
Cross validation mean score 0.6358413812737341
R2 score Training : 0.8403686063177956
R2 score Testing: 0.4704
RMSE: 5.430112
DPP samples  44 49
SGLT samples  56 52


Category    Real value    Predicted value    Count    Percentage of Predicted cases
----------  ---------

In [5]:

# Evaluate the hard-coded feature sets passed to run() in 'ref' mode;
# feature_size[k] is the label printed for feats[k].
feats = [
    ['bmi', 'drug_class', 'eGFR', 'hba1c_bl_6m', 'hdl', 'ldl'],
    ['bmi', 'drug_class', 'eGFR', 'gluk', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'ldl'],
    ['bmi', 'drug_class', 'eGFR', 'gluk', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'ldl', 'trigly'],
    ['P_Krea', 'bmi', 'drug_class', 'eGFR', 'gluk', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'ldl', 'trigly'],
    ['P_Krea', 'bmi', 'drug_class', 'eGFR', 'gluk', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'ldl', 't2d_dur_y', 'trigly'],
]

feature_size = [2, 3, 4, 5, 6]
warnings.filterwarnings('ignore')
for val, selected in zip(feature_size, feats):
    print('\n\n\n===============  ', val, ' features ', '   ===================\n')
    model_results = run('ref', val, selected)

    # One table row per entry of the mapping returned by run().
    table = [[model, score] for model, score in model_results.items()]
    table_str = tabulate(table, headers=['Model', 'Test R2 Score'], tablefmt='grid')
    print(table_str)

    print('\n\n\n')





Shape of data after excluding missing response: (1566, 125)
Shape of full data after selecting date range dates > 21 days (1084, 115)
Shape of data after excluding missing response: (156, 125)
Shape of full data after selecting date range dates > 21 days (102, 115)
X_train shape:  (1077, 107)
(1077, 107)
{3.0, 4.0}
[(3.0, 440), (4.0, 637)]
[(3.0, 637), (4.0, 637)]



['bmi', 'drug_class', 'eGFR', 'hba1c_bl_6m', 'hdl', 'ldl']
Shape of training data before removing outliers: (1274, 6)
Shape of test data before removing outliers: (101, 6)
Training set outliers: [238, 1060, 448, 834, 923, 834]
Testing set outliers: []
Shape of training data after removing outliers: (1269, 6)
Shape of test data after removing outliers: (101, 6)
Cross validation variance 8.912255373153239e-05
Cross validation mean score 0.5967027247551456
R2 score Training : 0.7908526086243748
R2 score Testing: 0.4490
RMSE: 5.591692
DPP samples  57 49
SGLT samples  44 52


Category    Real value    Predicted value    Cou

(1077, 107)
{3.0, 4.0}
[(3.0, 440), (4.0, 637)]
[(3.0, 637), (4.0, 637)]



['bmi', 'drug_class', 'eGFR', 'gluk', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'ldl', 'trigly']
Shape of training data before removing outliers: (1274, 10)
Shape of test data before removing outliers: (101, 10)
Training set outliers: [340, 521, 640, 972, 238, 252, 1060, 69, 315, 826, 923, 952, 1066, 145, 363, 565, 749, 1058, 1111]
Testing set outliers: []
Shape of training data after removing outliers: (1255, 10)
Shape of test data after removing outliers: (101, 10)
Cross validation variance 6.819618142091637e-05
Cross validation mean score 0.6229774527605346
R2 score Training : 0.8287653751617574
R2 score Testing: 0.4447
RMSE: 5.635239
DPP samples  51 49
SGLT samples  50 52


Category    Real value    Predicted value    Count    Percentage of Predicted cases
----------  ------------  -----------------  -------  -------------------------------
Concordant  SGLT          SGLT               25       50.00%
Dis

(1077, 107)
{3.0, 4.0}
[(3.0, 440), (4.0, 637)]
[(3.0, 637), (4.0, 637)]



['P_Krea', 'bmi', 'drug_class', 'eGFR', 'gluk', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'ldl', 't2d_dur_y', 'trigly']
Shape of training data before removing outliers: (1274, 12)
Shape of test data before removing outliers: (101, 12)
Training set outliers: [62, 340, 521, 762, 972, 252, 1060, 315, 468, 826, 923, 938, 952, 1066, 145, 363, 565, 749, 1058, 1111]
Testing set outliers: []
Shape of training data after removing outliers: (1254, 12)
Shape of test data after removing outliers: (101, 12)
Cross validation variance 3.9228766755695196e-05
Cross validation mean score 0.6342659494377912
R2 score Training : 0.836688349987735
R2 score Testing: 0.4603
RMSE: 5.454550
DPP samples  47 49
SGLT samples  54 52


Category    Real value    Predicted value    Count    Percentage of Predicted cases
----------  ------------  -----------------  -------  -------------------------------
Concordant  SGLT          SGLT      

In [6]:
# Evaluate the hard-coded feature sets passed to run() in 'refMulti' mode;
# feature_size[k] is the label printed for feats[k].
feats = [
    ['P_Krea', 'bmi', 'drug_class', 'eGFR', 'gluk', 'hba1c_bl_18m', 'hba1c_bl_6m', 'ika', 'ldl', 't2d_dur_y', 'trigly'],
    ['P_Krea', 'bmi', 'drug_class', 'eGFR', 'gluk', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'ldl', 'n_of_dis', 't2d_dur_y', 'trigly'],
    ['MD_RCT_mmol_mol', 'P_Krea', 'bmi', 'drug_class', 'eGFR', 'gluk', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'ldl', 'n_of_dis', 'smoking', 'sum_diab_drugs', 't2d_dur_y', 'trigly'],
    ['C10A', 'MD_RCT_mmol_mol', 'P_Krea', 'bmi', 'comb_comp_enn', 'drug_class', 'eGFR', 'gluk', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'insulin', 'ldl', 'met_oad0', 'n_of_dis', 'smoking', 'sum_diab_drugs', 't2d_dur_y', 'trigly'],
    ['C10A', 'MD_RCT_mmol_mol', 'P_Krea', 'T2D_nocomp', 'bmi', 'comb_comp_enn', 'drug_class', 'eGFR', 'gluk', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'insulin', 'ldl', 'met_oad0', 'n_of_dis', 'renal_insuf', 'smoking', 'sum_diab_drugs', 't2d_dur_y', 'trigly'],
]

feature_size = [10, 12, 15, 20, 22]
warnings.filterwarnings('ignore')
for val, selected in zip(feature_size, feats):
    print('\n\n\n===============  ', val, ' features ', '   ===================\n')
    model_results = run('refMulti', val, selected)

    # One table row per entry of the mapping returned by run().
    table = [[model, score] for model, score in model_results.items()]
    table_str = tabulate(table, headers=['Model', 'Test R2 Score'], tablefmt='grid')
    print(table_str)

    print('\n\n\n')
        






Shape of data after excluding missing response: (1566, 125)
Shape of full data after selecting date range dates > 21 days (1084, 115)
Shape of data after excluding missing response: (156, 125)
Shape of full data after selecting date range dates > 21 days (102, 115)
X_train shape:  (1077, 107)
(1077, 107)
{3.0, 4.0}
[(3.0, 440), (4.0, 637)]
[(3.0, 637), (4.0, 637)]



['P_Krea', 'bmi', 'drug_class', 'eGFR', 'gluk', 'hba1c_bl_18m', 'hba1c_bl_6m', 'ika', 'ldl', 't2d_dur_y', 'trigly']
Shape of training data before removing outliers: (1274, 11)
Shape of test data before removing outliers: (101, 11)
Training set outliers: [62, 340, 521, 762, 972, 252, 1060, 219, 278, 288, 875, 924, 1063, 145, 363, 565, 749, 1058, 1111]
Testing set outliers: []
Shape of training data after removing outliers: (1255, 11)
Shape of test data after removing outliers: (101, 11)
Cross validation variance 0.00014121884891293474
Cross validation mean score 0.5087618743882788
R2 score Training : 0.770731919892667
R

(1077, 107)
{3.0, 4.0}
[(3.0, 440), (4.0, 637)]
[(3.0, 637), (4.0, 637)]



['MD_RCT_mmol_mol', 'P_Krea', 'bmi', 'drug_class', 'eGFR', 'gluk', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'ldl', 'n_of_dis', 'smoking', 'sum_diab_drugs', 't2d_dur_y', 'trigly']
Shape of training data before removing outliers: (1274, 16)
Shape of test data before removing outliers: (101, 16)
Training set outliers: [62, 340, 521, 762, 972, 1060, 315, 826, 923, 938, 952, 1066, 145, 363, 565, 749, 1058, 1111]
Testing set outliers: []
Shape of training data after removing outliers: (1256, 16)
Shape of test data after removing outliers: (101, 16)
Cross validation variance 7.919007549090501e-05
Cross validation mean score 0.6370364246978598
R2 score Training : 0.8437217837311796
R2 score Testing: 0.4640
RMSE: 5.399471
DPP samples  27 49
SGLT samples  73 52


Category    Real value    Predicted value    Count    Percentage of Predicted cases
----------  ------------  -----------------  -------  ------------------

{3.0, 4.0}
[(3.0, 440), (4.0, 637)]
[(3.0, 637), (4.0, 637)]



['C10A', 'MD_RCT_mmol_mol', 'P_Krea', 'T2D_nocomp', 'bmi', 'comb_comp_enn', 'drug_class', 'eGFR', 'gluk', 'hba1c_bl_18m', 'hba1c_bl_6m', 'hdl', 'ika', 'insulin', 'ldl', 'met_oad0', 'n_of_dis', 'renal_insuf', 'smoking', 'sum_diab_drugs', 't2d_dur_y', 'trigly']
Shape of training data before removing outliers: (1274, 22)
Shape of test data before removing outliers: (101, 22)
Training set outliers: [62, 340, 521, 762, 972, 1060, 315, 468, 826, 923, 938, 952, 1066, 145, 363, 565, 749, 1058, 1111]
Testing set outliers: []
Shape of training data after removing outliers: (1255, 22)
Shape of test data after removing outliers: (101, 22)
Cross validation variance 0.00021744955810470902
Cross validation mean score 0.6363598615833382
R2 score Training : 0.846418069490694
R2 score Testing: 0.4764
RMSE: 5.286809
DPP samples  39 49
SGLT samples  61 52


Category    Real value    Predicted value    Count    Percentage of Predicted cases
--