In [1]:
# This notebook combines all 4 per-outcome drug predictions into a single prediction based on majority vote.
# If the votes are tied, the sample is assigned based on the HbA1c prediction value.

# Majority

In [2]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import sys
import os

from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))  

from utils import calculate_accuracy, ensemble_based_on_majority, find_optimal_threshold, calculate_change_diff
from helper import read_data, calculate_percentage_change, get_concordant_discordant

In [3]:
# Load the per-sample prediction table; the first CSV column becomes the row index.
df = pd.read_csv(
    '../../resources/output/pred_drug_classes.csv',
    sep=',',
    decimal='.',
    encoding='utf-8',
    engine='python',
    index_col=0,
)

# Columns used by the rest of the notebook, kept in their original order:
# per-outcome drug assignments, the recorded drug class, 12-month values,
# baseline values and the predicted_change_* columns.
drug_columns = [
    'assigned_drug_hba1c', 'assigned_drug_ldl', 'assigned_drug_hdl', 'assigned_drug_bmi',
    'drug_class',
    'hba1c_12m', 'ldl_12m', 'hdl_12m', 'bmi_12m',
    'hba1c_bl_6m', 'ldl', 'hdl', 'bmi',
    'predicted_change_hba1c', 'predicted_change_ldl', 'predicted_change_hdl', 'predicted_change_bmi',
]
df_drugs = df[drug_columns]


In [4]:
# The four per-outcome drug assignments that take part in the vote.
vote_columns = ['assigned_drug_hba1c', 'assigned_drug_ldl', 'assigned_drug_hdl', 'assigned_drug_bmi']

# Work on a copy so df_drugs stays untouched for the later sections.
df_ = df_drugs.copy()

# ensemble_based_on_majority comes from utils (not visible in this notebook).
# 'assigned_drug_hba1c' is presumably the tie-breaker column and 'ensemble_drug'
# the name of the produced column - TODO confirm against utils.
ensemble_votes = ensemble_based_on_majority(
    df_drugs[vote_columns],
    'assigned_drug_hba1c',
    'ensemble_drug',
)
df_.loc[:, 'ensemble_drug'] = ensemble_votes
df_

Unnamed: 0,assigned_drug_hba1c,assigned_drug_ldl,assigned_drug_hdl,assigned_drug_bmi,drug_class,hba1c_12m,ldl_12m,hdl_12m,bmi_12m,hba1c_bl_6m,ldl,hdl,bmi,predicted_change_hba1c,predicted_change_ldl,predicted_change_hdl,predicted_change_bmi,ensemble_drug
0,1.0,1.0,0.0,1.0,0.0,58.0,2.3,1.39,30.430000,63.0,2.6,1.46,30.830000,56.018617,2.528864,1.427429,30.063085,1.0
1,1.0,0.0,1.0,1.0,0.0,78.0,1.4,0.52,32.529999,75.0,3.8,0.72,31.540000,72.320903,3.000471,0.866972,31.366936,1.0
2,1.0,1.0,0.0,1.0,0.0,103.0,2.0,0.77,35.110001,79.0,2.0,0.67,37.760000,71.863396,1.985390,0.831057,37.086272,1.0
3,1.0,1.0,1.0,1.0,1.0,51.0,3.2,0.94,26.770000,67.0,2.6,0.84,27.820000,59.861824,2.508297,0.937501,28.847765,1.0
4,0.0,1.0,0.0,1.0,0.0,61.0,3.0,1.47,29.760000,55.0,3.1,1.33,30.990000,56.061498,2.486481,1.228122,30.005407,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1.0,0.0,1.0,1.0,1.0,61.0,1.7,0.82,33.200001,55.0,1.7,0.79,33.203125,51.826783,1.836274,0.862424,32.711037,1.0
97,0.0,1.0,0.0,0.0,1.0,63.0,3.4,2.07,22.788994,60.0,3.0,2.16,23.350000,61.985294,2.583750,1.960671,22.721342,0.0
98,1.0,1.0,1.0,1.0,1.0,77.0,1.5,1.00,31.246056,92.0,1.3,0.89,31.730000,67.492458,1.671794,0.986951,31.132784,1.0
99,1.0,1.0,0.0,1.0,1.0,56.0,1.3,0.87,28.212244,59.0,2.3,1.03,29.289796,51.692244,1.888852,1.043621,29.568023,1.0


In [5]:
# Evaluate the majority-vote assignment against the recorded drug class.
y_true, y_pred = df_['drug_class'], df_['ensemble_drug']

# Precision and recall use scikit-learn's default (binary) averaging.
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)

print('\n ======= Majority vote model results =======')
accuracy = calculate_accuracy(df_, 'drug_class', 'ensemble_drug')
print(f"Accuracy: {accuracy:.2f}")
# NOTE: F1 is class-weighted here, so it is not the harmonic mean of the
# binary precision/recall printed below.
weighted_f1 = f1_score(y_true, y_pred, average='weighted')
print(f"F1 score: {weighted_f1}")
print("Precision:", precision)
print("Recall:", recall)

cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(cm)



Accuracy: 0.56
F1 score: 0.5345531174739097
Precision: 0.5526315789473685
Recall: 0.8076923076923077
Confusion Matrix:
[[15 34]
 [10 42]]


In [6]:
# Label encoding used in both the predicted and the recorded drug columns.
sglt_val = 1
dpp_val = 0

# Strata by the ensemble (predicted) assignment.
predicted_drug = df_['ensemble_drug']
dpp_strata = df_[predicted_drug == dpp_val]
sglt_strata = df_[predicted_drug == sglt_val]

# Strata by the drug class recorded in the data.
recorded_drug = df_['drug_class']
dpp_strata_actual = df_[recorded_drug == dpp_val]
sglt_strata_actual = df_[recorded_drug == sglt_val]

# Split the samples into the four concordant / discordant groups
# (helper imported from helper.py; it also prints the count tables).
concordance_groups = get_concordant_discordant(
    dpp_strata, sglt_strata, df_,
    dpp_strata_actual, sglt_strata_actual,
    variable_name='ensemble_drug',
)
(concordant_dpp, discordant_dpp_sglt,
 concordant_sglt, discordant_sglt_dpp) = concordance_groups

# One report per outcome: (section header, 12-month column, baseline column).
outcome_reports = [
    ('\n============= HBA1C ===================', 'hba1c_12m', 'hba1c_bl_6m'),
    ('\n============= LDL ===================', 'ldl_12m', 'ldl'),
    ('\n============= HDL ===================', 'hdl_12m', 'hdl'),
    ('\n============= BMI ===================', 'bmi_12m', 'bmi'),
]
for section_header, response_col, baseline_col in outcome_reports:
    print(section_header)
    calculate_percentage_change(
        concordant_dpp, discordant_dpp_sglt,
        concordant_sglt, discordant_sglt_dpp,
        response_variable=response_col, baseline_val=baseline_col,
    )


DPP samples  25 49
SGLT samples  76 52


Category    Real value    Predicted value    Count    Percentage of Predicted cases
----------  ------------  -----------------  -------  -------------------------------
Concordant  SGLT          SGLT               42       55.26%
Discordant  DPP           SGLT               34       40.00%

Concordant  DPP           DPP                15       60.00%
Discordant  SGLT          DPP                10       44.74%



Category    Real value    Predicted value    Mean Change from Baseline    std    treatment difference
----------  ------------  -----------------  ---------------------------  -----  ----------------------
Concordant  SGLT          SGLT               -11.57                       15.52  -8.19
Discordant  DPP           SGLT               -3.38                        11.67

Concordant  DPP           DPP                -5.73                        17.66  -5.23
Discordant  SGLT          DPP                -0.50                        8.21



In [7]:
# Observed vs predicted change with respect to baseline, one call per outcome.
# NOTE (from the original author): calculated for concordant samples only - still to be double-checked.
# Each tuple: (12-month column, baseline column, predicted-change column, label).
change_specs = [
    ('hba1c_12m', 'hba1c_bl_6m', 'predicted_change_hba1c', 'hba1c'),
    ('ldl_12m', 'ldl', 'predicted_change_ldl', 'ldl'),
    ('hdl_12m', 'hdl', 'predicted_change_hdl', 'hdl'),
    ('bmi_12m', 'bmi', 'predicted_change_bmi', 'bmi'),
]
for change_args in change_specs:
    calculate_change_diff(concordant_dpp, discordant_dpp_sglt, concordant_sglt, discordant_sglt_dpp,
                          *change_args)

The number of patients who showed improvement over 12-month with  hba1c  change (observed vs predicted) 24 : 33
The number of patients who showed improvement over 12-month with  ldl  change (observed vs predicted) 29 : 28
The number of patients who showed improvement over 12-month with  hdl  change (observed vs predicted) 29 : 28
The number of patients who showed improvement over 12-month with  bmi  change (observed vs predicted) 23 : 34


# Based on regression model weights


In [8]:

# Load the feature importances exported by the regression model; the code
# below reads its 'Feature' and 'Importance' columns.
regression_model_fi = pd.read_csv(
    '../../resources/output/feature_importance_dataframe.csv',
    sep=',', decimal='.', encoding='utf-8', engine='python',
)

df_reg_weights = df_drugs.copy()

# Map feature name -> importance in a single pass. This replaces a row-by-row
# iterrows() loop; as before, a later duplicate feature overrides an earlier one.
variables = dict(zip(regression_model_fi['Feature'], regression_model_fi['Importance']))

# Fail early, naming every absent feature, rather than on the first bare lookup.
required_features = ['hba1c_bl_6m', 'ldl', 'hdl', 'bmi']
missing_features = [name for name in required_features if name not in variables]
if missing_features:
    raise KeyError(f"Missing feature importances for: {missing_features}")

# The importance of each baseline measurement weights the matching drug vote.
hba1c_cost = variables['hba1c_bl_6m']
ldl_cost = variables['ldl']
hdl_cost = variables['hdl']
bmi_cost = variables['bmi']

weighted_sum = (
    df_reg_weights['assigned_drug_hba1c'] * hba1c_cost +
    df_reg_weights['assigned_drug_ldl'] * ldl_cost +
    df_reg_weights['assigned_drug_hdl'] * hdl_cost +
    df_reg_weights['assigned_drug_bmi'] * bmi_cost
)

actual_values = df_reg_weights['drug_class']

# Despite its name, the threshold is simply the mean of the weighted vote; it
# is not tuned against the labels. Alternatives considered by the author:
# weighted_sum.median() or find_optimal_threshold(actual_values, weighted_sum).
optimal_threshold = weighted_sum.mean()

# Samples whose weighted vote reaches the threshold get class 1, the rest 0.
df_reg_weights['weighted_binary'] = (weighted_sum >= optimal_threshold).astype(int)

y_true = df_reg_weights['drug_class']
y_pred = df_reg_weights['weighted_binary']

# Precision and recall use scikit-learn's default (binary) averaging.
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)

print('\n ======= Model based on regression model weights =======')

print(f"Optimal Threshold: {optimal_threshold}")
print(f"Accuracy: {calculate_accuracy(df_reg_weights, 'drug_class', 'weighted_binary'):.2f}")
# NOTE: F1 is class-weighted, so it is not the harmonic mean of the binary
# precision/recall printed below.
print(f"F1 score: {f1_score(y_true, y_pred, average='weighted')}")
print("Precision:", precision)
print("Recall:", recall)

cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(cm)



Optimal Threshold: 1.265009949306025
Accuracy: 0.50
F1 score: 0.48727715350245665
Precision: 0.5079365079365079
Recall: 0.6153846153846154
Confusion Matrix:
[[18 31]
 [20 32]]


In [9]:
# Label encoding used in both the predicted and the recorded drug columns.
sglt_val = 1
dpp_val = 0

# Strata by the regression-weighted (predicted) assignment.
predicted_drug = df_reg_weights['weighted_binary']
dpp_strata = df_reg_weights[predicted_drug == dpp_val]
sglt_strata = df_reg_weights[predicted_drug == sglt_val]

# Strata by the drug class recorded in the data.
recorded_drug = df_reg_weights['drug_class']
dpp_strata_actual = df_reg_weights[recorded_drug == dpp_val]
sglt_strata_actual = df_reg_weights[recorded_drug == sglt_val]

# Split the samples into the four concordant / discordant groups
# (helper imported from helper.py; it also prints the count tables).
concordance_groups = get_concordant_discordant(
    dpp_strata, sglt_strata, df_reg_weights,
    dpp_strata_actual, sglt_strata_actual,
    variable_name='weighted_binary',
)
(concordant_dpp, discordant_dpp_sglt,
 concordant_sglt, discordant_sglt_dpp) = concordance_groups

# One report per outcome: (section header, 12-month column, baseline column).
outcome_reports = [
    ('\n============= HBA1C ===================', 'hba1c_12m', 'hba1c_bl_6m'),
    ('\n============= LDL ===================', 'ldl_12m', 'ldl'),
    ('\n============= HDL ===================', 'hdl_12m', 'hdl'),
    ('\n============= BMI ===================', 'bmi_12m', 'bmi'),
]
for section_header, response_col, baseline_col in outcome_reports:
    print(section_header)
    calculate_percentage_change(
        concordant_dpp, discordant_dpp_sglt,
        concordant_sglt, discordant_sglt_dpp,
        response_variable=response_col, baseline_val=baseline_col,
    )


DPP samples  38 49
SGLT samples  63 52


Category    Real value    Predicted value    Count    Percentage of Predicted cases
----------  ------------  -----------------  -------  -------------------------------
Concordant  SGLT          SGLT               32       50.79%
Discordant  DPP           SGLT               31       52.63%

Concordant  DPP           DPP                18       47.37%
Discordant  SGLT          DPP                20       49.21%



Category    Real value    Predicted value    Mean Change from Baseline    std    treatment difference
----------  ------------  -----------------  ---------------------------  -----  ----------------------
Concordant  SGLT          SGLT               -11.59                       17.04  -8.72
Discordant  DPP           SGLT               -2.87                        11.45

Concordant  DPP           DPP                -6.22                        16.90  -0.22
Discordant  SGLT          DPP                -6.00                        10.48


In [10]:
# Observed vs predicted change with respect to baseline for the
# regression-weighted assignment, one call per outcome.
# Each tuple: (12-month column, baseline column, predicted-change column, label).
change_specs = [
    ('hba1c_12m', 'hba1c_bl_6m', 'predicted_change_hba1c', 'hba1c'),
    ('ldl_12m', 'ldl', 'predicted_change_ldl', 'ldl'),
    ('hdl_12m', 'hdl', 'predicted_change_hdl', 'hdl'),
    ('bmi_12m', 'bmi', 'predicted_change_bmi', 'bmi'),
]
for change_args in change_specs:
    calculate_change_diff(concordant_dpp, discordant_dpp_sglt, concordant_sglt, discordant_sglt_dpp,
                          *change_args)

The number of patients who showed improvement over 12-month with  hba1c  change (observed vs predicted) 22 : 28
The number of patients who showed improvement over 12-month with  ldl  change (observed vs predicted) 26 : 24
The number of patients who showed improvement over 12-month with  hdl  change (observed vs predicted) 26 : 24
The number of patients who showed improvement over 12-month with  bmi  change (observed vs predicted) 22 : 28
