In [1]:
# This notebook combines the four per-outcome drug predictions (HbA1c, LDL, HDL, BMI) by majority vote.
# If the votes are tied, the sample is assigned according to the HbA1c prediction.

# Majority

In [48]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import sys
import os

from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from helper import calculate_accuracy, ensemble_based_on_majority, find_optimal_threshold,\
    preprocess, cross_val, get_concordant_discordant, print_change_mean,\
    percentage_change_original_data, calculate_percentage_change, calculate_change_diff


In [49]:
# Load the per-sample drug predictions; the first CSV column becomes the row index.
df = pd.read_csv(
    '../../resources/output/pred_drug_classes.csv',
    sep=',',
    decimal='.',
    encoding='utf-8',
    engine='python',
    index_col=0,
)

# Restrict the frame to the columns the cells below actually read.
keep_columns = [
    # per-outcome drug assignments
    'assigned_drug_hba1c', 'assigned_drug_ldl', 'assigned_drug_hdl', 'assigned_drug_bmi',
    # column the predictions are scored against
    'drug_class',
    # response variables passed to the change calculations
    'hba1c_12m', 'ldl_12m', 'hdl_12m', 'bmi_12m',
    # baseline values passed to the change calculations
    'hba1c_bl_6m', 'ldl', 'hdl', 'bmi',
    # per-outcome model predictions
    'predicted_change_hba1c', 'predicted_change_ldl', 'predicted_change_hdl', 'predicted_change_bmi',
]
df_drugs = df[keep_columns]


In [50]:
# Combine the four per-outcome assignments into a single 'ensemble_drug' column.
# 'assigned_drug_hba1c' is handed to the helper as its second argument —
# presumably the tie-break column; confirm against helper.ensemble_based_on_majority.
vote_columns = ['assigned_drug_hba1c', 'assigned_drug_ldl', 'assigned_drug_hdl',
                'assigned_drug_bmi']

# Work on a copy so df_drugs stays untouched for the later cells.
df_ = df_drugs.copy()
ensemble_votes = ensemble_based_on_majority(
    df_drugs[vote_columns],
    'assigned_drug_hba1c',
    'ensemble_drug',
)
df_.loc[:, 'ensemble_drug'] = ensemble_votes
df_

Unnamed: 0,assigned_drug_hba1c,assigned_drug_ldl,assigned_drug_hdl,assigned_drug_bmi,drug_class,hba1c_12m,ldl_12m,hdl_12m,bmi_12m,hba1c_bl_6m,ldl,hdl,bmi,predicted_change_hba1c,predicted_change_ldl,predicted_change_hdl,predicted_change_bmi,ensemble_drug
0,0.0,1.0,0.0,1.0,0.0,58.0,2.3,1.39,30.430000,63.0,2.6,1.46,30.830000,55.456286,2.430687,1.460468,29.687271,0.0
1,1.0,0.0,1.0,0.0,0.0,78.0,1.4,0.52,32.529999,75.0,3.8,0.72,31.540000,67.246703,3.050544,0.832553,31.436039,1.0
2,1.0,1.0,0.0,1.0,0.0,103.0,2.0,0.77,35.110001,79.0,2.0,0.67,37.760000,73.985764,2.022018,0.814987,37.919748,1.0
3,1.0,1.0,0.0,1.0,1.0,51.0,3.2,0.94,26.770000,67.0,2.6,0.84,27.820000,55.529919,2.636080,0.947397,28.623580,1.0
4,1.0,1.0,0.0,1.0,0.0,61.0,3.0,1.47,29.760000,55.0,3.1,1.33,30.990000,60.048111,2.385945,1.231697,29.971377,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0.0,0.0,0.0,1.0,1.0,61.0,1.7,0.82,33.200001,55.0,1.7,0.79,33.203125,50.442575,1.862125,0.903474,32.563001,0.0
97,0.0,0.0,0.0,1.0,1.0,63.0,3.4,2.07,22.788994,60.0,3.0,2.16,23.350000,61.088641,2.922136,2.030098,21.830292,0.0
98,1.0,0.0,0.0,1.0,1.0,77.0,1.5,1.00,31.246056,92.0,1.3,0.89,31.730000,64.061304,1.719694,1.005502,30.828090,1.0
99,1.0,1.0,0.0,0.0,1.0,56.0,1.3,0.87,28.212244,59.0,2.3,1.03,29.289796,50.757059,1.657861,0.999599,29.229344,1.0


In [51]:
# Score the majority-vote column against 'drug_class' (passed as y_true).
# NOTE: precision and recall use scikit-learn's default averaging, whereas F1 is
# requested with average='weighted', so the three figures are not directly comparable.
observed_class = df_['drug_class']
voted_class = df_['ensemble_drug']

precision = precision_score(observed_class, voted_class)
recall = recall_score(observed_class, voted_class)

print('\n ======= Majority vote model results =======')
accuracy = calculate_accuracy(df_, 'drug_class', 'ensemble_drug')
print(f"Accuracy: {accuracy:.2f}")
weighted_f1 = f1_score(observed_class, voted_class, average='weighted')
print(f"F1 score: {weighted_f1}")
print("Precision:", precision)
print("Recall:", recall)

# Rows of the matrix follow 'drug_class', columns follow 'ensemble_drug'.
cm = confusion_matrix(observed_class, voted_class)
print("Confusion Matrix:")
print(cm)



Accuracy: 0.47
F1 score: 0.45250881020305417
Precision: 0.48484848484848486
Recall: 0.6153846153846154
Confusion Matrix:
[[15 34]
 [20 32]]


In [52]:
# Codes used to pick the SGLT and DPP rows out of the drug columns.
sglt_val = 1
dpp_val = 0

# Strata by the majority-vote assignment ...
dpp_strata = df_.loc[df_['ensemble_drug'] == dpp_val]
sglt_strata = df_.loc[df_['ensemble_drug'] == sglt_val]

# ... and by the 'drug_class' column.
dpp_strata_actual = df_.loc[df_['drug_class'] == dpp_val]
sglt_strata_actual = df_.loc[df_['drug_class'] == sglt_val]

# The helper hands back four groups, unpacked in the order it returns them.
strata_groups = get_concordant_discordant(
    dpp_strata, sglt_strata, df_,
    dpp_strata_actual, sglt_strata_actual,
    variable_name='ensemble_drug',
)
concordant_dpp, discordant_dpp_sglt, concordant_sglt, discordant_sglt_dpp = strata_groups

# One report per outcome: (banner, response column, baseline column).
# print_change_mean and percentage_change_original_data (imported from helper)
# were previously tried here for HbA1c and are intentionally not called.
outcome_reports = [
    ('\n============= HBA1C ===================', 'hba1c_12m', 'hba1c_bl_6m'),
    ('\n============= LDL ===================', 'ldl_12m', 'ldl'),
    ('\n============= HDL ===================', 'hdl_12m', 'hdl'),
    ('\n============= BMI ===================', 'bmi_12m', 'bmi'),
]
for banner, response_col, baseline_col in outcome_reports:
    print(banner)
    calculate_percentage_change(
        concordant_dpp, discordant_dpp_sglt,
        concordant_sglt, discordant_sglt_dpp,
        response_variable=response_col, baseline_val=baseline_col,
    )


DPP samples  35 49
SGLT samples  66 52


Category    Real value    Predicted value    Count    Percentage of Predicted cases
----------  ------------  -----------------  -------  -------------------------------
Concordant  SGLT          SGLT               32       48.48%
Discordant  DPP           SGLT               34       57.14%

Concordant  DPP           DPP                15       42.86%
Discordant  SGLT          DPP                20       51.52%



Category    Real value    Predicted value    Mean Change from Baseline    std    treatment difference
----------  ------------  -----------------  ---------------------------  -----  ----------------------
Concordant  SGLT          SGLT               -13.38                       17.40  -11.55
Discordant  DPP           SGLT               -1.82                        10.86

Concordant  DPP           DPP                -9.27                        17.82  -6.12
Discordant  SGLT          DPP                -3.15                        6.49


In [53]:
# Observed vs predicted change relative to baseline, one call per outcome.
# Author's note carried over: calculated for concordant only - double check.
# Each entry: (response column, baseline column, predicted column, label shown in the output).
change_specs = [
    ('hba1c_12m', 'hba1c_bl_6m', 'predicted_change_hba1c', 'hba1c'),
    ('ldl_12m', 'ldl', 'predicted_change_ldl', 'ldl'),
    ('hdl_12m', 'hdl', 'predicted_change_hdl', 'hdl'),
    ('bmi_12m', 'bmi', 'predicted_change_bmi', 'bmi'),
]
for response_col, baseline_col, predicted_col, outcome_label in change_specs:
    calculate_change_diff(
        concordant_dpp, discordant_dpp_sglt, concordant_sglt, discordant_sglt_dpp,
        response_col, baseline_col, predicted_col, outcome_label,
    )

The number of patients who showed improvement over 12-month with  hba1c  change (observed vs predicted) 20 : 27
The number of patients who showed improvement over 12-month with  ldl  change (observed vs predicted) 22 : 25
The number of patients who showed improvement over 12-month with  hdl  change (observed vs predicted) 23 : 24
The number of patients who showed improvement over 12-month with  bmi  change (observed vs predicted) 17 : 30


# Based on regression model weights


In [8]:

# Feature importances (columns 'Feature' and 'Importance'), presumably exported by
# the regression model; they weight each per-outcome drug assignment below.
# NOTE(review): this path starts at '../resources' while the predictions CSV is read
# from '../../resources' — confirm which location is correct for this notebook.
regression_model_fi = pd.read_csv('../resources/output/feature_importance_dataframe.csv', sep = ',',decimal = '.', encoding = 'utf-8', engine ='python')

df_reg_weights = df_drugs.copy()

# Feature name -> importance lookup, built column-wise rather than with a
# row-by-row iterrows() loop (same mapping; a repeated feature keeps its last value).
variables = dict(zip(regression_model_fi['Feature'], regression_model_fi['Importance']))

# Fail early, naming the absent features, instead of a bare KeyError on the first lookup.
required_features = ['hba1c_bl_6m', 'ldl', 'hdl', 'bmi']
missing_features = [name for name in required_features if name not in variables]
if missing_features:
    raise KeyError(f"feature importance file has no entry for: {missing_features}")

# The importance of each baseline feature is the weight of the matching assignment.
hba1c_cost = variables['hba1c_bl_6m']
ldl_cost = variables['ldl']
hdl_cost = variables['hdl']
bmi_cost = variables['bmi']

# Per-sample score: importance-weighted sum of the four per-outcome assignments.
weighted_sum = (
    df_reg_weights['assigned_drug_hba1c'] * hba1c_cost +
    df_reg_weights['assigned_drug_ldl'] * ldl_cost +
    df_reg_weights['assigned_drug_hdl'] * hdl_cost +
    df_reg_weights['assigned_drug_bmi'] * bmi_cost
)

actual_values = df_reg_weights['drug_class']

# The cut-off is simply the mean score. Alternatives that were tried and left out:
# weighted_sum.median() and find_optimal_threshold(actual_values, weighted_sum).
optimal_threshold = weighted_sum.mean()

# Scores at or above the cut-off become 1, everything else 0.
df_reg_weights['weighted_binary'] = (weighted_sum >= optimal_threshold).astype(int)

# NOTE: precision and recall use scikit-learn's default averaging, whereas F1 below
# is requested with average='weighted'.
precision = precision_score(df_reg_weights['drug_class'], df_reg_weights['weighted_binary'])
recall = recall_score(df_reg_weights['drug_class'], df_reg_weights['weighted_binary'])

print('\n ======= Model based on regression model weights =======')

print(f"Optimal Threshold: {optimal_threshold}")
print(f"Accuracy: {calculate_accuracy(df_reg_weights, 'drug_class', 'weighted_binary'):.2f}")
print(f"F1 score: {f1_score(df_reg_weights['drug_class'], df_reg_weights['weighted_binary'], average='weighted')}")
print("Precision:", precision)
print("Recall:", recall)

# Rows of the matrix follow 'drug_class', columns follow 'weighted_binary'.
cm = confusion_matrix(df_reg_weights['drug_class'], df_reg_weights['weighted_binary'])
print("Confusion Matrix:")
print(cm)



Optimal Threshold: 1.628104318926101
Accuracy: 0.43
F1 score: 0.4196141353265762
Precision: 0.425
Recall: 0.3269230769230769
Confusion Matrix:
[[26 23]
 [35 17]]


In [9]:
# Codes used to pick the SGLT and DPP rows out of the drug columns.
sglt_val = 1
dpp_val = 0

# Strata by the importance-weighted assignment ...
dpp_strata = df_reg_weights.loc[df_reg_weights['weighted_binary'] == dpp_val]
sglt_strata = df_reg_weights.loc[df_reg_weights['weighted_binary'] == sglt_val]

# ... and by the 'drug_class' column.
dpp_strata_actual = df_reg_weights.loc[df_reg_weights['drug_class'] == dpp_val]
sglt_strata_actual = df_reg_weights.loc[df_reg_weights['drug_class'] == sglt_val]

# The helper hands back four groups, unpacked in the order it returns them.
strata_groups = get_concordant_discordant(
    dpp_strata, sglt_strata, df_reg_weights,
    dpp_strata_actual, sglt_strata_actual,
    variable_name='weighted_binary',
)
concordant_dpp, discordant_dpp_sglt, concordant_sglt, discordant_sglt_dpp = strata_groups

# One report per outcome: (banner, response column, baseline column).
# print_change_mean and percentage_change_original_data (imported from helper)
# were previously tried here for HbA1c and are intentionally not called.
outcome_reports = [
    ('\n============= HBA1C ===================', 'hba1c_12m', 'hba1c_bl_6m'),
    ('\n============= LDL ===================', 'ldl_12m', 'ldl'),
    ('\n============= HDL ===================', 'hdl_12m', 'hdl'),
    ('\n============= BMI ===================', 'bmi_12m', 'bmi'),
]
for banner, response_col, baseline_col in outcome_reports:
    print(banner)
    calculate_percentage_change(
        concordant_dpp, discordant_dpp_sglt,
        concordant_sglt, discordant_sglt_dpp,
        response_variable=response_col, baseline_val=baseline_col,
    )


DPP samples  61 49
SGLT samples  40 52


Category    Real value    Predicted value    Count    Percentage of Predicted cases
----------  ------------  -----------------  -------  -------------------------------
Concordant  SGLT          SGLT               17       42.50%
Discordant  DPP           SGLT               23       57.38%

Concordant  DPP           DPP                26       42.62%
Discordant  SGLT          DPP                35       57.50%



Category    Real value    Predicted value    Mean Change from Baseline    std    treatment difference
----------  ------------  -----------------  ---------------------------  -----  ----------------------
Concordant  SGLT          SGLT               -12.82                       18.74  -10.26
Discordant  DPP           SGLT               -2.57                        12.31

Concordant  DPP           DPP                -5.46                        14.81  2.34
Discordant  SGLT          DPP                -7.80                        12.80


In [10]:
# Observed vs predicted change relative to baseline for the weighted model's
# groups, one call per outcome.
# Each entry: (response column, baseline column, predicted column, label shown in the output).
change_specs = [
    ('hba1c_12m', 'hba1c_bl_6m', 'predicted_change_hba1c', 'hba1c'),
    ('ldl_12m', 'ldl', 'predicted_change_ldl', 'ldl'),
    ('hdl_12m', 'hdl', 'predicted_change_hdl', 'hdl'),
    ('bmi_12m', 'bmi', 'predicted_change_bmi', 'bmi'),
]
for response_col, baseline_col, predicted_col, outcome_label in change_specs:
    calculate_change_diff(
        concordant_dpp, discordant_dpp_sglt, concordant_sglt, discordant_sglt_dpp,
        response_col, baseline_col, predicted_col, outcome_label,
    )

The number of patients who showed improvement over 12-month with  hba1c  change (observed vs predicted) 20 : 23
The number of patients who showed improvement over 12-month with  ldl  change (observed vs predicted) 23 : 20
The number of patients who showed improvement over 12-month with  hdl  change (observed vs predicted) 21 : 22
The number of patients who showed improvement over 12-month with  bmi  change (observed vs predicted) 19 : 24
