In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import seaborn as sns

from sklearn.preprocessing import scale, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier

import joblib


In [None]:
demoDF = pd.read_csv('./data/demographics.csv')
quesDF = pd.read_csv('./data/questionnaire.csv')
examDF = pd.read_csv('./data/examination.csv')
medsDF = pd.read_csv('./data/medications.csv')
labsDF= pd.read_csv('./data/labs.csv')
dietDF= pd.read_csv('./data/diet.csv')

medsDF = medsDF.groupby('SEQN').agg({'RXDUSE': 'sum', 'RXDDAYS': 'max'}).reset_index()

In [None]:
fullDF = demoDF.merge(quesDF, on='SEQN', how='inner')
fullDF = fullDF.merge(medsDF, on='SEQN', how='inner')
fullDF = fullDF.merge(labsDF, on='SEQN', how='left')
fullDF = fullDF.merge(examDF, on='SEQN', how='left')
fullDF = fullDF.merge(dietDF, on='SEQN', how='left')

In [None]:
#seperate dataset into different races

nonwhiteDF= fullDF[fullDF['RIDRETH3'].isin([2,1,3,6,7])]
whiteDF= fullDF[fullDF['RIDRETH3'].isin([3])]
hispanicDF= fullDF[fullDF['RIDRETH3'].isin([1,2])]
blackDF= fullDF[fullDF['RIDRETH3'].isin([4])]
asianDF= fullDF[fullDF['RIDRETH3'].isin([6])]
multiraceDF = fullDF[fullDF['RIDRETH3'].isin([7])]

In [None]:
#listing how many patients are there based on races

print(len(nonwhiteDF))
print(len(whiteDF))
print(len(hispanicDF))
print(len(blackDF))
print(len(asianDF))
print(len(multiraceDF))

In [None]:
#selecting columns that we would like to further study on

deprCode = [
    'SEQN', 'RIDRETH3', 'RIAGENDR', 'RIDAGEYR', 'DMDBORN4', 'DMDCITZN',
    'DMDMARTL', 'DMDEDUC2', 'DMDHHSIZ', 'DMDFMSIZ', 'DBD895', 'DBD905',
    'DBD910', 'CBD120', 'CBD130', 'DIQ010', 'DID040', 'FSDHH', 'FSD032C',
    'PFQ061B', 'PFQ061D', 'PFQ061E', 'PAQ610', 'PAQ625', 'PAQ640', 'PAQ655',
    'PAQ670', 'PAQ710', 'PAQ715', 'RXDUSE', 'RXDDAYS', 'SLD010H', 'SLQ050',
    'SLQ060', 'SMD641', 'SMD650', 'SMD030', 'SMQ858', 'SMQ872', 'SMQ878',
    'BMXBMI', 'WHQ070', 'WHD080A', 'WHD080B', 'WHD080D', 'WHD080M', 'MCQ025',
    'MCQ160A', 'MCQ180A', 'MCQ160N', 'MCQ180N', 'MCQ160C', 'MCQ180C', 'MCQ160D',
    'MCQ180D', 'MCQ160E', 'MCQ180E', 'MCQ160F', 'MCQ180F', 'MCQ220', 'BPXSY1',
    'BPXDI1', 'BPQ080', 'LBXTR', 'LBDLDL', 'LBXTC', 'INDHHIN2', 'INDFMIN2',
    'BPQ020', 'BMXWT', 'BMXHT', 'DIQ050', 'DIQ175A', 'BMXWAIST', 'DBD900',
    'CBQ505', 'CBQ535', 'DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ050',
    'DPQ060', 'DPQ070', 'DPQ080', 'DPQ090', 'DPQ100', 'ALQ120Q', 'URXUMS',
    'URDACT', 'LBXSTR', 'LBXSUA', 'LBXSCH', 'LBDSGLSI', 'DR1TKCAL', 'DR1TCARB',
    'DR1TSUGR'
]


In [None]:
#apply all the selected columns to all different races

fullDF = fullDF[deprCode]
nonwhiteDF= nonwhiteDF[deprCode]
whiteDF= whiteDF[deprCode]
hispanicDF= hispanicDF[deprCode]
blackDF= blackDF[deprCode]
asianDF= asianDF[deprCode]
multiraceDF = multiraceDF[deprCode]

In [None]:
#Dealing with missing values

def replace_na_mental(df, col):
    for c in col:
        # Replace 7 and 9 with NaN in this column
        df[c].replace({7: np.nan, 9: np.nan}, inplace=True)
    return df

def replace_na(df, col):
    df = df.copy()  
    for c in col:
        df.loc[:, c] = df[c].fillna(value=0).replace({10: 1, 11: 1, 13: 1, 34: 1})
    return df

def replace_na_7_9(df, col):
  for c in col:
    df[c]= df[c].fillna(value=0).replace({7: 0, 9: 0})
  return df

def replace_na_1_7_9(df, col):
  for c in col:
    df[c]= df[c].fillna(value=1).replace({7: 1, 9: 1})
  return df

def replace_na_2_7_9(df, col):
  for c in col:
    df[c]= df[c].fillna(value=0).replace({7: 0, 9: 0, 2:0})
  return df

def replace_na_2_3_7_9(df, col):
  for c in col:
    df[c]= df[c].fillna(value=0).replace({7: 0, 9: 0, 2:0, 1:2, 3:1})
  return df


def replace_na_77_99(df, col):
  for c in col:
    df[c]= df[c].fillna(value=0).replace({77: 0, 99: 0})
  return df


def replace_none_77_99(df, col):
    for c in col:
        df[c] = df[c].replace({77: None, 99: None})
    return df

def replace_na_77_99_8(df, col):
  for c in col:
    df[c]= df[c].fillna(value=0).replace({77: 0, 8:0, 99: 0})
  return df

def replace_na_777_999(df, col):
  for c in col:
    df[c]= df[c].fillna(value=0).replace({777: 0, 999: 0})
  return df

def replace_na_7777_9999(df, col):
  for c in col:
    df[c]= df[c].fillna(value=0).replace({7777: 0, 9999: 0})
  return df

def replace_na_77777_99999(df, col):
  for c in col:
    df[c]= df[c].fillna(value=0).replace({77777: 0, 99999: 0})
  return df

def replace_na_10_77_99(df, col):
  for c in col:
    df[c]= df[c].fillna(value=0).replace({77: 0, 99: 0, 10:1})
  return df

def replace_BPXSY1(df, col):
    for c in col:
     df[c]= df[c].fillna(value=115)
    return df

def replace_BPXDI1(df, col):
    for c in col:
     df[c]= df[c].fillna(value=75)
    return df

def replace_LBXTR(df, col):
    for c in col:
     df[c]= df[c].fillna(value=145)
    return df

def replace_LBDLDL(df, col):
    for c in col:
     df[c]= df[c].fillna(value=130)
    return df

def replace_LBXTC(df, col):
    for c in col:
     df[c]= df[c].fillna(value=200)
    return df

def replace_mean_age(df, col):
  for c in col:
    df.groupby('RIDAGEYR')[col].apply(lambda x: x.fillna(x.mean()))
    return df

def replace_mean_bmi(df, col):
  for c in col:
    df.groupby('newbmi')[col].apply(lambda x: x.fillna(x.mean()))
    return df

def calculate_bmi(row):
    height_meters = row['BMXHT'] / 100  
    bmi = row['BMXWT'] / (height_meters ** 2)  # Calculate BMI using the formula
    return bmi


def categorize_bmi(bmi):
    if bmi < 25:
        return 0
    elif 25 <= bmi < 30:
        return 1
    elif bmi >= 30:
        return 2

def categorize_abdominal_obesity(row):
    if row['RIAGENDR'] == 1 and row['BMXWAIST'] >= 102:
        return 1
    elif row['RIAGENDR'] == 2 and row['BMXWAIST'] >= 88:
        return 1
    else:
        return 0
##########################################

column_na = ['WHD080B' ,'WHD080D', 'WHD080M', 'WHD080A']
fullDF = replace_na(fullDF, column_na)
nonwhiteDF = replace_na(nonwhiteDF, column_na)
whiteDF = replace_na(whiteDF, column_na)
hispanicDF = replace_na(hispanicDF, column_na)
blackDF = replace_na(blackDF, column_na)
asianDF = replace_na(asianDF, column_na)
multiraceDF = replace_na(multiraceDF, column_na)

column_na_7_9 = ['DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ050', 'DPQ060','DPQ070','DPQ080', 'DPQ090', 'DPQ100']
fullDF = replace_na_7_9(fullDF, column_na_7_9)
nonwhiteDF = replace_na_7_9(nonwhiteDF, column_na_7_9)
whiteDF = replace_na_7_9(whiteDF, column_na_7_9)
hispanicDF = replace_na_7_9(hispanicDF, column_na_7_9)
blackDF = replace_na_7_9(blackDF, column_na_7_9)
asianDF = replace_na_7_9(asianDF, column_na_7_9)
multiraceDF = replace_na_7_9(multiraceDF, column_na_7_9)

column_na_1_7_9 = ['PFQ061B' ,'PFQ061D', 'PFQ061E']
fullDF = replace_na_1_7_9(fullDF, column_na_1_7_9)
nonwhiteDF = replace_na_1_7_9(nonwhiteDF, column_na_1_7_9)
whiteDF = replace_na_1_7_9(whiteDF, column_na_1_7_9)
hispanicDF = replace_na_1_7_9(hispanicDF, column_na_1_7_9)
blackDF = replace_na_1_7_9(blackDF, column_na_1_7_9)
asianDF = replace_na_1_7_9(asianDF, column_na_1_7_9)
multiraceDF = replace_na_1_7_9(multiraceDF, column_na_1_7_9)

column_na_2_3_7_9 = ['DIQ010']
fullDF = replace_na_2_3_7_9(fullDF, column_na_2_3_7_9)
nonwhiteDF = replace_na_2_3_7_9(nonwhiteDF, column_na_2_3_7_9)
whiteDF = replace_na_2_3_7_9(whiteDF, column_na_2_3_7_9)
hispanicDF = replace_na_2_3_7_9(hispanicDF, column_na_2_3_7_9)
blackDF = replace_na_2_3_7_9(blackDF, column_na_2_3_7_9)
asianDF = replace_na_2_3_7_9(asianDF, column_na_2_3_7_9)
multiraceDF = replace_na_2_3_7_9(multiraceDF, column_na_2_3_7_9)

column_na_2_7_9 = ['SMQ858', 'SMQ872', 'SMQ878', 'BPQ020', \
   'BPQ080',  'RXDUSE', 'SLQ050',\
  'SLQ060', 'MCQ160A', 'MCQ160N', 'MCQ160C', 'MCQ160D', 'MCQ160E',\
   'MCQ160F', 'MCQ220','WHQ070', 'DIQ050','CBQ505', 'CBQ535' ]
fullDF = replace_na_2_7_9(fullDF, column_na_2_7_9)
nonwhiteDF = replace_na_2_7_9(nonwhiteDF, column_na_2_7_9)
whiteDF = replace_na_2_7_9(whiteDF, column_na_2_7_9)
hispanicDF = replace_na_2_7_9(hispanicDF, column_na_2_7_9)
blackDF = replace_na_2_7_9(blackDF, column_na_2_7_9)
asianDF = replace_na_2_7_9(asianDF, column_na_2_7_9)
multiraceDF = replace_na_2_7_9(multiraceDF, column_na_2_7_9)

column_na_77_99 = ['WHD080A','SMD641', 'PAQ610', 'PAQ625',  'PAQ640', 'PAQ655' ,'PAQ670'  ]
fullDF = replace_na_77_99(fullDF, column_na_77_99)
nonwhiteDF = replace_na_77_99(nonwhiteDF, column_na_77_99)
whiteDF = replace_na_77_99(whiteDF, column_na_77_99)
hispanicDF = replace_na_77_99(hispanicDF, column_na_77_99)
blackDF = replace_na_77_99(blackDF, column_na_77_99)
asianDF = replace_na_77_99(asianDF, column_na_77_99)
multiraceDF = replace_na_77_99(multiraceDF, column_na_77_99)

column_none_77_99 = ['INDHHIN2']
fullDF = replace_none_77_99(fullDF, column_none_77_99)
nonwhiteDF = replace_none_77_99(nonwhiteDF, column_none_77_99)
whiteDF = replace_none_77_99(whiteDF, column_none_77_99)
hispanicDF = replace_none_77_99(hispanicDF, column_none_77_99)
blackDF = replace_none_77_99(blackDF, column_none_77_99)
asianDF = replace_none_77_99(asianDF, column_none_77_99)
multiraceDF = replace_none_77_99(multiraceDF, column_none_77_99)

column_na_10_77_99 = [ 'DIQ175A' ]
fullDF = replace_na_10_77_99(fullDF, column_na_10_77_99)
nonwhiteDF = replace_na_10_77_99(nonwhiteDF, column_na_10_77_99)
whiteDF = replace_na_10_77_99(whiteDF, column_na_10_77_99)
hispanicDF = replace_na_10_77_99(hispanicDF, column_na_10_77_99)
blackDF = replace_na_10_77_99(blackDF, column_na_10_77_99)
asianDF = replace_na_10_77_99(asianDF, column_na_10_77_99)
multiraceDF = replace_na_10_77_99(multiraceDF, column_na_10_77_99)

column_na_77_99_8 = ['PAQ710' ,'PAQ715']
fullDF = replace_na_77_99_8(fullDF, column_na_77_99_8)
nonwhiteDF = replace_na_77_99_8(nonwhiteDF, column_na_77_99_8 )
whiteDF = replace_na_77_99_8(whiteDF, column_na_77_99_8 )
hispanicDF = replace_na_77_99_8(hispanicDF, column_na_77_99_8 )
blackDF = replace_na_77_99_8(blackDF, column_na_77_99_8 )
asianDF = replace_na_77_99_8(asianDF, column_na_77_99_8 )
multiraceDF = replace_na_77_99_8(multiraceDF, column_na_77_99_8 )

column_na_777_999 = ['DID040' ,'SMD650', 'SMD030',  'ALQ120Q']
fullDF = replace_na_777_999(fullDF, column_na_777_999)
nonwhiteDF = replace_na_777_999(nonwhiteDF, column_na_777_999 )
whiteDF = replace_na_777_999(whiteDF, column_na_777_999 )
hispanicDF = replace_na_777_999(hispanicDF, column_na_777_999 )
blackDF = replace_na_777_999(blackDF, column_na_777_999 )
asianDF = replace_na_777_999(asianDF, column_na_777_999 )
multiraceDF = replace_na_777_999(multiraceDF, column_na_777_999 )

column_na_7777_9999 = [  'RXDDAYS', 'MCQ180A', 'MCQ180N',\
                       'MCQ180C', 'MCQ180D', 'MCQ180E', 'MCQ180F', 'MCQ025', \
                       'DBD895', 'DBD900', 'DBD905', 'DBD910','DBD900','DBD905','DBD910'  ]
fullDF = replace_na_7777_9999(fullDF, column_na_7777_9999)
nonwhiteDF = replace_na_7777_9999(nonwhiteDF, column_na_7777_9999 )
whiteDF = replace_na_7777_9999(whiteDF, column_na_7777_9999 )
hispanicDF = replace_na_7777_9999(hispanicDF, column_na_7777_9999 )
blackDF = replace_na_7777_9999(blackDF, column_na_7777_9999 )
asianDF = replace_na_7777_9999(asianDF, column_na_7777_9999 )
multiraceDF = replace_na_7777_9999(multiraceDF, column_na_7777_9999 )

column_na_77777_99999 = ['CBD120', 'CBD130']
fullDF = replace_na_77777_99999(fullDF, column_na_77777_99999)
nonwhiteDF = replace_na_77777_99999(nonwhiteDF, column_na_77777_99999 )
whiteDF = replace_na_77777_99999(whiteDF, column_na_77777_99999 )
hispanicDF = replace_na_77777_99999(hispanicDF, column_na_77777_99999 )
blackDF = replace_na_77777_99999(blackDF, column_na_77777_99999 )
asianDF = replace_na_77777_99999(asianDF, column_na_77777_99999 )
multiraceDF = replace_na_77777_99999(multiraceDF, column_na_77777_99999 )

column_mean_age = [ 'BMXBMI', 'SLD010H', 'INDHHIN2', 'INDFMIN2']
fullDF = replace_mean_age(fullDF, column_mean_age)
nonwhiteDF = replace_mean_age(nonwhiteDF, column_mean_age )
whiteDF = replace_mean_age(whiteDF, column_mean_age )
hispanicDF = replace_mean_age(hispanicDF, column_mean_age )
blackDF = replace_mean_age(blackDF, column_mean_age )
asianDF = replace_mean_age(asianDF, column_mean_age )
multiraceDF = replace_mean_age(multiraceDF, column_mean_age )


column_BPXSY1 = ['BPXSY1']
fullDF = replace_BPXSY1(fullDF, column_BPXSY1)
nonwhiteDF = replace_BPXSY1(nonwhiteDF, column_BPXSY1 )
whiteDF = replace_BPXSY1(whiteDF, column_BPXSY1 )
hispanicDF = replace_BPXSY1(hispanicDF, column_BPXSY1 )
blackDF = replace_BPXSY1(blackDF, column_BPXSY1 )
asianDF = replace_BPXSY1(asianDF, column_BPXSY1 )
multiraceDF = replace_BPXSY1(multiraceDF, column_BPXSY1 )



column_BPXDI1 = ['BPXDI1']
fullDF = replace_BPXDI1(fullDF, column_BPXDI1)
nonwhiteDF = replace_BPXDI1(nonwhiteDF, column_BPXDI1 )
whiteDF = replace_BPXDI1(whiteDF, column_BPXDI1 )
hispanicDF = replace_BPXDI1(hispanicDF, column_BPXDI1 )
blackDF = replace_BPXDI1(blackDF, column_BPXDI1 )
asianDF = replace_BPXDI1(asianDF, column_BPXDI1 )
multiraceDF = replace_BPXDI1(multiraceDF, column_BPXDI1 )

column_LBXTR = ['LBXTR']
fullDF = replace_LBXTR(fullDF, column_LBXTR)
nonwhiteDF = replace_LBXTR(nonwhiteDF, column_LBXTR )
whiteDF = replace_LBXTR(whiteDF, column_LBXTR )
hispanicDF = replace_LBXTR(hispanicDF, column_LBXTR )
blackDF = replace_LBXTR(blackDF, column_LBXTR )
asianDF = replace_LBXTR(asianDF, column_LBXTR )
multiraceDF = replace_LBXTR (multiraceDF, column_LBXTR )

column_LBDLDL = ['LBDLDL']
fullDF = replace_LBDLDL(fullDF, column_LBDLDL)
nonwhiteDF = replace_LBDLDL(nonwhiteDF, column_LBDLDL )
whiteDF = replace_LBDLDL(whiteDF, column_LBDLDL )
hispanicDF = replace_LBDLDL(hispanicDF, column_LBDLDL )
blackDF = replace_LBDLDL(blackDF, column_LBDLDL )
asianDF = replace_LBDLDL(asianDF, column_LBDLDL )
multiraceDF = replace_LBDLDL(multiraceDF, column_LBDLDL )

column_LBXTC = ['LBXTC']
fullDF = replace_LBXTC(fullDF, column_LBXTC)
nonwhiteDF = replace_LBXTC(nonwhiteDF, column_LBXTC )
whiteDF = replace_LBXTC(whiteDF, column_LBXTC )
hispanicDF = replace_LBXTC(hispanicDF, column_LBXTC )
blackDF = replace_LBXTC(blackDF, column_LBXTC )
asianDF = replace_LBXTC(asianDF, column_LBXTC )
multiraceDF = replace_LBXTC(multiraceDF, column_LBXTC )

fullDF['newbmi'] = fullDF.apply(calculate_bmi, axis=1)
nonwhiteDF['newbmi'] = nonwhiteDF.apply(calculate_bmi, axis=1)
whiteDF['newbmi'] = whiteDF.apply(calculate_bmi, axis=1)
hispanicDF['newbmi'] = hispanicDF.apply(calculate_bmi, axis=1)
blackDF['newbmi'] = blackDF.apply(calculate_bmi, axis=1)
asianDF['newbmi'] = asianDF.apply(calculate_bmi, axis=1)
multiraceDF['newbmi'] = multiraceDF.apply(calculate_bmi, axis=1)


fullDF['abdominal_obesity'] = fullDF.apply(categorize_abdominal_obesity, axis=1)
nonwhiteDF['abdominal_obesity'] = nonwhiteDF.apply(categorize_abdominal_obesity, axis=1)
whiteDF['abdominal_obesity'] = whiteDF.apply(categorize_abdominal_obesity, axis=1)
hispanicDF['abdominal_obesity'] = hispanicDF.apply(categorize_abdominal_obesity, axis=1)
blackDF['abdominal_obesity'] = blackDF.apply(categorize_abdominal_obesity, axis=1)
asianDF['abdominal_obesity'] = asianDF.apply(categorize_abdominal_obesity, axis=1)
multiraceDF['abdominal_obesity'] = multiraceDF.apply(categorize_abdominal_obesity, axis=1)

fullDF['bmi_category'] = fullDF['newbmi'].apply(categorize_bmi)
nonwhiteDF['bmi_category'] = nonwhiteDF['newbmi'].apply(categorize_bmi)
whiteDF['bmi_category'] = whiteDF['newbmi'].apply(categorize_bmi)
hispanicDF['bmi_category'] = hispanicDF['newbmi'].apply(categorize_bmi)
blackDF['bmi_category'] = blackDF['newbmi'].apply(categorize_bmi)
asianDF['bmi_category'] = asianDF['newbmi'].apply(categorize_bmi)
multiraceDF['bmi_category'] = multiraceDF['newbmi'].apply(categorize_bmi)

column_mean_bmi = ['BMXWAIST']
fullDF = replace_mean_bmi(fullDF, column_mean_bmi)
nonwhiteDF = replace_mean_bmi(nonwhiteDF, column_mean_bmi )
whiteDF = replace_mean_bmi(whiteDF, column_mean_bmi )
hispanicDF = replace_mean_bmi(hispanicDF, column_mean_bmi )
blackDF = replace_mean_bmi(blackDF, column_mean_bmi )
asianDF = replace_mean_bmi(asianDF, column_mean_bmi )
multiraceDF = replace_mean_bmi(multiraceDF, column_mean_bmi )

mental_health=['DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ050', 'DPQ060','DPQ070','DPQ080', 'DPQ090', 'DPQ100']
fullDF['total_mental'] = fullDF[mental_health].sum(axis=1)
nonwhiteDF['total_mental'] = nonwhiteDF[mental_health].sum(axis=1)
whiteDF['total_mental'] = whiteDF[mental_health].sum(axis=1)
hispanicDF['total_mental'] = hispanicDF[mental_health].sum(axis=1)
blackDF['total_mental'] = blackDF[mental_health].sum(axis=1)
asianDF['total_mental'] = asianDF[mental_health].sum(axis=1)
multiraceDF['total_mental'] = multiraceDF[mental_health].sum(axis=1)


In [None]:
#removing all the null values
fullDF= fullDF.dropna(subset= deprCode)
whiteDF= whiteDF.dropna(subset= deprCode)
nonwhiteDF= nonwhiteDF.dropna(subset= deprCode)
hispanicDF= hispanicDF.dropna(subset= deprCode)
blackDF= blackDF.dropna(subset= deprCode)
asianDF= asianDF.dropna(subset= deprCode)
multiraceDF= multiraceDF.dropna(subset= deprCode)

In [None]:
corr_matrix = fullDF.corr()
corr_to_col = corr_matrix['BPQ020']
corr_sorted = corr_to_col.abs().sort_values(ascending=False)
corr_sorted = corr_sorted[corr_sorted >0.1]
n_top = 100  # you can set any number here
top_corr_cols = corr_sorted[1:n_top+1].index.tolist()

corr_matrix = fullDF.corr()
corr_to_col = corr_matrix['BPQ080']
corr_sorted = corr_to_col.abs().sort_values(ascending=False)
corr_sorted = corr_sorted[corr_sorted >0.1]
n_top = 100  # you can set any number here
top_corr_cols1 = corr_sorted[1:n_top+1].index.tolist()

In [None]:
numerical_column=['RIDAGEYR', 'RXDDAYS', 'BMXWAIST', 'DMDHHSIZ', 'DMDFMSIZ', 'PAQ710', 'DID040', 'BMXBMI',
                  'LBXSUA','LBDSGLSI', 'MCQ180E', 'total_mental', 'PAQ655', 'URDACT', 'URXUMS', 'BPXSY1', 'newbmi', 'BMXWT', 'MCQ180E', 'LBXTC']

categorical_column= [word for word in top_corr_cols if word not in numerical_column]

#hypertension for model
top_corr_cols.append('BPQ020')
top_corr_cols.append('LBXTC')
top_corr_cols.append('RIAGENDR')
top_corr_cols.append('RIDRETH3')

fullDF1 = fullDF[top_corr_cols]
nonwhiteDF1= nonwhiteDF[top_corr_cols]
whiteDF1= whiteDF[top_corr_cols]
hispanicDF1= hispanicDF[top_corr_cols]
blackDF1= blackDF[top_corr_cols]
asianDF1= asianDF[top_corr_cols]
multiraceDF1 = multiraceDF[top_corr_cols]

In [None]:
numerical_column_chol= ['DID040', 'BMXWAIST', 'LBXTC', 'DMDHHSIZ', 'LBDSGLSI','LBXSCH',  'BPXSY1',
                        'MCQ180E', 'DMDFMSIZ', 'LBXSUA', 'PAQ710', 'BMXBMI', 'RXDDAYS', 'LBDLDL', 'RIDAGEYR' ]
categorical_column_chol= [word for word in top_corr_cols1 if word not in numerical_column_chol]

#cholesterol for model
top_corr_cols1.append('BPQ080')
top_corr_cols1.append('RIAGENDR')

fullDF2 = fullDF[top_corr_cols1]
nonwhiteDF2= nonwhiteDF[top_corr_cols1]
whiteDF2= whiteDF[top_corr_cols1]
hispanicDF2= hispanicDF[top_corr_cols1]
blackDF2= blackDF[top_corr_cols1]
asianDF2= asianDF[top_corr_cols1]
multiraceDF2 = multiraceDF[top_corr_cols1]

In [None]:
column_names = {
    'SEQN': 'Sequence Number',
    'RIDRETH3': 'Race/Ethnicity',
    'RIAGENDR': 'Gender',
    'RIDAGEYR': 'Age (years)',
    'DMDBORN4': 'Country of Birth',
    'DMDCITZN': 'Citizenship Status',
    'DMDMARTL': 'Marital Status',
    'DMDEDUC2': 'Education Level',
    'DMDHHSIZ': 'Household Size',
    'DMDFMSIZ': 'Family Size',
    'DBD895': '# of meals not home prepared',
    'DBD905': '#_ready-to-eat_foods_30D',
    'DBD910': '# of frozen meals/pizza in past 30 days',
    'CBD120': 'Money spent on eating out',
    'CBD130': 'Money spent on carryout/delivered foods',
    'DIQ010': 'Diabetes Diagnosis',
    'DID040': 'Age when first told you had diabetes',
    'FSDHH': 'Household Food Security',
    'FSD032C': 'Could not afford balanced meals',
    'PFQ061B': 'Walking for a quarter mile difficulty',
    'PFQ061D': 'Stooping, crouching, kneeling difficulty',
    'PFQ061E': 'Lifting or carrying difficulty',
    'PAQ610': 'Days vigorous work',
    'PAQ625': 'Number of days moderate work',
    'PAQ640': 'Number of days walk or bicycle',
    'PAQ655': 'Days vigorous recreational activities',
    'PAQ670': 'Days moderate recreational activities',
    'PAQ710': 'Hours watch TV or videos past 30 days',
    'PAQ715': 'Hours use computer past 30 days',
    'RXDUSE': 'Prescription Drug Use',
    'RXDDAYS': 'Prescription Drug Days',
    'SLD010H': 'Hours of Sleep',
    'SLQ050': 'Sleep Quality',
    'SLQ060': 'Sleep Disorder',
    'SMD641': '# days smoked cigs during past 30 days',
    'SMD650': 'Mental Health Medication Days',
    'SMD030': 'Avg # cigarettes/day during past 30 days',
    'SMQ858': 'Last 7-d at job someone smoked indoors?	',
    'SMQ872': 'Last 7-d someone smoked in car?',
    'SMQ878': 'Last 7-d in other indoor area?',
    'BMXBMI': 'Body Mass Index (kg/m²)',
    'WHQ070': 'Tried to lose weight in past year',
    'WHD080A': 'Ate less food to lose weight',
    'WHD080B': 'Switched to foods with lower calories',
    'WHD080D': 'Exercised to lose weight',
    'WHD080M': 'Drank a lot of water to lose weight',
    'MCQ025': 'Age when first had asthma',
    'MCQ160A': 'Doctor ever said you had arthritis',
    'MCQ180A': 'Age when told you had arthritis',
    'MCQ160N': 'Doctor ever told you that you had gout?',
    'MCQ180N': 'Age when told you had gout',
    'MCQ160C': 'Ever told you had coronary heart disease',
    'MCQ180C': 'Doctor ever said you had skin cancer',
    'MCQ160D': 'angina/angina pectoris',
    'MCQ180D': 'Age when told you had angina pectoris',
    'MCQ160E': 'heart attack',
    'MCQ180E': 'Age when told you had heart attack',
    'MCQ160F': 'stroke',
    'MCQ180F': 'Age when told you had a stroke',
    'MCQ220': 'cancer/malignancy',
    'BPXSY1': 'Blood Pressure - Systolic (1st)',
    'BPXDI1': 'Blood Pressure - Diastolic (1st)',
    'BPQ080': 'Hypercholestrolemia',
    'LBXTR': 'Triglycerides (mg/dL)',
    'LBDLDL': 'LDL Cholesterol (mg/dL)',
    'LBXTC': 'Total Cholesterol (mg/dL)',
    'INDHHIN2': 'Household Income',
    'INDFMIN2': 'Family Income',
    'BPQ020': 'hypertension',
    'BMXWT': 'Weight (kg)',
    'BMXHT': 'Height (cm)',
    'DIQ050': 'Insulin',
    'DIQ175A': 'Family history Diabetes',
    'BMXWAIST': 'Waist Circumference (cm)',
    'DBD900': '# of meals from fast food or pizza place',
    'CBQ505': 'Eat at fast food/pizza places',
    'CBQ535': 'Saw nutrition info on fast food menu',
    'DPQ010': 'Little Interest in Doing Things',
    'DPQ020': 'Feeling Down, Depressed, or Hopeless',
    'DPQ030': 'Trouble Sleeping or Sleeping Too Much',
    'DPQ040': 'Feeling Tired or Having Little Energy',
    'DPQ050': 'Poor Appetite or Overeating',
    'DPQ060': 'Feeling Bad About Yourself',
    'DPQ070': 'Trouble Concentrating on Things',
    'DPQ080': 'Moving or Speaking Slowly or Too Fast',
    'DPQ090': 'Thought you would be better off dead	',
    'DPQ100': 'Difficulty these problems have caused',
    'ALQ120Q': 'Alcohol Consumption Frequency',
    'URXUMS': 'Albumin, urine (mg/L)',
    'URDACT': 'First albumin creatinine ratio (mg/g)',
    'LBXSTR': 'Triglycerides (mg/dL)',
    'LBXSUA': 'Uric acid (mg/dL)',
    'LBXSCH': 'Cholesterol (mg/dL)',
    'LBDSGLSI': 'Glucose, refrigerated serum (mmol/L)',
    'DR1TKCAL': 'Energy (kcal)',
    'DR1TCARB': 'Carbohydrate (gm)',
    'DR1TSUGR': 'Total sugars (gm)',
    'newbmi': 'New BMI',
    'abdominal_obesity': 'Abdominal Obesity',
    'bmi_category': 'BMI Category',
    'total_mental': 'Total Mental Health Score'
}

# Rename the columns
fullDF1.rename(columns=column_names, inplace=True)
nonwhiteDF1.rename(columns=column_names, inplace=True)
whiteDF1.rename(columns=column_names, inplace=True)
hispanicDF1.rename(columns=column_names, inplace=True)
blackDF1.rename(columns=column_names, inplace=True)
asianDF1.rename(columns=column_names, inplace=True)
multiraceDF1.rename(columns=column_names, inplace=True)

fullDF2.rename(columns=column_names, inplace=True)
nonwhiteDF2.rename(columns=column_names, inplace=True)
whiteDF2.rename(columns=column_names, inplace=True)
hispanicDF2.rename(columns=column_names, inplace=True)
blackDF2.rename(columns=column_names, inplace=True)
asianDF2.rename(columns=column_names, inplace=True)
multiraceDF2.rename(columns=column_names, inplace=True)

fullDF.rename(columns=column_names, inplace=True)
nonwhiteDF.rename(columns=column_names, inplace=True)
whiteDF.rename(columns=column_names, inplace=True)
hispanicDF.rename(columns=column_names, inplace=True)
blackDF.rename(columns=column_names, inplace=True)
asianDF.rename(columns=column_names, inplace=True)
multiraceDF.rename(columns=column_names, inplace=True)

demographics= ['RIAGENDR', 'RIDAGEYR','RIDRETH3', 'DMDCITZN', 'DMDMARTL', 'DMDEDUC2', 'DMDHHSIZ', 'DMDFMSIZ', 'INDHHIN2', 'Bmi_category','newbmi', 'BPQ020']
demographic_categorical=['RIAGENDR','RIDRETH3', 'DMDCITZN', 'DMDMARTL', 'DMDEDUC2', 'INDHHIN2']
demographic_numerical= ['RIDAGEYR', 'newbmi','BPXSY1', 'newbmi', 'total_mental', 'BMXWAIST', 'LBXTC']
legend_category=('BPQ020', 'BPQ080', 'MCQ160C',  'MCQ220', 'MCQ160N', 'DIQ010')

demographics = [column_names.get(col, col) for col in demographics]
demographic_categorical = [column_names.get(col, col) for col in demographic_categorical]
demographic_numerical = [column_names.get(col, col) for col in demographic_numerical]
legend_category = [column_names.get(col, col) for col in legend_category]
categorical_column_chol = [column_names.get(col, col) for col in categorical_column_chol]
categorical_column = [column_names.get(col, col) for col in categorical_column]

In [None]:
column_names_final = {
    'Age (years)': 'age',
    'Race/Ethnicity':'race',
    'Blood Pressure - Systolic (1st)': 'bpSys',
    'Hypercholestrolemia': 'cholesterol',
    'Stooping, crouching, kneeling difficulty': 'crouchDiff',
    'Doctor ever said you had arthritis': 'arthritis',
    'Diabetes Diagnosis': 'diabetes',
    'Waist Circumference (cm)': 'waistCirc',
    'Body Mass Index (kg/m²)': 'bmi',
    'Hours watch TV or videos past 30 days': 'tvHours',
    'Ever told you had coronary heart disease': 'coronaryHeartDisease',
    'Sleep Quality': 'sleepQuality',
    'Household Size': 'householdSize',
    'heart attack': 'heartAttack',
    'Walking for a quarter mile difficulty': 'walkDiff',
    'Family Size': 'familySize',
    'cancer/malignancy': 'cancer',
    'stroke': 'stroke',
    'Sleep Disorder': 'sleepDisorder',
    'Doctor ever told you that you had gout?': 'gout',
    'Total Mental Health Score': 'mentalHealthScore',
    'Days vigorous recreational activities': 'vigorousActivities',
    'Marital Status': 'maritalStatus',
    'Little Interest in Doing Things': 'interestInDoingThings',
    'Feeling Tired or Having Little Energy': 'tiredOrLowEnergy',
    'Feeling Down, Depressed, or Hopeless': 'depressedOrHopeless',
    'angina/angina pectoris': 'angina',
    'Trouble Sleeping or Sleeping Too Much': 'troubleSleeping',
    'Trouble Concentrating on Things': 'troubleConcentrating',
    'hypertension': 'hypertension',
    'Gender': 'gender',
    'Glucose, refrigerated serum (mmol/L)': 'glucose_level',
}

column_names_replace = [
    'age',
    'race',
    'gender',
    'maritalStatus',
    'householdSize',
    'familySize',
    'bpSys',
    'cholesterol',
    'waistCirc',
    'bmi',
    'arthritis',
    'diabetes',
    'coronaryHeartDisease',
    'heartAttack',
    'cancer',
    'stroke',
    'gout',
    'hypertension',
    'angina',
    'crouchDiff',
    'walkDiff',
    'vigorousActivities',
    'mentalHealthScore',
    'interestInDoingThings',
    'tiredOrLowEnergy',
    'depressedOrHopeless',
    'troubleConcentrating',
    'sleepQuality',
    'tvHours',
    'sleepDisorder',
    'troubleSleeping',
    'glucose_level'
]

column_names_ML = [
    #demographics
    'age',
    'race',
    'gender',
    'maritalStatus',
    'householdSize',
    #health_parameters
    'bpSys',
    'cholesterol',
    'glucose_level',
    'bmi',
    #disease_history
    'arthritis',
    'diabetes',
    'heartAttack',
    'stroke',
    'hypertension',
    
    #physical_activity_mental_health
    'walkDiff',
    'interestInDoingThings',
    'tiredOrLowEnergy',
    'depressedOrHopeless',
    'troubleSleeping' 
]


In [None]:
newDF= fullDF1.rename(columns=column_names_final)
newDF= newDF[column_names_replace]

In [None]:
# Define mapping dictionaries
gender_mapping = {2: 'Female', 1: 'Male'}
marital_status_mapping = {1: 'Married', 2: 'Widowed', 3: 'Divorced', 4: 'Separated', 5: 'Single', 6: 'Living with Partner'}
binary_mapping = {0: 'No', 1: 'Yes'}
difficulty_mapping = {1: 'No Difficulty', 2: 'Some Difficulty', 3: 'Much Difficulty', 4: 'Unable to do', 5: 'Do not do this activity'}
tv_hours_mapping = {0: 'Less than 1 hour', 1: '1 hour', 2: '2 hours', 3: '3 hours', 4: '4 hours', 5: '5 hours or more', 8: 'Don\'t watch TV'}
diabetes_mapping = {2: 'Diabetes', 1: 'Pre-diabetes', 0: 'No Diabetes'}
sleep_mapping = {0: 'Not at all', 1: 'Several Days', 2: 'More than half the days', 3: 'Nearly every day'}
racial_mapping = {1: 'Mexican American', 2: 'Other Hispanic', 3: 'White', 4: 'Black', 6: 'Asian', 7: 'Others'}
family_mapping = {1: 'Alone', 2: '2 people', 3: '3 people', 4: '4 people', 5: '5 people', 6: '6 people', 7: '7 or more people'}

df_viz = newDF.copy()

# Replace numerical values with text
df_viz['gender'] = newDF['gender'].replace(gender_mapping)
df_viz['maritalStatus'] = newDF['maritalStatus'].replace(marital_status_mapping)
df_viz['cholesterol'] = newDF['cholesterol'].replace(binary_mapping)
df_viz['arthritis'] = newDF['arthritis'].replace(binary_mapping)
df_viz['coronaryHeartDisease'] = newDF['coronaryHeartDisease'].replace(binary_mapping)
df_viz['heartAttack'] = newDF['heartAttack'].replace(binary_mapping)
df_viz['cancer'] = newDF['cancer'].replace(binary_mapping)
df_viz['stroke'] = newDF['stroke'].replace(binary_mapping)
df_viz['gout'] = newDF['gout'].replace(binary_mapping)
df_viz['hypertension'] = newDF['hypertension'].replace(binary_mapping)
df_viz['angina'] = newDF['angina'].replace(binary_mapping)
df_viz['crouchDiff'] = newDF['crouchDiff'].replace(difficulty_mapping)
df_viz['walkDiff'] = newDF['walkDiff'].replace(difficulty_mapping)
df_viz['tvHours'] = newDF['tvHours'].replace(tv_hours_mapping)
df_viz['sleepDisorder'] = newDF['sleepDisorder'].replace(binary_mapping)
df_viz['troubleSleeping'] = newDF['troubleSleeping'].replace(sleep_mapping)
df_viz['diabetes'] = newDF['diabetes'].replace(diabetes_mapping)
df_viz['race'] = newDF['race'].replace(racial_mapping)
df_viz['familySize'] = newDF['familySize'].replace(family_mapping)

print(df_viz)

In [None]:
newDF.to_csv("updatedDF.csv", index=False)
df_viz.to_csv("vizDF.csv", index=False)

In [None]:
df_ml = newDF.copy()

gender_mapping_ml = {2: 0, 1: 1}
diabetes_mapping_ml = {2: 1, 1: 2}


df_ml['gender'] = newDF['gender'].replace(gender_mapping_ml)
df_ml['maritalStatus'] = newDF['maritalStatus'].replace(marital_status_mapping)
df_ml['race'] = newDF['race'].replace(racial_mapping)
df_ml['diabetes'] = newDF['diabetes'].replace(diabetes_mapping_ml)

df_ml= df_ml[column_names_ML]
df_ml['race'] = df_ml['race'].astype('category')
df_ml['maritalStatus'] = df_ml['maritalStatus'].astype('category')

In [None]:
df_ml.to_csv("mlDF.csv", index=False)
df_ml.reset_index(drop=True, inplace=True)

In [None]:

encoder = OneHotEncoder(sparse_output=False, dtype=int)  # Initialize the encoder
encoded_cols = encoder.fit_transform(df_ml[['race', 'maritalStatus']])  # Fit and transform the selected columns
encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(['race', 'maritalStatus']))  # Create a DataFrame from the encoded columns

# Concatenate the original DataFrame with the encoded DataFrame
df_ml_one_hot = pd.concat([df_ml.drop(['race', 'maritalStatus'], axis=1), encoded_df], axis=1)

In [475]:
corr_matrix = df_ml.corr()
corr_to_col = corr_matrix['diabetes']
corr_sorted = corr_to_col.abs().sort_values(ascending=False)
corr_sorted = corr_sorted[corr_sorted >0.1]
n_top = 100  
top_corr_cols = corr_sorted[1:n_top+1].index.tolist()

corr_sorted

diabetes         1.000000
glucose_level    0.421341
cholesterol      0.255820
age              0.254105
hypertension     0.252963
bmi              0.182417
heartAttack      0.163205
arthritis        0.155529
bpSys            0.146487
stroke           0.112175
Name: diabetes, dtype: float64

In [473]:
df_ml_one_hot['diabetes'] = df_ml_one_hot['diabetes'].astype('category')
(df_ml_one_hot.dtypes)

age                                     int64
gender                                  int64
householdSize                           int64
bpSys                                 float64
cholesterol                           float64
glucose_level                         float64
bmi                                   float64
arthritis                             float64
diabetes                             category
heartAttack                           float64
stroke                                float64
hypertension                          float64
walkDiff                              float64
interestInDoingThings                 float64
tiredOrLowEnergy                      float64
depressedOrHopeless                   float64
troubleSleeping                       float64
race_Asian                              int32
race_Black                              int32
race_Mexican American                   int32
race_Other Hispanic                     int32
race_Others                       

In [None]:
df_final_ml= df_ml_one_hot[df_ml_one_hot['diabetes'].isin([1, 0])]
df_final_ml['diabetes'] = df_final_ml['diabetes'].astype('int')

In [None]:
#Data Processing
X_other= df_final_ml.drop(['diabetes'], axis= 1)
y_other= df_final_ml['diabetes']

cols = X_other.columns
index = X_other.index

transformer = StandardScaler().fit(X_other)
X_other = transformer.transform(X_other)
X_other = pd.DataFrame(X_other, columns = cols, index = index)

x_train_other,x_test_other,y_train_other,y_test_other = train_test_split(X_other,y_other,test_size=0.3,random_state=96)

#Synthetic Minority Oversampling Technique: To address the class imbalance problem, we will use the SMOTE technique to oversample the minority class.

sm_other = SMOTE()

X_res_other, y_res_other = sm_other.fit_resample(x_train_other, y_train_other)

print("Before OverSampling, counts of label '1': {}".format(sum(y_other==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_other==0)))

print('After OverSampling, the shape of train_X: {}'.format(X_res_other.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_res_other.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_res_other==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_res_other==0)))

In [None]:
#List of ML Models
models = []
models.append(('LR', LogisticRegression(random_state=12345)))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier(random_state=12345)))
models.append(('RF', RandomForestClassifier(random_state=12345)))
models.append(('SVM', SVC(gamma='auto', random_state=12345)))
models.append(('XGB', GradientBoostingClassifier(random_state=12345)))
models.append(("LightGBM", LGBMClassifier(random_state=12345, verbosity=-1)))

results = []
names = []

# Perform cross-validation on all the models
for name, model in models:
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(model, X_res_other, y_res_other, cv=10, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Visualizing the cross-validation scores using a boxplot
fig = plt.figure(figsize=(15, 10))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
#Create Neural Network Model (Keras)
model = Sequential([
    Dense(64, input_shape=(28,), activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Compile the model with a different optimizer and learning rate
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Assuming X_res_other and y_res_other are your training features and labels respectively
# and x_test_other and y_test_other are your test features and labels respectively

# Train the model with early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(X_res_other, y_res_other, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping], verbose=2)

# Evaluate the model
loss, accuracy = model.evaluate(x_test_other, y_test_other, verbose=2)
print(f"Test Accuracy: {accuracy:.4f}")

# Assuming model.predict returns probabilities
y_pred_prob = model.predict(x_test_other)
# Convert probabilities to binary predictions
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate classification report
print("Classification Report:")
print(classification_report(y_test_other, y_pred))


In [None]:
#Using Voting Classifier with Neural Network and Random Forest
class_weights = compute_class_weight('balanced', classes=np.unique(y_res_other), y=y_res_other)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Define the neural network model
def create_nn_model():
    model = Sequential([
        Dense(64, input_shape=(28,), activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(16, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap the Keras model for use in scikit-learn
nn_model = KerasClassifier(build_fn=create_nn_model, epochs=100, batch_size=32, validation_split=0.2, class_weight=class_weights_dict, verbose=0)

# Train the neural network model
nn_model.fit(X_res_other, y_res_other)

# Train a RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_res_other, y_res_other)

# Combine the models using a VotingClassifier
voting_clf = VotingClassifier(estimators=[('nn', nn_model), ('rf', rf)], voting='soft')

# Train the VotingClassifier
voting_clf.fit(X_res_other, y_res_other)

# Evaluate the ensemble model
pred = voting_clf.predict(x_test_other)
print(classification_report(y_test_other, pred))

# Plot confusion matrix

cm = confusion_matrix(y_test_other, pred)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
#Random Forest Model

#Define the parameter grid
param_distributions = {
    'n_estimators': [100, 300, 500, 1000],
    'max_features': [ 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'criterion': ['gini', 'entropy']
}

#Initialize the RandomForestClassifier
rf = RandomForestClassifier()

#Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions, n_iter=50, cv=5, n_jobs=-1, verbose=2, random_state=42)

#Fit the model on the resampled data
random_search.fit(X_res_other, y_res_other)

#Get the best parameters
best_params = random_search.best_params_
print("Best Parameters:", best_params)

#Initialize a new RandomForestClassifier with the best parameters
best_rf = RandomForestClassifier(**best_params)

#Fit this new model on the training data
best_rf.fit(X_res_other, y_res_other)

#Make predictions and evaluate the model
pred = best_rf.predict(x_test_other)
print(classification_report(y_test_other, pred))

#Get the mean cross-validation score
mean_cv_score = random_search.best_score_
print(f"Mean Cross-Validation Score: {mean_cv_score:.4f}")

In [None]:
# Define the parameter grid for LightGBM
param_grid = {
    'n_estimators': [100, 300, 500, 1000],
    'max_depth': [-1, 10, 20, 30],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [31, 50, 70],
    'boosting_type': ['gbdt', 'dart']
}

#Initialize the LGBMClassifier
lgbm = LGBMClassifier()

#Initialize GridSearchCV
grid_search = RandomizedSearchCV(estimator=lgbm, param_distributions=param_distributions, n_iter=50, cv=5, n_jobs=-1, verbose=2, random_state=42)

# Fit the model on the resampled data
grid_search.fit(X_res_other, y_res_other)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Initialize a new LGBMClassifier with the best parameters
best_lgbm = LGBMClassifier(**best_params)

# Fit this new model on the training data
best_lgbm.fit(X_res_other, y_res_other)

# Make predictions and evaluate the model
pred = best_lgbm.predict(x_test_other)
print(classification_report(y_test_other, pred))

# Calculate the mean cross-validation score
cv_scores = cross_val_score(best_lgbm, X_res_other, y_res_other, cv=10)
print("Mean Cross-Validation Score:", cv_scores.mean())


In [None]:
cm = confusion_matrix(y_test_other, pred, labels=best_rf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_rf.classes_)
disp.plot()
plt.show()

In [None]:

y_prob = best_rf.predict_proba(x_test_other)[:,1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test_other, y_prob)
roc_auc = roc_auc_score(y_test_other, y_prob)

plt.plot(false_positive_rate, true_positive_rate, label=f'AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
# Save the trained model as a .pkl file
joblib.dump(model, 'your_trained_model.pkl')
# Save the StandardScaler object as a .pkl file
joblib.dump(transformer, 'your_scaler.pkl')
# Save the OneHotEncoder object as a .pkl file
joblib.dump(encoder, 'your_encoder.pkl')

In [None]:
#Testing one hot encoding as pickle file

form_data = {
    "age": 30,
    "race": "White",
    "gender": 1,
    "maritalStatus": "Married",
    "householdSize": 3,
    "bpSys": 120,
    "cholesterol": 1,
    "glucose_level": 90,
    "bmi": 25.5,
    "arthritis": 0,
    "diabetes": 0,
    "heartAttack": 0,
    "stroke": 0,
    "hypertension": 1,
    "walkDiff": 2,
    "interestInDoingThings": 1,
    "tiredOrLowEnergy": 2,
    "depressedOrHopeless": 0,
    "troubleSleeping": 1
}

# Defining the list of columns as per the HTML form
input_features = [
    'age',
    'race',
    'gender',
    'maritalStatus',
    'householdSize',
    'bpSys',
    'cholesterol',
    'glucose_level',
    'bmi',
    'arthritis',
    'diabetes',
    'heartAttack',
    'stroke',
    'hypertension',
    'walkDiff',
    'interestInDoingThings',
    'tiredOrLowEnergy',
    'depressedOrHopeless',
    'troubleSleeping'
]

# Create a DataFrame with one row from the form data
df = pd.DataFrame([form_data])

print(df)

In [None]:
#Testing the one hot encoding
encoder2 = joblib.load('your_encoder.pkl')

# Select the specific columns to be transformed
columns_to_transform = ['race', 'maritalStatus']
df_selected = df[columns_to_transform]

# Transform the selected columns
transformed_data = encoder2.transform(df_selected)

# Get feature names (assuming the encoder supports get_feature_names_out)
feature_names = encoder2.get_feature_names_out(columns_to_transform)

# Convert the transformed data to a DataFrame
transformed_df = pd.DataFrame(transformed_data, columns=feature_names)

# Drop the original columns that were transformed
df.drop(columns=columns_to_transform, inplace=True)

# Concatenate the transformed columns with the original dataframe
df_ml_transformed = pd.concat([df, transformed_df], axis=1)

# Print the resulting dataframe
print(df_ml_transformed)