In [133]:
# Import libraries and packages
import pandas as pd
import numpy as np
from pathlib import Path
import hvplot.pandas
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from keras.layers import Dropout
from keras.regularizers import l2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score
import pickle
import warnings
from pandas.errors import SettingWithCopyWarning

# BRFSS Analysis

In [2]:
# Load the five parts of the 2021 BRFSS survey export; each part keeps its own
# numbered DataFrame name
brfss_csv_paths = [
    Path("Resources/2021 BRFSS Survey Data/LLCP2021_1.csv"),
    Path("Resources/2021 BRFSS Survey Data/LLCP2021_2.csv"),
    Path("Resources/2021 BRFSS Survey Data/LLCP2021_3.csv"),
    Path("Resources/2021 BRFSS Survey Data/LLCP2021_4.csv"),
    Path("Resources/2021 BRFSS Survey Data/LLCP2021_5.csv"),
]
brfss_1_df, brfss_2_df, brfss_3_df, brfss_4_df, brfss_5_df = map(pd.read_csv, brfss_csv_paths)

In [3]:
# Stack the five survey parts row-wise, then replace the per-part row labels
# with a fresh 0..n-1 index (the old labels are discarded, not kept as a column)
brfss_parts = [brfss_1_df, brfss_2_df, brfss_3_df, brfss_4_df, brfss_5_df]
brfss_data_df = pd.concat(brfss_parts, axis = "rows").reset_index(drop = True)

# Display the BRFSS DF
brfss_data_df

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_FRTRES1,_VEGRES1,_FRUTSU1,_VEGESU1,_FRTLT1A,_VEGLT1A,_FRT16A,_VEG23A,_FRUITE1,_VEGETE1
0,1.0,1.0,b'01192021',b'01',b'19',b'2021',1100.0,b'2021000001',2.021000e+09,1.0,...,1.0,1.0,100.0,214.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79
1,1.0,1.0,b'01212021',b'01',b'21',b'2021',1100.0,b'2021000002',2.021000e+09,1.0,...,1.0,1.0,100.0,128.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79
2,1.0,1.0,b'01212021',b'01',b'21',b'2021',1100.0,b'2021000003',2.021000e+09,1.0,...,1.0,1.0,100.0,71.0,1.0,2.0,1.0,1.0,5.397605e-79,5.397605e-79
3,1.0,1.0,b'01172021',b'01',b'17',b'2021',1100.0,b'2021000004',2.021000e+09,1.0,...,1.0,1.0,114.0,165.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79
4,1.0,1.0,b'01152021',b'01',b'15',b'2021',1100.0,b'2021000005',2.021000e+09,1.0,...,1.0,1.0,100.0,258.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438688,78.0,12.0,b'01062022',b'01',b'06',b'2022',1100.0,b'2021001381',2.021001e+09,,...,1.0,1.0,157.0,393.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79
438689,78.0,12.0,b'01122022',b'01',b'12',b'2022',1100.0,b'2021001382',2.021001e+09,,...,1.0,1.0,200.0,157.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79
438690,78.0,12.0,b'12212021',b'12',b'21',b'2021',1100.0,b'2021001383',2.021001e+09,,...,1.0,1.0,200.0,143.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79
438691,78.0,12.0,b'01112022',b'01',b'11',b'2022',1100.0,b'2021001384',2.021001e+09,,...,1.0,1.0,100.0,156.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79


## Cleaning and Wrangling the Data

In [4]:
# Survey variables kept for the analysis; every other column is left behind
brfss_selected_columns = [
    "_SEX", "_AGE80", "GENHLTH", "EXERANY2", "ASTHMA3", "CHCOCNCR",
    "WTKG3", "HTM4", "_BMI5", "_SMOKER3", "_CURECI1", "DRNKANY5",
    "_FRUTSU1", "FRNCHDA_", "POTADA1_", "_VEGESU1", "PNEUVAC4",
    "DIABETE4", "CHCCOPD3",
]
brfss_df = brfss_data_df[brfss_selected_columns]

# Display the Dataframe
brfss_df

Unnamed: 0,_SEX,_AGE80,GENHLTH,EXERANY2,ASTHMA3,CHCOCNCR,WTKG3,HTM4,_BMI5,_SMOKER3,_CURECI1,DRNKANY5,_FRUTSU1,FRNCHDA_,POTADA1_,_VEGESU1,PNEUVAC4,DIABETE4,CHCCOPD3
0,2.0,70.0,5.0,2.0,1.0,2.0,3266.0,150.0,1454.0,3.0,1.0,2.0,100.0,4.300000e+01,14.0,214.0,1.0,3.0,1.0
1,2.0,67.0,3.0,1.0,2.0,2.0,,168.0,,4.0,1.0,2.0,100.0,5.397605e-79,14.0,128.0,2.0,1.0,2.0
2,2.0,72.0,2.0,2.0,2.0,2.0,7711.0,165.0,2829.0,4.0,1.0,2.0,100.0,1.400000e+01,14.0,71.0,2.0,1.0,2.0
3,2.0,62.0,2.0,1.0,2.0,2.0,8845.0,163.0,3347.0,4.0,1.0,1.0,114.0,5.700000e+01,27.0,165.0,2.0,1.0,2.0
4,1.0,76.0,5.0,1.0,2.0,2.0,9344.0,180.0,2873.0,4.0,1.0,2.0,100.0,2.900000e+01,29.0,258.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438688,2.0,30.0,2.0,1.0,1.0,2.0,6123.0,157.0,2469.0,3.0,1.0,1.0,157.0,1.400000e+01,50.0,393.0,1.0,2.0,1.0
438689,1.0,80.0,3.0,2.0,2.0,2.0,,157.0,,4.0,1.0,2.0,200.0,1.400000e+01,14.0,157.0,2.0,1.0,2.0
438690,1.0,54.0,2.0,1.0,2.0,2.0,9979.0,180.0,3068.0,4.0,1.0,1.0,200.0,5.397605e-79,14.0,143.0,2.0,3.0,2.0
438691,1.0,67.0,2.0,1.0,2.0,2.0,7938.0,183.0,2373.0,4.0,1.0,1.0,100.0,5.397605e-79,13.0,156.0,2.0,3.0,2.0


In [91]:
# Build the cleaned frame: coded values are translated to readable labels
# (following the survey documentation) and columns get friendlier names

brfss_cleaned_df = pd.DataFrame()

# "SEX": code 1.0 becomes "MALE"; every other value becomes "FEMALE"
brfss_cleaned_df["SEX"] = brfss_df["_SEX"].eq(1.0).map({True: "MALE", False: "FEMALE"})

# "AGE": taken unchanged from "_AGE80"
brfss_cleaned_df["AGE"] = brfss_df["_AGE80"]

# Remap "GENHLTH" column
def genhlth(x):
    """Translate a GENHLTH code into its text label.

    Codes 1.0-5.0 and 7.0 have labels; every other value (NaN included)
    gives NaN.
    """
    code_labels = (
        (1.0, "Excellent"),
        (2.0, "Very Good"),
        (3.0, "Good"),
        (4.0, "Fair"),
        (5.0, "Poor"),
        (7.0, "Don't Know/Not Sure"),
    )
    # First matching code wins; NaN is the fallback when nothing matches
    return next((label for code, label in code_labels if x == code), np.nan)
    
# Label each GENHLTH code element-wise with genhlth
brfss_cleaned_df["GENHLTH"] = brfss_df["GENHLTH"].map(genhlth)

# Remap "EXERCISE" column
def yesno(x):
    """Translate a yes/no survey code into its text label.

    1.0 is "Yes", 2.0 is "No" and 7.0 is "Don't Know/Not Sure"; every other
    value (NaN included) gives NaN.
    """
    code_labels = (
        (1.0, "Yes"),
        (2.0, "No"),
        (7.0, "Don't Know/Not Sure"),
    )
    # First matching code wins; NaN is the fallback when nothing matches
    return next((label for code, label in code_labels if x == code), np.nan)

brfss_cleaned_df["EXERCISE"] = brfss_df["EXERANY2"].apply(yesno)

# Remap "ASTHMA" column
brfss_cleaned_df["ASTHMA"] = brfss_df["ASTHMA3"].apply(yesno)

# Remap "CANCER" column
brfss_cleaned_df["CANCER"] = brfss_df["CHCOCNCR"].apply(yesno)

# "WEIGHT", "HEIGHT" and "BMI" are the source values divided by 100.
# Missing values need no special case because NaN / 100 is still NaN.
# (The former guard `x == np.nan` never fired: NaN does not compare equal
# to anything, itself included.)

# Remap "WEIGHT" column
brfss_cleaned_df["WEIGHT"] = brfss_df["WTKG3"] / 100

# Remap "HEIGHT" column
brfss_cleaned_df["HEIGHT"] = brfss_df["HTM4"] / 100

# Remap "BMI" column
brfss_cleaned_df["BMI"] = brfss_df["_BMI5"] / 100

# Remap "SMOKER" column
def smoker(x):
    """Translate a _SMOKER3 code into its text label.

    Codes 1.0-4.0 have labels; every other value (NaN included) gives NaN.
    """
    code_labels = (
        (1.0, "Everyday smoker"),
        (2.0, "Someday smoker"),
        (3.0, "Former smoker"),
        (4.0, "Never smoked"),
    )
    # First matching code wins; NaN is the fallback when nothing matches
    return next((label for code, label in code_labels if x == code), np.nan)
    
# Label each _SMOKER3 code element-wise with smoker
brfss_cleaned_df["SMOKER"] = brfss_df["_SMOKER3"].map(smoker)

# Remap "ECIG" column
def ecig(x):
    """Translate a _CURECI1 code into its text label.

    1.0 and 2.0 have labels; every other value (NaN included) gives NaN.
    """
    code_labels = (
        (1.0, "Not currently using E-cigarettes"),
        (2.0, "Current E-cigarette user"),
    )
    # First matching code wins; NaN is the fallback when nothing matches
    return next((label for code, label in code_labels if x == code), np.nan)
    
brfss_cleaned_df["ECIG"] = brfss_df["_CURECI1"].apply(ecig)

# Remap "ALCOHOL" column
brfss_cleaned_df["ALCOHOL"] = brfss_df["DRNKANY5"].apply(yesno)

# "FRUIT", "FRIES" and "VEGES" are the source values divided by 100.
# Missing values need no special case because NaN / 100 is still NaN.
# (The former guard `x == np.nan` never fired: NaN does not compare equal
# to anything, itself included.)

# Remap "FRUIT" column
brfss_cleaned_df["FRUIT"] = brfss_df["_FRUTSU1"] / 100

# Remap "FRIES" column (rounded to 2 decimals after rescaling)
brfss_cleaned_df["FRIES"] = (brfss_df["FRNCHDA_"] / 100).round(2)

# Add the "POTATO" column
# NOTE(review): POTADA1_ is copied as-is while FRNCHDA_ is divided by 100 --
# confirm against the codebook whether both use the same scaling
brfss_cleaned_df["POTATO"] = brfss_df["POTADA1_"]

# Remap "VEGES" column
brfss_cleaned_df["VEGES"] = brfss_df["_VEGESU1"] / 100

# Remap "PNEUVAC" column
brfss_cleaned_df["PNEUVAC"] = brfss_df["PNEUVAC4"].apply(yesno)

# Remap "DIABETES" column
def diabetes(x):
    """Translate a DIABETE4 code into its text label.

    Codes 1.0-4.0 have labels; every other value (NaN included) gives NaN.
    """
    code_labels = (
        (1.0, "Yes"),
        (2.0, "Yes, but female and only during pregnancy"),
        (3.0, "No"),
        (4.0, "No,but pre-diabetes or borderline diabetes"),
    )
    # First matching code wins; NaN is the fallback when nothing matches
    return next((label for code, label in code_labels if x == code), np.nan)
    
# Label each DIABETE4 code element-wise with diabetes
brfss_cleaned_df["DIABETES"] = brfss_df["DIABETE4"].map(diabetes)

# Remap "COPD" column
def copd(x):
    """Turn a CHCCOPD3 code into a binary flag.

    1.0 becomes 1 and 2.0 becomes 0; every other value (NaN included)
    gives NaN.
    """
    code_flags = (
        (1.0, 1),
        (2.0, 0),
    )
    # First matching code wins; NaN is the fallback when nothing matches
    return next((flag for code, flag in code_flags if x == code), np.nan)
    
# Flag each CHCCOPD3 code element-wise with copd
brfss_cleaned_df["COPD"] = brfss_df["CHCCOPD3"].map(copd)

In [92]:
# Drop every row that has a missing value in any column, then renumber the
# remaining rows from 0 (the old row labels are discarded)
brfss_cleaned_df = brfss_cleaned_df.dropna().reset_index(drop = True)

# View the cleaned DataFrame
brfss_cleaned_df

Unnamed: 0,SEX,AGE,GENHLTH,EXERCISE,ASTHMA,CANCER,WEIGHT,HEIGHT,BMI,SMOKER,ECIG,ALCOHOL,FRUIT,FRIES,POTATO,VEGES,PNEUVAC,DIABETES,COPD
0,FEMALE,70.0,Poor,No,Yes,No,32.66,1.50,14.54,Former smoker,Not currently using E-cigarettes,No,1.00,0.43,14.0,2.14,Yes,No,1.0
1,FEMALE,72.0,Very Good,No,No,No,77.11,1.65,28.29,Never smoked,Not currently using E-cigarettes,No,1.00,0.14,14.0,0.71,No,Yes,0.0
2,FEMALE,62.0,Very Good,Yes,No,No,88.45,1.63,33.47,Never smoked,Not currently using E-cigarettes,Yes,1.14,0.57,27.0,1.65,No,Yes,0.0
3,MALE,76.0,Poor,Yes,No,No,93.44,1.80,28.73,Never smoked,Not currently using E-cigarettes,No,1.00,0.29,29.0,2.58,Yes,Yes,0.0
4,MALE,80.0,Good,No,No,No,88.45,1.91,24.37,Former smoker,Not currently using E-cigarettes,No,0.29,0.00,14.0,0.42,Yes,No,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340207,MALE,66.0,Fair,Yes,No,No,69.85,1.80,21.48,Never smoked,Not currently using E-cigarettes,Yes,2.50,0.14,14.0,2.99,No,Yes,0.0
340208,FEMALE,30.0,Very Good,Yes,Yes,No,61.23,1.57,24.69,Former smoker,Not currently using E-cigarettes,Yes,1.57,0.14,50.0,3.93,Yes,"Yes, but female and only during pregnancy",1.0
340209,MALE,54.0,Very Good,Yes,No,No,99.79,1.80,30.68,Never smoked,Not currently using E-cigarettes,Yes,2.00,0.00,14.0,1.43,No,No,0.0
340210,MALE,67.0,Very Good,Yes,No,No,79.38,1.83,23.73,Never smoked,Not currently using E-cigarettes,Yes,1.00,0.00,13.0,1.56,No,No,0.0


In [124]:
# plot the data

## Encode Categorical Features

In [8]:
# Encode "SEX" as the numeric flag "SEX_MALE" (1.0 for "MALE", 0.0 otherwise)
# and retire the text column
brfss_cleaned_df = (
    brfss_cleaned_df
    .assign(SEX_MALE = brfss_cleaned_df["SEX"].eq("MALE").astype(float))
    .drop(columns = "SEX")
)

# View the cleaned DataFrame
brfss_cleaned_df

Unnamed: 0,AGE,GENHLTH,EXERCISE,ASTHMA,CANCER,WEIGHT,HEIGHT,BMI,SMOKER,ECIG,ALCOHOL,FRUIT,FRIES,POTATO,VEGES,PNEUVAC,DIABETES,COPD,SEX_MALE
0,70.0,Poor,No,Yes,No,32.66,1.50,14.54,Former smoker,Not currently using E-cigarettes,No,1.00,0.43,14.0,2.14,Yes,No,1.0,0.0
1,72.0,Very Good,No,No,No,77.11,1.65,28.29,Never smoked,Not currently using E-cigarettes,No,1.00,0.14,14.0,0.71,No,Yes,0.0,0.0
2,62.0,Very Good,Yes,No,No,88.45,1.63,33.47,Never smoked,Not currently using E-cigarettes,Yes,1.14,0.57,27.0,1.65,No,Yes,0.0,0.0
3,76.0,Poor,Yes,No,No,93.44,1.80,28.73,Never smoked,Not currently using E-cigarettes,No,1.00,0.29,29.0,2.58,Yes,Yes,0.0,1.0
4,80.0,Good,No,No,No,88.45,1.91,24.37,Former smoker,Not currently using E-cigarettes,No,0.29,0.00,14.0,0.42,Yes,No,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340207,66.0,Fair,Yes,No,No,69.85,1.80,21.48,Never smoked,Not currently using E-cigarettes,Yes,2.50,0.14,14.0,2.99,No,Yes,0.0,1.0
340208,30.0,Very Good,Yes,Yes,No,61.23,1.57,24.69,Former smoker,Not currently using E-cigarettes,Yes,1.57,0.14,50.0,3.93,Yes,"Yes, but female and only during pregnancy",1.0,0.0
340209,54.0,Very Good,Yes,No,No,99.79,1.80,30.68,Never smoked,Not currently using E-cigarettes,Yes,2.00,0.00,14.0,1.43,No,No,0.0,1.0
340210,67.0,Very Good,Yes,No,No,79.38,1.83,23.73,Never smoked,Not currently using E-cigarettes,Yes,1.00,0.00,13.0,1.56,No,No,0.0,1.0


In [9]:
# One-hot encode the text (object-dtype) columns

# Names of the columns whose dtype is "object"
brfss_categorical_variables = [
    column for column, dtype in brfss_cleaned_df.dtypes.items() if dtype == "object"
]

# Dense-output encoder, fitted and applied to the categorical columns in one step
brfss_enc = OneHotEncoder(sparse_output = False)
brfss_categorical_df = brfss_cleaned_df[brfss_categorical_variables]
brfss_encoded_data = brfss_enc.fit_transform(brfss_categorical_df)

# Wrap the encoded array in a DataFrame labelled with the encoder's feature names
brfss_encoded_columns = brfss_enc.get_feature_names_out(brfss_categorical_variables)
brfss_encoded_df = pd.DataFrame(brfss_encoded_data, columns = brfss_encoded_columns)

# Review the DataFrame
brfss_encoded_df.head()

Unnamed: 0,GENHLTH_Don't Know/Not Sure,GENHLTH_Excellent,GENHLTH_Fair,GENHLTH_Good,GENHLTH_Poor,GENHLTH_Very Good,EXERCISE_Don't Know/Not Sure,EXERCISE_No,EXERCISE_Yes,ASTHMA_Don't Know/Not Sure,...,ALCOHOL_Don't Know/Not Sure,ALCOHOL_No,ALCOHOL_Yes,PNEUVAC_Don't Know/Not Sure,PNEUVAC_No,PNEUVAC_Yes,DIABETES_No,"DIABETES_No,but pre-diabetes or borderline diabetes",DIABETES_Yes,"DIABETES_Yes, but female and only during pregnancy"
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [10]:
# Put the one-hot columns side by side with the remaining (non-categorical)
# columns of the cleaned frame
brfss_non_categorical_df = brfss_cleaned_df.drop(columns = brfss_categorical_variables)
brfss_encoded_df = pd.concat([brfss_encoded_df, brfss_non_categorical_df], axis = "columns")

# Review the Dataframe
brfss_encoded_df.head()

Unnamed: 0,GENHLTH_Don't Know/Not Sure,GENHLTH_Excellent,GENHLTH_Fair,GENHLTH_Good,GENHLTH_Poor,GENHLTH_Very Good,EXERCISE_Don't Know/Not Sure,EXERCISE_No,EXERCISE_Yes,ASTHMA_Don't Know/Not Sure,...,AGE,WEIGHT,HEIGHT,BMI,FRUIT,FRIES,POTATO,VEGES,COPD,SEX_MALE
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,70.0,32.66,1.5,14.54,1.0,0.43,14.0,2.14,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,72.0,77.11,1.65,28.29,1.0,0.14,14.0,0.71,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,62.0,88.45,1.63,33.47,1.14,0.57,27.0,1.65,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,76.0,93.44,1.8,28.73,1.0,0.29,29.0,2.58,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,80.0,88.45,1.91,24.37,0.29,0.0,14.0,0.42,1.0,1.0


## Create the Features and Target

In [11]:
## Target is the "COPD" column; the features are every other column
brfss_y = brfss_encoded_df["COPD"]
brfss_X = brfss_encoded_df.drop(columns = ["COPD"]).copy()

In [12]:
# Check the class balance of the target: count the rows for each COPD value
brfss_y.value_counts()

0.0    313419
1.0     26793
Name: COPD, dtype: int64

## Split Data into Training and Testing Datasets

In [13]:
# Split features and target into training and testing parts; the split is
# seeded and stratified on the target
brfss_split = train_test_split(
    brfss_X,
    brfss_y,
    random_state = 1,
    stratify = brfss_y,
)
brfss_X_train, brfss_X_test, brfss_y_train, brfss_y_test = brfss_split

## Scale Continuous Features

In [14]:
# Standardize the features with StandardScaler

# Create the scaler and fit it on the training features only
brfss_scaler = StandardScaler()
brfss_X_scaler = brfss_scaler.fit(brfss_X_train)

# Apply the fitted scaling to both the training and the testing features
brfss_X_train_scaled, brfss_X_test_scaled = (
    brfss_X_scaler.transform(features) for features in (brfss_X_train, brfss_X_test)
)

# Save the scaler for making predictions
joblib.dump(brfss_scaler, Path("Resources/2021 BRFSS Survey Data/brfss_scaler.bin"), compress = True)

['Resources\\2021 BRFSS Survey Data\\brfss_scaler.bin']

## Fit a Machine Learning Model and Make Predictions

In [14]:
# Train a support vector classifier on the scaled training data

# Import the SVC module from SKLearn
from sklearn.svm import SVC

# Create the seeded SVC and fit it in one step; fit() returns the fitted
# estimator itself
svm_classifier = SVC(random_state = 1).fit(brfss_X_train_scaled, brfss_y_train)

# Show the fitted estimator
svm_classifier

## Evaluate the ML Model

In [26]:
# Predict the target for every row of the scaled test features with the fitted SVM
brfss_svm_predictions = svm_classifier.predict(brfss_X_test_scaled)

In [16]:
# Build the classification report for the SVM on the test set; the text is
# kept in a variable so it can be printed again in the model comparison
brfss_svm_testing_report = classification_report(brfss_y_test, brfss_svm_predictions)

print(brfss_svm_testing_report)

              precision    recall  f1-score   support

         0.0       0.93      0.99      0.96     78355
         1.0       0.64      0.14      0.23      6698

    accuracy                           0.93     85053
   macro avg       0.78      0.57      0.60     85053
weighted avg       0.91      0.93      0.90     85053



## Save the ML Model

In [18]:
# Persist the fitted SVM; the context manager closes the file handle as soon
# as the pickle has been written (a bare open() left it open)
with open(Path("Resources/2021 BRFSS Survey Data/svm_classifier.sav"), "wb") as model_file:
    pickle.dump(svm_classifier, model_file)

## Fit a Second Machine Learning Model and Make Predictions

In [19]:
# Train a k-nearest-neighbors classifier on the scaled training data

# Import KNN Classifier from SKLearn
from sklearn.neighbors import KNeighborsClassifier

# Create the 3-neighbor model and fit it in one step; fit() returns the
# fitted estimator itself
knn_classifier = KNeighborsClassifier(n_neighbors = 3).fit(brfss_X_train_scaled, brfss_y_train)

# Show the fitted estimator
knn_classifier

## Evaluate the Second ML Model

In [29]:
# Predict the target for every row of the scaled test features with the fitted KNN model
brfss_knn_predictions = knn_classifier.predict(brfss_X_test_scaled)

In [21]:
# Build the classification report for the KNN model on the test set; the text
# is kept in a variable so it can be printed again in the model comparison
brfss_knn_testing_report = classification_report(brfss_y_test, brfss_knn_predictions)

print(brfss_knn_testing_report)

              precision    recall  f1-score   support

         0.0       0.94      0.97      0.95     78355
         1.0       0.40      0.23      0.29      6698

    accuracy                           0.91     85053
   macro avg       0.67      0.60      0.62     85053
weighted avg       0.89      0.91      0.90     85053



## Save the Second ML Model

In [22]:
# Persist the fitted KNN model; the context manager closes the file handle as
# soon as the pickle has been written (a bare open() left it open)
with open(Path("Resources/2021 BRFSS Survey Data/knn_classifier.sav"), "wb") as model_file:
    pickle.dump(knn_classifier, model_file)

## Fit a Third Machine Learning Model and Make Predictions

In [23]:
# Train a gradient boosting classifier on the scaled training data

# Import Gradient Tree Boosting Classifier from SKLearn
from sklearn.ensemble import GradientBoostingClassifier

# Model settings, spelled out explicitly
gtb_settings = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 3,
    "random_state": 1,
}

# Create the model and fit it in one step; fit() returns the fitted
# estimator itself
gtb_classifier = GradientBoostingClassifier(**gtb_settings).fit(brfss_X_train_scaled, brfss_y_train)

# Show the fitted estimator
gtb_classifier

## Evaluate the Third ML Model

In [32]:
# Predict the target for every row of the scaled test features with the fitted
# gradient boosting model
brfss_gtb_predictions = gtb_classifier.predict(brfss_X_test_scaled)

In [25]:
# Build the classification report for the gradient boosting model on the test
# set; the text is kept in a variable so it can be printed again in the model
# comparison
brfss_gtb_testing_report = classification_report(brfss_y_test, brfss_gtb_predictions)

print(brfss_gtb_testing_report)

              precision    recall  f1-score   support

         0.0       0.93      0.99      0.96     78355
         1.0       0.62      0.19      0.29      6698

    accuracy                           0.93     85053
   macro avg       0.78      0.59      0.62     85053
weighted avg       0.91      0.93      0.91     85053



## Save the Third ML Model

In [26]:
# Persist the fitted gradient boosting model; the context manager closes the
# file handle as soon as the pickle has been written (a bare open() left it open)
with open(Path("Resources/2021 BRFSS Survey Data/gtb_classifier.sav"), "wb") as model_file:
    pickle.dump(gtb_classifier, model_file)

## Fit a Fourth Machine Learning Model and Make Predictions

In [27]:
# Train a Gaussian naive Bayes classifier on the scaled training data

# Import Gaussian Naive Bayes Classifier from SKLearn
from sklearn.naive_bayes import GaussianNB

# Create the model and fit it in one step; fit() returns the fitted
# estimator itself
gnb_classifier = GaussianNB().fit(brfss_X_train_scaled, brfss_y_train)

# Show the fitted estimator
gnb_classifier

## Evaluate the Fourth ML Model

In [35]:
# Predict the target for every row of the scaled test features with the fitted
# Gaussian naive Bayes model
brfss_gnb_predictions = gnb_classifier.predict(brfss_X_test_scaled)

In [29]:
# Build the classification report for the Gaussian naive Bayes model on the
# test set; the text is kept in a variable so it can be printed again in the
# model comparison
brfss_gnb_testing_report = classification_report(brfss_y_test, brfss_gnb_predictions)

print(brfss_gnb_testing_report)

              precision    recall  f1-score   support

         0.0       0.96      0.87      0.91     78355
         1.0       0.27      0.56      0.37      6698

    accuracy                           0.85     85053
   macro avg       0.61      0.72      0.64     85053
weighted avg       0.90      0.85      0.87     85053



## Save the Fourth ML Model

In [30]:
# Persist the fitted Gaussian naive Bayes model; the context manager closes the
# file handle as soon as the pickle has been written (a bare open() left it open)
with open(Path("Resources/2021 BRFSS Survey Data/gnb_classifier.sav"), "wb") as model_file:
    pickle.dump(gnb_classifier, model_file)

## Compare and Choose the best of the Four ML Models

In [31]:
# Print the model title followed, on the next line, by its classification report
print("Model 1 (Support Vector Machines Algorithm) Results", brfss_svm_testing_report, sep = "\n")

Model 1 (Support Vector Machines Algorithm) Results
              precision    recall  f1-score   support

         0.0       0.93      0.99      0.96     78355
         1.0       0.64      0.14      0.23      6698

    accuracy                           0.93     85053
   macro avg       0.78      0.57      0.60     85053
weighted avg       0.91      0.93      0.90     85053



In [32]:
# Print the model title followed, on the next line, by its classification report
print("Model 2 (k-Nearest Neighbors Classifier) Results", brfss_knn_testing_report, sep = "\n")

Model 2 (k-Nearest Neighbors Classifier) Results
              precision    recall  f1-score   support

         0.0       0.94      0.97      0.95     78355
         1.0       0.40      0.23      0.29      6698

    accuracy                           0.91     85053
   macro avg       0.67      0.60      0.62     85053
weighted avg       0.89      0.91      0.90     85053



In [33]:
# Print the model title followed, on the next line, by its classification report
print("Model 3 (Gradient Boosting Classifier) Results", brfss_gtb_testing_report, sep = "\n")

Model 3 (Gradient Boosting Classifier) Results
              precision    recall  f1-score   support

         0.0       0.93      0.99      0.96     78355
         1.0       0.62      0.19      0.29      6698

    accuracy                           0.93     85053
   macro avg       0.78      0.59      0.62     85053
weighted avg       0.91      0.93      0.91     85053



In [34]:
# Print the model title followed, on the next line, by its classification report
print("Model 4 (Gaussian Naive Bayes Classifier) Results", brfss_gnb_testing_report, sep = "\n")

Model 4 (Gaussian Naive Bayes Classifier) Results
              precision    recall  f1-score   support

         0.0       0.96      0.87      0.91     78355
         1.0       0.27      0.56      0.37      6698

    accuracy                           0.85     85053
   macro avg       0.61      0.72      0.64     85053
weighted avg       0.90      0.85      0.87     85053



In [145]:
# Exploratory check: see what type accuracy_score returns for the SVM predictions
type(accuracy_score(brfss_y_test, brfss_svm_predictions))


numpy.float64

In [181]:
# Gather the scores of all four models into one table


def _second_class_scores(predictions):
    """Precision, recall and F1 of the second class for the given test predictions."""
    per_class = pd.DataFrame(precision_recall_fscore_support(brfss_y_test, predictions))
    # Keep the first three rows (labelled Precision/Recall/F1-score below) of column 1
    return per_class.iloc[:3, 1]


svm_scores = _second_class_scores(brfss_svm_predictions)
knn_scores = _second_class_scores(brfss_knn_predictions)
gtb_scores = _second_class_scores(brfss_gtb_predictions)
gnb_scores = _second_class_scores(brfss_gnb_predictions)

# Overall accuracy per model, keyed by the model's display name
accuracy = {
    "Support Vector Machine": accuracy_score(brfss_y_test, brfss_svm_predictions),
    "K-Nearest Neighbors": accuracy_score(brfss_y_test, brfss_knn_predictions),
    "Gradient Boosting": accuracy_score(brfss_y_test, brfss_gtb_predictions),
    "Gaussian Naive Bayes": accuracy_score(brfss_y_test, brfss_gnb_predictions),
}
accuracy_df = pd.DataFrame(accuracy, index = ["Accuracy"])

# One column per model, one row per score, with the accuracy row appended last
scores_df = pd.concat([svm_scores, knn_scores, gtb_scores, gnb_scores], axis = "columns")
scores_df.index = ["Precision", "Recall", "F1-score"]
scores_df.columns = list(accuracy)
scores_df = pd.concat([scores_df, accuracy_df], axis = "rows")

# View the Data
scores_df

Unnamed: 0,Support Vector Machine,K-Nearest Neighbors,Gradient Boosting,Gaussian Naive Bayes
Precision,0.638414,0.39881,0.625,0.270344
Recall,0.139445,0.230069,0.18513,0.561959
F1-score,0.228894,0.291801,0.285648,0.365065
Accuracy,0.926011,0.912055,0.927081,0.846061


In [193]:
# Bar chart of the score table; the options are named first, then handed to hvplot
scores_plot_options = {
    "title": "ML Model Scores for Positive for C.O.P.D.",
    "frame_width": 800,
    "frame_height": 400,
    "ylabel": "Score",
    "rot": 90,
}
scores_plot = scores_df.hvplot.bar(**scores_plot_options)

scores_plot

## Optimize the best ML Model

In [46]:
# Remove the "BMI" column and the three one-hot "ALCOHOL" columns from both
# the training and the testing features
brfss_opt_dropped_columns = ["BMI", "ALCOHOL_Don't Know/Not Sure", "ALCOHOL_No", "ALCOHOL_Yes"]
brfss_X_train_opt = brfss_X_train.drop(columns = brfss_opt_dropped_columns)
brfss_X_test_opt = brfss_X_test.drop(columns = brfss_opt_dropped_columns)

# View the new DataFrames
display(brfss_X_train_opt.head(), brfss_X_test_opt.head())

Unnamed: 0,GENHLTH_Don't Know/Not Sure,GENHLTH_Excellent,GENHLTH_Fair,GENHLTH_Good,GENHLTH_Poor,GENHLTH_Very Good,EXERCISE_Don't Know/Not Sure,EXERCISE_No,EXERCISE_Yes,ASTHMA_Don't Know/Not Sure,...,DIABETES_Yes,"DIABETES_Yes, but female and only during pregnancy",AGE,WEIGHT,HEIGHT,FRUIT,FRIES,POTATO,VEGES,SEX_MALE
3987,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,54.0,74.84,1.68,1.07,0.03,43.0,1.53,0.0
296054,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,69.0,88.0,1.88,2.07,0.14,14.0,1.57,1.0
175036,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,36.0,106.59,1.83,1.0,0.57,14.0,2.42,1.0
78976,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,31.0,102.97,1.7,1.03,0.1,10.0,5.2,0.0
114481,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,72.0,77.11,1.52,2.0,0.13,33.0,1.73,0.0


Unnamed: 0,GENHLTH_Don't Know/Not Sure,GENHLTH_Excellent,GENHLTH_Fair,GENHLTH_Good,GENHLTH_Poor,GENHLTH_Very Good,EXERCISE_Don't Know/Not Sure,EXERCISE_No,EXERCISE_Yes,ASTHMA_Don't Know/Not Sure,...,DIABETES_Yes,"DIABETES_Yes, but female and only during pregnancy",AGE,WEIGHT,HEIGHT,FRUIT,FRIES,POTATO,VEGES,SEX_MALE
274207,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,44.0,68.04,1.6,1.0,0.03,14.0,1.74,0.0
89306,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,57.0,98.88,1.65,1.07,0.57,14.0,2.0,0.0
87287,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,43.0,58.97,1.68,1.0,0.14,14.0,3.28,0.0
43796,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,62.0,61.23,1.6,0.43,0.0,17.0,1.67,0.0
7483,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,66.0,108.86,1.73,1.29,0.14,5.397605e-79,0.57,0.0


In [47]:
# Standardize the reduced feature set with a separate StandardScaler

# Create the scaler and fit it on the reduced training features only
brfss_scaler_opt = StandardScaler()
brfss_X_scaler_opt = brfss_scaler_opt.fit(brfss_X_train_opt)

# Apply the fitted scaling to both the training and the testing features
brfss_X_train_scaled_opt, brfss_X_test_scaled_opt = (
    brfss_X_scaler_opt.transform(features) for features in (brfss_X_train_opt, brfss_X_test_opt)
)

# Save the scaler for making predictions
joblib.dump(brfss_scaler_opt, Path("Resources/2021 BRFSS Survey Data/brfss_scaler_opt.bin"), compress = True)

['Resources\\2021 BRFSS Survey Data\\brfss_scaler_opt.bin']

In [48]:
# Create a Gaussian naive Bayes model for the reduced feature set and fit it
# in one step; fit() returns the fitted estimator itself
gnb_classifier_opt = GaussianNB().fit(brfss_X_train_scaled_opt, brfss_y_train)

# Show the fitted estimator
gnb_classifier_opt

In [49]:
# Predict the target for every row of the reduced, scaled test features with
# the model fitted on the reduced feature set
brfss_gnb_opt_predictions = gnb_classifier_opt.predict(brfss_X_test_scaled_opt)

In [50]:
# Build and print the classification report for the model fitted on the
# reduced feature set
brfss_gnb_opt_testing_report = classification_report(brfss_y_test, brfss_gnb_opt_predictions)

print(brfss_gnb_opt_testing_report)

              precision    recall  f1-score   support

         0.0       0.96      0.87      0.91     78355
         1.0       0.27      0.57      0.37      6698

    accuracy                           0.85     85053
   macro avg       0.61      0.72      0.64     85053
weighted avg       0.90      0.85      0.87     85053



In [51]:
# Save the optimized model; the context manager closes the file handle as soon
# as the pickle has been written (a bare open() left it open)
with open(Path("Resources/2021 BRFSS Survey Data/gnb_classifier_opt.sav"), "wb") as model_file:
    pickle.dump(gnb_classifier_opt, model_file)

# NHAMCS Analysis

In [2]:
# Read the yearly NHAMCS files (SAS .sas7bdat format), newest year first
nhamcs_sas_paths = [
    Path("Resources/2017-2021 NHAMCS Data/2021/ed2021_sas.sas7bdat"),
    Path("Resources/2017-2021 NHAMCS Data/2020/ed2020_sas.sas7bdat"),
    Path("Resources/2017-2021 NHAMCS Data/2019/ed2019_sas.sas7bdat"),
    Path("Resources/2017-2021 NHAMCS Data/2018/ed2018_sas.sas7bdat"),
    Path("Resources/2017-2021 NHAMCS Data/2017/ed2017_sas.sas7bdat"),
]
nhamcs_2021_df, nhamcs_2020_df, nhamcs_2019_df, nhamcs_2018_df, nhamcs_2017_df = map(
    pd.read_sas, nhamcs_sas_paths
)

In [3]:
# Stack the yearly frames row-wise (2017 first, 2021 last), then replace the
# per-year row labels with a fresh 0..n-1 index (the old labels are discarded)
nhamcs_yearly_frames = [nhamcs_2017_df, nhamcs_2018_df, nhamcs_2019_df, nhamcs_2020_df, nhamcs_2021_df]
nhamcs_df = pd.concat(nhamcs_yearly_frames, axis = "rows").reset_index(drop = True)

# Display the NHAMCS DF
nhamcs_df

Unnamed: 0,VMONTH,VDAYR,ARRTIME,WAITTIME,BLANK1,AGE,AGER,AGEDAYS,RESIDNCE,SEX,...,CSTRATM,CPSUM,PATWT,EDWT,LOV,LOS,OBSSTAY,STAY24,BOARDED,COVID_VALIDATION
0,6.0,6.0,b'2056',72.0,,27.0,3.0,-7.0,1.0,2.0,...,40100000.0,4.0,3723.12641,21.58043,,,,,,
1,6.0,2.0,b'1417',64.0,,1.0,1.0,-7.0,1.0,1.0,...,40100000.0,4.0,3723.12641,,,,,,,
2,6.0,2.0,b'2303',-7.0,,52.0,4.0,-7.0,1.0,2.0,...,40100000.0,4.0,3723.12641,,,,,,,
3,6.0,5.0,b'0930',29.0,,5.0,1.0,-7.0,1.0,2.0,...,40100000.0,4.0,3723.12641,,,,,,,
4,6.0,2.0,b'1332',20.0,,4.0,1.0,-7.0,1.0,1.0,...,40100000.0,4.0,3723.12641,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87543,2.0,7.0,b'2110',24.0,,8.0,1.0,-7.0,1.0,2.0,...,40400000.0,40.0,5590.83951,,779.0,-7.0,-7.0,2.0,-7.0,-7.0
87544,2.0,1.0,b'0539',32.0,,2.0,1.0,-7.0,1.0,1.0,...,40400000.0,40.0,5590.83951,,278.0,-7.0,-7.0,2.0,-7.0,-7.0
87545,2.0,1.0,b'1228',18.0,,1.0,1.0,-7.0,1.0,2.0,...,40400000.0,40.0,5590.83951,,169.0,-7.0,-7.0,2.0,-7.0,-7.0
87546,2.0,1.0,b'1644',25.0,,4.0,1.0,-7.0,1.0,1.0,...,40400000.0,40.0,5590.83951,,247.0,-7.0,-7.0,2.0,-7.0,-7.0


## Cleaning and Wrangling the Data

In [4]:
# Select relevant columns and drop the rest
nhamcs_selected_columns = [
    "AGE", "SEX", "ETHIM", "RACEUN", "ETOHAB", "ALZHD", "ASTHMA", "CANCER", "CEBVD",
    "CKD", "COPD", "CHF", "CAD", "DEPRN", "DIABTYP1", "DIABTYP2", "DIABTYP0", "ESRD",
    "HPE", "EDHIV", "HYPLIPID", "HTN", "OBESITY", "OSA", "OSTPRSIS", "SUBSTAB", "NOCHRON",
    "DIAG1", "DIAG2", "DIAG3", "DIAG4", "DIAG5",
]

# Take an explicit copy: later cells assign new values to columns of this
# frame, and writing into a plain selection of nhamcs_df is what raises
# SettingWithCopyWarning
nhamcs_cleaned_df = nhamcs_df[nhamcs_selected_columns].copy()

# Display the Dataframe
nhamcs_cleaned_df

Unnamed: 0,AGE,SEX,ETHIM,RACEUN,ETOHAB,ALZHD,ASTHMA,CANCER,CEBVD,CKD,...,OBESITY,OSA,OSTPRSIS,SUBSTAB,NOCHRON,DIAG1,DIAG2,DIAG3,DIAG4,DIAG5
0,27.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,b'S810',b'-9',b'-9',b'-9',b'-9'
1,1.0,1.0,1.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,b'J209',b'-9',b'-9',b'-9',b'-9'
2,52.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,b'ZZZ1',b'-9',b'-9',b'-9',b'-9'
3,5.0,2.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,b'J050',b'-9',b'-9',b'-9',b'-9'
4,4.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,b'S672',b'-9',b'-9',b'-9',b'-9'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87543,8.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,b'K529',b'-9',b'-9',b'-9',b'-9'
87544,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,b'R109',b'-9',b'-9',b'-9',b'-9'
87545,1.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,b'R05-',b'R062',b'-9',b'-9',b'-9'
87546,4.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,b'S621',b'-9',b'-9',b'-9',b'-9'


In [5]:
# Translate coded categorical values to their labels, following the documentation

# Silence pandas' SettingWithCopyWarning for the column assignments below
warnings.simplefilter(action = "ignore", category = SettingWithCopyWarning)

# "SEX": code 1.0 becomes "FEMALE"; every other value becomes "MALE"
nhamcs_cleaned_df["SEX"] = nhamcs_cleaned_df["SEX"].eq(1.0).map({True: "FEMALE", False: "MALE"})

# "ETHIM": code 1.0 becomes "Hispanic or Latino"; every other value becomes
# "Not Hispanic or Latino"
nhamcs_cleaned_df["ETHIM"] = nhamcs_cleaned_df["ETHIM"].eq(1.0).map(
    {True: "Hispanic or Latino", False: "Not Hispanic or Latino"}
)

# Remap "RACEUN" column
def race(x):
    """Return the race label for a RACEUN code.

    Codes 1.0 through 6.0 map to their label; any other value (for example the
    -9.0 seen in the raw data) yields 'N/A'.

    NOTE(review): the encoded frame built later from the re-read CSV files has a
    "RACEUN_nan" column, so the 'N/A' label appears to be parsed as a missing
    value on the round trip -- confirm whether that is intended.
    """
    labels_by_code = (
        (1.0, "White Only"),
        (2.0, "Black/African American Only"),
        (3.0, "Asian Only"),
        (4.0, "Native Hawaiian/Oth Pac Isl Only"),
        (5.0, "American Indian/Alaska Native Only"),
        (6.0, "More than one race"),
    )

    # Compare in table order and return on the first matching code
    for code, label in labels_by_code:
        if x == code:
            return label

    return 'N/A'
    
# Replace every RACEUN code with its label, element by element
nhamcs_cleaned_df["RACEUN"] = nhamcs_cleaned_df["RACEUN"].map(race)

In [6]:
# Reformat the "DIAG" columns

def _clean_diagnosis_code(value):
    """Return a diagnosis code as plain text, without any bytes-literal wrapper.

    Bytes values are decoded; text of the form b'XXXX' loses exactly the leading
    b' and the trailing '. Anything else is returned as str(value) unchanged.

    This replaces str(value).strip("b'"), which strips *any* run of the
    characters "b" and "'" from both ends and would therefore also eat a
    genuine leading or trailing "b" of a code.
    """
    if isinstance(value, (bytes, bytearray)):
        return value.decode("utf-8", errors = "replace")

    text = str(value)
    if len(text) >= 3 and text.startswith("b'") and text.endswith("'"):
        return text[2:-1]
    return text


def _diagnosis_frame(source_df, column):
    """Copy one DIAG column into its own single-column DataFrame with cleaned codes.

    The column is renamed to "" so every diagnosis frame shares the same
    (empty) feature name when it is one-hot encoded.
    """
    frame = source_df[[column]].copy()
    frame[column] = frame[column].apply(_clean_diagnosis_code)
    frame.columns = [""]
    return frame


# Build one cleaned, unnamed-column DataFrame per diagnosis slot (DIAG1 .. DIAG5)
(diagnosis_1_df,
 diagnosis_2_df,
 diagnosis_3_df,
 diagnosis_4_df,
 diagnosis_5_df) = (
    _diagnosis_frame(nhamcs_cleaned_df, f"DIAG{position}") for position in range(1, 6)
)

# View the first diagnosis DataFrame
display(diagnosis_1_df)

Unnamed: 0,Unnamed: 1
0,S810
1,J209
2,ZZZ1
3,J050
4,S672
...,...
87543,K529
87544,R109
87545,R05-
87546,S621


In [7]:
# Encode the diagnosis dataframes

def _one_hot_diagnosis(diagnosis_df):
    """One-hot encode a single-column diagnosis DataFrame.

    Returns the fitted encoder, the dense encoded array, and a DataFrame of the
    encoded columns whose positional row number is exposed as an "index" column
    (via reset_index).
    """
    encoder = OneHotEncoder(sparse_output = False)
    encoded_data = encoder.fit_transform(diagnosis_df)
    encoded_df = pd.DataFrame(
        encoded_data,
        columns = encoder.get_feature_names_out([""])
    ).reset_index()
    return encoder, encoded_data, encoded_df


# A separate encoder is fitted for each diagnosis slot
enc_1, encoded_diagnosis_1_data, encoded_diagnosis_1_df = _one_hot_diagnosis(diagnosis_1_df)
enc_2, encoded_diagnosis_2_data, encoded_diagnosis_2_df = _one_hot_diagnosis(diagnosis_2_df)
enc_3, encoded_diagnosis_3_data, encoded_diagnosis_3_df = _one_hot_diagnosis(diagnosis_3_df)
enc_4, encoded_diagnosis_4_data, encoded_diagnosis_4_df = _one_hot_diagnosis(diagnosis_4_df)
enc_5, encoded_diagnosis_5_data, encoded_diagnosis_5_df = _one_hot_diagnosis(diagnosis_5_df)

In [8]:
# Aggregate the diagnosis DataFrames: stack the five encoded frames, then sum the rows
# that share the same "index" value so each row number ends up with one combined row
encoded_diagnosis_frames = [
    encoded_diagnosis_1_df,
    encoded_diagnosis_2_df,
    encoded_diagnosis_3_df,
    encoded_diagnosis_4_df,
    encoded_diagnosis_5_df,
]
encoded_diagnosis_df = pd.concat(encoded_diagnosis_frames).groupby(by = ["index"]).sum()

In [9]:
# Flatten any duplicate diagnosis to 1.
# A vectorised comparison replaces DataFrame.applymap, which is deprecated in
# recent pandas releases and calls a Python lambda once per cell.
# Values >= 1 become 1, everything else (including NaN) becomes 0.
encoded_diagnosis_df = (encoded_diagnosis_df >= 1).astype("int64")

# Remove "_" from column names
encoded_diagnosis_df.columns = [name.strip("_") for name in encoded_diagnosis_df.columns]

# Drop the column corresponding to no diagnosis
encoded_diagnosis_df = encoded_diagnosis_df.drop(columns = "-9")

# Drop the columns corresponding to Dementia/Alzheimer related diagnosis
encoded_diagnosis_df = encoded_diagnosis_df.drop(columns = ["F039", "F028", "F01-", "G301", "G309"])

# Display the encoded diagnosis DataFrame
encoded_diagnosis_df

Unnamed: 0_level_0,A02-,A029,A047,A049,A059,A07-,A079,A08-,A081,A083,...,Y920,Z123,Z593,Z759,Z801,Z809,Z813,Z852,Z906,Z953
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87543,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87544,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87545,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87546,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Swap the raw "DIAG" columns for their one-hot encoded counterparts

# Remove DIAG1..DIAG5 and append the encoded diagnosis columns side by side
nhamcs_cleaned_df = pd.concat(
    [
        nhamcs_cleaned_df.drop(columns = [f"DIAG{slot}" for slot in range(1, 6)]),
        encoded_diagnosis_df,
    ],
    axis = "columns",
)

# Display the cleaned NHAMCS DataFrame
nhamcs_cleaned_df

Unnamed: 0,AGE,SEX,ETHIM,RACEUN,ETOHAB,ALZHD,ASTHMA,CANCER,CEBVD,CKD,...,Y920,Z123,Z593,Z759,Z801,Z809,Z813,Z852,Z906,Z953
0,27.0,MALE,Not Hispanic or Latino,White Only,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,FEMALE,Hispanic or Latino,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,52.0,MALE,Not Hispanic or Latino,White Only,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,5.0,MALE,Not Hispanic or Latino,White Only,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4.0,FEMALE,Not Hispanic or Latino,White Only,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87543,8.0,MALE,Hispanic or Latino,White Only,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
87544,2.0,FEMALE,Hispanic or Latino,White Only,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
87545,1.0,MALE,Not Hispanic or Latino,White Only,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
87546,4.0,FEMALE,Hispanic or Latino,White Only,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Export the cleaned DataFrame
# nhamcs_cleaned_df[:14000].to_csv(Path("Resources/2017-2021 NHAMCS Data/NHAMCS_Data_Cleaned_1"), index = True)
# nhamcs_cleaned_df[14000:28000].to_csv(Path("Resources/2017-2021 NHAMCS Data/NHAMCS_Data_Cleaned_2"), index = True)
# nhamcs_cleaned_df[28000:42000].to_csv(Path("Resources/2017-2021 NHAMCS Data/NHAMCS_Data_Cleaned_3"), index = True)
# nhamcs_cleaned_df[42000:56000].to_csv(Path("Resources/2017-2021 NHAMCS Data/NHAMCS_Data_Cleaned_4"), index = True)
# nhamcs_cleaned_df[56000:70000].to_csv(Path("Resources/2017-2021 NHAMCS Data/NHAMCS_Data_Cleaned_5"), index = True)
# nhamcs_cleaned_df[70000:84000].to_csv(Path("Resources/2017-2021 NHAMCS Data/NHAMCS_Data_Cleaned_6"), index = True)
# nhamcs_cleaned_df[84000:].to_csv(Path("Resources/2017-2021 NHAMCS Data/NHAMCS_Data_Cleaned_7"), index = True)

# Read the cleaned DataFrame
# All seven parts are read first and combined with a single concat. This avoids
# concatenating onto an initially empty DataFrame (which recent pandas versions
# warn about) and avoids re-copying the accumulated rows on every iteration.
nhamcs_cleaned_df = pd.concat(
    [
        pd.read_csv(Path(f"Resources/2017-2021 NHAMCS Data/NHAMCS_Data_Cleaned_{part_number}"), index_col = [0])
        for part_number in range(1, 8)
    ],
    axis = "rows",
)

nhamcs_cleaned_df

Unnamed: 0,AGE,SEX,ETHIM,RACEUN,ETOHAB,ALZHD,ASTHMA,CANCER,CEBVD,CKD,...,Y920,Z123,Z593,Z759,Z801,Z809,Z813,Z852,Z906,Z953
0,27.0,MALE,Not Hispanic or Latino,White Only,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,FEMALE,Hispanic or Latino,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,52.0,MALE,Not Hispanic or Latino,White Only,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,5.0,MALE,Not Hispanic or Latino,White Only,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4.0,FEMALE,Not Hispanic or Latino,White Only,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87543,8.0,MALE,Hispanic or Latino,White Only,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
87544,2.0,FEMALE,Hispanic or Latino,White Only,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
87545,1.0,MALE,Not Hispanic or Latino,White Only,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
87546,4.0,FEMALE,Hispanic or Latino,White Only,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Plot the data

## Encode Categorical Features

In [17]:
# Encode with OneHotEncoder

# Create list with variables to encode
nhamcs_categorical_variables = ["SEX", "ETHIM", "RACEUN"]

# Create a OneHotEncoder instance
nhamcs_enc = OneHotEncoder(sparse_output = False)

# Encode the variables using OneHotEncoder
nhamcs_encoded_data = nhamcs_enc.fit_transform(nhamcs_cleaned_df[nhamcs_categorical_variables])

# Create a DataFrame with the encoded variables.
# The cleaned frame's index is reused so that the column-wise concat performed
# afterwards aligns rows by label even if that index is not a plain 0..n-1 range;
# a default RangeIndex would silently misalign (or NaN-pad) rows in that case.
nhamcs_encoded_df = pd.DataFrame(
    nhamcs_encoded_data,
    columns = nhamcs_enc.get_feature_names_out(nhamcs_categorical_variables),
    index = nhamcs_cleaned_df.index
)

# Review the DataFrame
nhamcs_encoded_df.head()

Unnamed: 0,SEX_FEMALE,SEX_MALE,ETHIM_Hispanic or Latino,ETHIM_Not Hispanic or Latino,RACEUN_American Indian/Alaska Native Only,RACEUN_Asian Only,RACEUN_Black/African American Only,RACEUN_More than one race,RACEUN_Native Hawaiian/Oth Pac Isl Only,RACEUN_White Only,RACEUN_nan
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [18]:
# Join the one-hot columns with the remaining (non-categorical) columns of the cleaned DataFrame
nhamcs_encoded_df = pd.concat(
    [
        nhamcs_encoded_df,
        nhamcs_cleaned_df.drop(columns = nhamcs_categorical_variables),
    ],
    axis = 1,
)

# Review the first rows
nhamcs_encoded_df.head()

Unnamed: 0,SEX_FEMALE,SEX_MALE,ETHIM_Hispanic or Latino,ETHIM_Not Hispanic or Latino,RACEUN_American Indian/Alaska Native Only,RACEUN_Asian Only,RACEUN_Black/African American Only,RACEUN_More than one race,RACEUN_Native Hawaiian/Oth Pac Isl Only,RACEUN_White Only,...,Y920,Z123,Z593,Z759,Z801,Z809,Z813,Z852,Z906,Z953
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0


## Create the Features and Target

In [19]:
# Target: the "ALZHD" column
nhamcs_y = nhamcs_encoded_df["ALZHD"]

# Features: an independent copy of every other column
nhamcs_X = nhamcs_encoded_df.drop(columns = ["ALZHD"]).copy()

In [20]:
# Check the class balance of the target: number of rows per "ALZHD" value
nhamcs_y.value_counts()

0.0    86369
1.0     1179
Name: ALZHD, dtype: int64

## Split Data into Training and Testing Datasets

In [21]:
# Generate training and testing dataframes
# random_state fixes the shuffle for reproducibility; stratify = nhamcs_y keeps the
# ALZHD class proportions the same in the training and testing splits
nhamcs_X_train, nhamcs_X_test, nhamcs_y_train, nhamcs_y_test = train_test_split(nhamcs_X, nhamcs_y, random_state = 1, stratify = nhamcs_y)

In [22]:
# Check the class balance of the training target after the stratified split
nhamcs_y_train.value_counts()

0.0    64777
1.0      884
Name: ALZHD, dtype: int64

## Scale Continuous Features

In [23]:
# Standardise the features with StandardScaler

# Create the scaler and fit it on the training features only
nhamcs_scaler = StandardScaler()
nhamcs_X_scaler = nhamcs_scaler.fit(nhamcs_X_train)

# Transform the training and testing features with the fitted scaler
nhamcs_X_train_scaled, nhamcs_X_test_scaled = (
    nhamcs_X_scaler.transform(features)
    for features in (nhamcs_X_train, nhamcs_X_test)
)

# Save the scaler for making predictions
joblib.dump(
    nhamcs_scaler,
    Path("Resources/2017-2021 NHAMCS Data/nhamcs_scaler.bin"),
    compress = True,
)

['Resources\\2017-2021 NHAMCS Data\\nhamcs_scaler.bin']

## Fit a Machine Learning Model and Make Predictions

In [22]:
# Instantiate a ML Classifier and fit the training data

# Import the SVC (support vector classifier) class from scikit-learn
from sklearn.svm import SVC

# Instantiate the support vector classifier with a fixed random_state
svm_model = SVC(random_state = 1)

# Fit the model using the scaled training features and the training target
svm_model.fit(nhamcs_X_train_scaled, nhamcs_y_train)

## Evaluate the ML Model

In [71]:
# Generate the SVC predictions for the scaled testing features
nhamcs_svm_predictions = svm_model.predict(nhamcs_X_test_scaled)

In [24]:
# Build the classification report (per-class precision, recall, f1-score and support)
# by comparing the SVC predictions with the true testing labels
nhamcs_svm_testing_report = classification_report(nhamcs_y_test, nhamcs_svm_predictions)

# The report is a formatted string, so print() preserves its table layout
print(nhamcs_svm_testing_report)

              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     21592
         1.0       0.94      0.05      0.10       295

    accuracy                           0.99     21887
   macro avg       0.96      0.53      0.54     21887
weighted avg       0.99      0.99      0.98     21887



## Save the ML Model

In [25]:
# Persist the fitted SVC model. The context manager guarantees the file handle is
# flushed and closed even if pickling fails (a bare open() was never closed).
with open(Path("Resources/2017-2021 NHAMCS Data/svm_model.sav"), "wb") as svm_model_file:
    pickle.dump(svm_model, svm_model_file)

## Fit a Second Machine Learning Model and Make Predictions

In [26]:
# Instantiate a ML Classifier and fit the training data

# Import the k-nearest neighbors classifier from scikit-learn
from sklearn.neighbors import KNeighborsClassifier

# Initiate the model instance; each prediction is based on the 3 nearest training samples
knn_model = KNeighborsClassifier(n_neighbors = 3)

# Fit the model using the scaled training features and the training target
knn_model.fit(nhamcs_X_train_scaled, nhamcs_y_train)

## Evaluate the Second ML Model

In [74]:
# Generate the KNN predictions for the scaled testing features
nhamcs_knn_predictions = knn_model.predict(nhamcs_X_test_scaled)

In [75]:
# Build the classification report by comparing the KNN predictions with the true testing labels
nhamcs_knn_testing_report = classification_report(nhamcs_y_test, nhamcs_knn_predictions)

# The report is a formatted string, so print() preserves its table layout
print(nhamcs_knn_testing_report)

              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     21592
         1.0       0.47      0.09      0.16       295

    accuracy                           0.99     21887
   macro avg       0.73      0.55      0.58     21887
weighted avg       0.98      0.99      0.98     21887



## Save the Second ML Model

In [29]:
# Persist the fitted KNN model. The context manager guarantees the file handle is
# flushed and closed even if pickling fails (a bare open() was never closed).
with open(Path("Resources/2017-2021 NHAMCS Data/knn_model.sav"), "wb") as knn_model_file:
    pickle.dump(knn_model, knn_model_file)

## Fit a Third Machine Learning Model and Make Predictions

In [30]:
# Instantiate a ML Classifier and fit the training data

# Import Gradient Tree Boosting Classifier from SKLearn
from sklearn.ensemble import GradientBoostingClassifier

# Initiate the model instance: 100 boosting stages of depth-3 trees, learning rate 0.1,
# with a fixed random_state for reproducibility
gtb_model = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 1)

# Fit the model using the scaled training features and the training target
gtb_model.fit(nhamcs_X_train_scaled, nhamcs_y_train)

## Evaluate the Third ML Model

In [78]:
# Generate the gradient boosting predictions for the scaled testing features
nhamcs_gtb_predictions = gtb_model.predict(nhamcs_X_test_scaled)

In [32]:
# Build the classification report by comparing the gradient boosting predictions
# with the true testing labels
nhamcs_gtb_testing_report = classification_report(nhamcs_y_test, nhamcs_gtb_predictions)

# The report is a formatted string, so print() preserves its table layout
print(nhamcs_gtb_testing_report)

              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     21592
         1.0       0.62      0.09      0.15       295

    accuracy                           0.99     21887
   macro avg       0.80      0.54      0.57     21887
weighted avg       0.98      0.99      0.98     21887



## Save the Third ML Model

In [33]:
# Persist the fitted gradient boosting model. The context manager guarantees the file
# handle is flushed and closed even if pickling fails (a bare open() was never closed).
with open(Path("Resources/2017-2021 NHAMCS Data/gtb_model.sav"), "wb") as gtb_model_file:
    pickle.dump(gtb_model, gtb_model_file)

## Compare and Choose the best of the Three ML Models

In [80]:
# Re-print the SVC report under a heading so the three models can be compared side by side
print("Model 1 (Support Vector Machines Algorithm) Results")
# Print the classification report
print(nhamcs_svm_testing_report)

Model 1 (Support Vector Machines Algorithm) Results
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     21592
         1.0       0.94      0.05      0.10       295

    accuracy                           0.99     21887
   macro avg       0.96      0.53      0.54     21887
weighted avg       0.99      0.99      0.98     21887



In [81]:
# Re-print the KNN report under a heading so the three models can be compared side by side
print("Model 2 (k-Nearest Neighbors Classifier) Results")
# Print the classification report
print(nhamcs_knn_testing_report)

Model 2 (k-Nearest Neighbors Classifier) Results
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     21592
         1.0       0.47      0.09      0.16       295

    accuracy                           0.99     21887
   macro avg       0.73      0.55      0.58     21887
weighted avg       0.98      0.99      0.98     21887



In [82]:
# Re-print the gradient boosting report under a heading so the three models can be compared side by side
print("Model 3 (Gradient Boosting Classifier) Results")
# Print the classification report
print(nhamcs_gtb_testing_report)

Model 3 (Gradient Boosting Classifier) Results
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     21592
         1.0       0.62      0.09      0.15       295

    accuracy                           0.99     21887
   macro avg       0.80      0.54      0.57     21887
weighted avg       0.98      0.99      0.98     21887



## Optimize the best ML Model

### Resample Training Data

In [35]:
# Import the SMOTEENN class from imbalanced-learn
from imblearn.combine import SMOTEENN

# Instantiate the SMOTEENN resampler with a fixed random_state
smote_enn = SMOTEENN(random_state=1)

# Resample the original (unscaled) training data; only the training split is
# resampled, the testing split is left untouched
nhamcs_X_resampled, nhamcs_y_resampled = smote_enn.fit_resample(nhamcs_X_train, nhamcs_y_train)

In [36]:
# Count the rows per class in the resampled training target to confirm the new balance
nhamcs_y_resampled.value_counts()

1.0    64772
0.0    58036
Name: ALZHD, dtype: int64

In [37]:
# Standardise the resampled features with a dedicated StandardScaler

# Create the scaler and fit it on the resampled training features
nhamcs_resampled_scaler = StandardScaler()
nhamcs_X_resampled_scaler = nhamcs_resampled_scaler.fit(nhamcs_X_resampled)

# Transform the resampled training features and the original testing features
nhamcs_X_train_resampled_scaled, nhamcs_X_test_resampled_scaled = (
    nhamcs_X_resampled_scaler.transform(features)
    for features in (nhamcs_X_resampled, nhamcs_X_test)
)

In [38]:
# Initiate the model instance with the same hyperparameters as gtb_model;
# only the training data (resampled) differs
opt_gtb_model = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 1)

# Fit the model using the scaled, resampled training data
opt_gtb_model.fit(nhamcs_X_train_resampled_scaled, nhamcs_y_resampled)

### Evaluate the Optimized Model

In [39]:
# Generate the predictions for the testing features, scaled with the scaler that was
# fitted on the resampled training data
nhamcs_opt_gtb_predictions = opt_gtb_model.predict(nhamcs_X_test_resampled_scaled)

In [40]:
# Build the classification report by comparing the optimized model's predictions
# with the true (un-resampled) testing labels
nhamcs_opt_gtb_testing_report = classification_report(nhamcs_y_test, nhamcs_opt_gtb_predictions)

# The report is a formatted string, so print() preserves its table layout
print(nhamcs_opt_gtb_testing_report)

              precision    recall  f1-score   support

         0.0       1.00      0.92      0.96     21592
         1.0       0.11      0.77      0.20       295

    accuracy                           0.91     21887
   macro avg       0.55      0.84      0.58     21887
weighted avg       0.98      0.91      0.94     21887



### Save the Optimized Model

In [41]:
# Persist the optimized gradient boosting model. The context manager guarantees the file
# handle is flushed and closed even if pickling fails (a bare open() was never closed).
with open(Path("Resources/2017-2021 NHAMCS Data/opt_gtb_model.sav"), "wb") as opt_gtb_model_file:
    pickle.dump(opt_gtb_model, opt_gtb_model_file)

## Fit a Neural Network Model and make Predictions

In [11]:
# Define number of inputs, hidden layers and outputs

# One input per feature column
num_input_features = len(nhamcs_X_train.columns)

# Single output neuron for the binary target
num_output_neurons = 1

# Upper bound on the number of hidden layers that add_layers builds
num_hidden_layers = 8

In [12]:
# Build the model architecture

# Create a Sequential model instance
nhamcs_nn = Sequential()

# Add the layers to the model
def add_layers(nn, num_input_features, num_output_neurons, num_hidden_layers):
    """Append a stack of ReLU hidden layers and a sigmoid output layer to nn.

    The first hidden layer has half as many units as there are input features
    (rounded up); each following hidden layer again halves the previous width
    (rounded up). The number of additional hidden layers is capped both by
    floor(log2(first width)) and by num_hidden_layers - 1.

    Parameters:
        nn: model exposing an add() method; it is modified in place.
        num_input_features: number of input features.
        num_output_neurons: number of units in the output layer.
        num_hidden_layers: maximum number of hidden layers.

    Returns:
        None
    """
    first_width = (num_input_features + 1) // 2

    # First hidden layer also declares the input dimension
    nn.add(Dense(units = first_width, input_dim = num_input_features, activation = "relu"))

    # Work out the widths of the remaining hidden layers up front
    extra_layer_count = min(int(np.floor(np.log2(first_width))), num_hidden_layers - 1)
    widths = [first_width]
    while len(widths) <= extra_layer_count:
        widths.append((widths[-1] + 1) // 2)

    # Remaining hidden layers (widths[0] has already been added above)
    for width in widths[1:]:
        nn.add(Dense(units = width, activation = "relu"))

    # Output layer
    nn.add(Dense(units = num_output_neurons, activation = "sigmoid"))

    return None

add_layers(nhamcs_nn, num_input_features, num_output_neurons, num_hidden_layers)

In [13]:
# Display the Sequential model summary (layers, output shapes and parameter counts)
nhamcs_nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1727)              5966785   
                                                                 
 dense_1 (Dense)             (None, 864)               1492992   
                                                                 
 dense_2 (Dense)             (None, 432)               373680    
                                                                 
 dense_3 (Dense)             (None, 216)               93528     
                                                                 
 dense_4 (Dense)             (None, 108)               23436     
                                                                 
 dense_5 (Dense)             (None, 54)                5886      
                                                                 
 dense_6 (Dense)             (None, 27)                1

In [14]:
# Compile the Sequential model: binary cross-entropy loss (matches the single sigmoid
# output), the Adam optimizer, and accuracy tracked as the reported metric
nhamcs_nn.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

In [15]:
# Fit the model using 50 epochs and the scaled training data;
# the value returned by fit() is kept in fit_nhamcs_nn_model
fit_nhamcs_nn_model = nhamcs_nn.fit(nhamcs_X_train_scaled, nhamcs_y_train, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


## Evaluate the Neural Network Model

In [16]:
# Evaluate relevant metrics

# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
# NOTE(review): only about 1.3% of the ALZHD targets are 1.0 (see the value counts earlier in
# this notebook), so accuracy alone says little about the positive class; consider also
# reporting precision/recall as was done for the scikit-learn models.
nhamcs_nn_model_loss, nhamcs_nn_model_accuracy = nhamcs_nn.evaluate(nhamcs_X_test_scaled, nhamcs_y_test, verbose = 2)

# Display the model loss and accuracy results
print(f"Loss: {nhamcs_nn_model_loss}, Accuracy: {nhamcs_nn_model_accuracy}")

684/684 - 5s - loss: 0.3229 - accuracy: 0.9822 - 5s/epoch - 7ms/step
Loss: 0.3228524327278137, Accuracy: 0.982181191444397


## Save the Neural Network Model

In [17]:
# Export the trained neural network model to an HDF5 (.h5) file
nhamcs_nn.save(Path("Resources/2017-2021 NHAMCS Data/nn_model.h5"))

  saving_api.save_model(


## Optimize the Neural Network Model

- Add dropout layers
- Add regularizers

In [18]:
# Define number of inputs, hidden layers and outputs (same values as for the first network)

# One input per feature column
num_input_features = len(nhamcs_X_train.columns)

# Single output neuron for the binary target
num_output_neurons = 1

# Upper bound on the number of hidden layers that add_layers_opt builds
num_hidden_layers = 8

In [19]:
# Build the optimized model architecture

# Create a Sequential model instance
nhamcs_nn_opt = Sequential()

# Add the layers to the model
def add_layers_opt(nn, num_input_features, num_output_neurons, num_hidden_layers):
    """Append regularized and dropout-protected hidden layers plus a sigmoid output to nn.

    Layer widths halve (rounded up) from one layer to the next, starting at half
    the number of input features. The first hidden layer and the last hidden
    layer before the output carry L2 kernel/bias regularizers; every hidden
    layer in between is a plain ReLU layer followed by 20% dropout. Halving
    stops before a width of 1 would be reached, and at most
    min(floor(log2(first width)), num_hidden_layers - 1) extra layers are added.

    Parameters:
        nn: model exposing an add() method; it is modified in place.
        num_input_features: number of input features.
        num_output_neurons: number of units in the output layer.
        num_hidden_layers: maximum number of hidden layers.

    Returns:
        None
    """
    width = (num_input_features + 1) // 2

    # First hidden layer, with L2 regularizers
    nn.add(Dense(units = width,
                 input_dim = num_input_features,
                 activation = "relu",
                 kernel_regularizer = l2(0.01),
                 bias_regularizer = l2(0.01)))

    max_extra_layers = min(int(np.floor(np.log2(width))), num_hidden_layers - 1)

    for step in range(max_extra_layers):
        halved_width = (width + 1) // 2

        # Never add a hidden layer that would be a single unit wide
        if halved_width == 1:
            break

        width = halved_width
        closes_hidden_stack = halved_width == 2 or step == max_extra_layers - 1

        if closes_hidden_stack:
            # Final hidden layer before the output, with L2 regularizers
            nn.add(Dense(units = width,
                         activation = "relu",
                         kernel_regularizer = l2(0.01),
                         bias_regularizer = l2(0.01)))
            break

        # Intermediate hidden layer followed by dropout
        nn.add(Dense(units = width, activation = "relu"))
        nn.add(Dropout(.2, input_shape = (width,)))

    # Output layer
    nn.add(Dense(units = num_output_neurons, activation = "sigmoid"))

    return None

add_layers_opt(nhamcs_nn_opt, num_input_features, num_output_neurons, num_hidden_layers)

In [20]:
# Display the optimized Sequential model summary (layers, output shapes and parameter counts)
nhamcs_nn_opt.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 1727)              5966785   
                                                                 
 dense_10 (Dense)            (None, 864)               1492992   
                                                                 
 dropout (Dropout)           (None, 864)               0         
                                                                 
 dense_11 (Dense)            (None, 432)               373680    
                                                                 
 dropout_1 (Dropout)         (None, 432)               0         
                                                                 
 dense_12 (Dense)            (None, 216)               93528     
                                                                 
 dropout_2 (Dropout)         (None, 216)              

In [21]:
# Compile the optimized Sequential model with the same loss, optimizer and metric
# as the first network
nhamcs_nn_opt.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

In [22]:
# Fit the optimized model using 50 epochs and the scaled training data;
# the value returned by fit() is kept in fit_nhamcs_nn_opt_model
fit_nhamcs_nn_opt_model = nhamcs_nn_opt.fit(nhamcs_X_train_scaled, nhamcs_y_train, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [23]:
# Evaluate relevant metrics

# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
# NOTE(review): only about 1.3% of the ALZHD targets are 1.0 (see the value counts earlier in
# this notebook), so accuracy alone says little about the positive class; consider also
# reporting precision/recall as was done for the scikit-learn models.
nhamcs_nn_opt_model_loss, nhamcs_nn_opt_model_accuracy = nhamcs_nn_opt.evaluate(nhamcs_X_test_scaled, nhamcs_y_test, verbose = 2)

# Display the model loss and accuracy results
print(f"Loss: {nhamcs_nn_opt_model_loss}, Accuracy: {nhamcs_nn_opt_model_accuracy}")

684/684 - 5s - loss: 0.1697 - accuracy: 0.9868 - 5s/epoch - 7ms/step
Loss: 0.16971436142921448, Accuracy: 0.9867501258850098


In [24]:
# Export the optimized neural network model to an HDF5 (.h5) file
nhamcs_nn_opt.save(Path("Resources/2017-2021 NHAMCS Data/nn_opt_model.h5"))