# Lab 4: Neural Networks with Keras

## Data

In [73]:
#| echo: False
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score

First we will begin by importing the data, the dataset warns that the data is imbalanced, so we will examine that aspect next. One nice thing about the dataset is that it is completely numeric, meaning there is no need for OneHotEncoding to deal with categorical variables.

In [9]:
data = pd.read_csv("/Users/Bnkes/Desktop/GitHub/AdvancedMachineLearning/Data/DiabetesData/diabetes_binary_health_indicators_BRFSS2015.csv")
data.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [24]:
data["Diabetes_binary"].value_counts()

Diabetes_binary
0.0    218334
1.0     35346
Name: count, dtype: int64

As can be see above, the majority of the target variable are people without diabetes (~86% of the dataset). This suggests that we may want to pursue some form of sampling to get rid of this bias, but first we will begin without sampling the data so as to establish a baseline score.

For each sampling technique, I will try three different neural networks, as well as a random forest model.

## Sampling Method 1: No Sampling

In [53]:
X = data.drop("Diabetes_binary", axis = 1)
y = data["Diabetes_binary"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

### Random Forest Baseline Model

In [55]:
ct = ColumnTransformer(
    [
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ]
)

In [58]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("forest", RandomForestClassifier(n_jobs = -1))
    ]
)

parameters = {
    "forest__min_samples_leaf": [1, 2, 3, 4, 5, 10, 15, 25],
    "forest__min_samples_split": [2, 3, 4, 5, 10, 15, 25],
    "forest__ccp_alpha": [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='f1', n_jobs=1, verbose = 1)
gscv_fitted = gscv.fit(X_train, y_train)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

Fitting 5 folds for each of 392 candidates, totalling 1960 fits


In [60]:
pd.DataFrame(gscv_fitted.cv_results_).sort_values(by = "rank_test_score", ascending = True).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_forest__ccp_alpha,param_forest__min_samples_leaf,param_forest__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
57,1.202987,0.019463,0.086366,0.001047,1e-06,1,3,"{'forest__ccp_alpha': 1e-06, 'forest__min_samp...",0.255337,0.244863,0.258777,0.250499,0.262461,0.254388,0.006184,1
1,1.089153,0.012393,0.086221,0.001327,0.0,1,3,"{'forest__ccp_alpha': 1e-07, 'forest__min_samp...",0.255973,0.246692,0.256084,0.252268,0.256468,0.253497,0.003727,2
56,1.124127,0.0138,0.08624,0.001455,1e-06,1,2,"{'forest__ccp_alpha': 1e-06, 'forest__min_samp...",0.2506,0.243117,0.254105,0.25598,0.261944,0.253149,0.00622,3
58,1.337528,0.024672,0.079644,0.005414,1e-06,1,4,"{'forest__ccp_alpha': 1e-06, 'forest__min_samp...",0.252135,0.243564,0.252804,0.248989,0.255734,0.250645,0.004139,4
0,1.13933,0.010523,0.133413,0.087801,0.0,1,2,"{'forest__ccp_alpha': 1e-07, 'forest__min_samp...",0.246348,0.242544,0.253482,0.255778,0.253076,0.250246,0.004972,5


In [66]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("forest", RandomForestClassifier(n_jobs=-1, ccp_alpha=1e-6, min_samples_leaf=1, min_samples_split=3))
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

In [68]:
y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual No Diabetes", "Actual Diabetes"], columns=["Predicted No Diabetes", "Predicted Diabetes"])

cm_df

Unnamed: 0,Predicted No Diabetes,Predicted Diabetes
Actual No Diabetes,216855,1479
Actual Diabetes,10496,24850


In [75]:
f1 = f1_score(y_true=y, y_pred = y_pred)
print(f"F1 Score for Random Forest Classifier: {f1}")

F1 Score for Random Forest Classifier: 0.805837049047426


### Neural Network 1
We will begin with a simple neural network. It will have three layers all with the input size of 21.

In [210]:
inputs = keras.Input(shape = (21, ))
x = layers.Dense(21, activation="relu")(inputs)
x = layers.Dense(21, activation="relu")(x)
outputs = layers.Dense(1, activation = "relu")(x)
model = keras.Model(inputs=inputs, outputs=outputs, name="diabetes_model_1")
model.summary()

In [212]:
model.compile(
    loss=keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer=keras.optimizers.RMSprop(),
    metrics=[keras.metrics.Recall()],
)

history = model.fit(X_train, y_train, batch_size = 64, epochs=10, callbacks=keras.callbacks.EarlyStopping(start_from_epoch=5), validation_split=.2)

Epoch 1/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 724us/step - loss: 1.6947 - recall_1: 0.1331 - val_loss: 0.9429 - val_recall_1: 0.1389
Epoch 2/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 667us/step - loss: 0.9274 - recall_1: 0.2510 - val_loss: 1.0344 - val_recall_1: 0.0023
Epoch 3/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 665us/step - loss: 0.5755 - recall_1: 0.2156 - val_loss: 0.4142 - val_recall_1: 0.3573
Epoch 4/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 660us/step - loss: 0.4944 - recall_1: 0.1719 - val_loss: 0.5739 - val_recall_1: 0.0710
Epoch 5/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 659us/step - loss: 0.5285 - recall_1: 0.1976 - val_loss: 0.4162 - val_recall_1: 0.1571
Epoch 6/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 663us/step - loss: 0.4810 - recall_1: 0.1325 - val_loss: 0.4396 - val_recall_1: 0.2965
Epoc

In [214]:
scores = model.evaluate(X_test, y_test, verbose=2)

1982/1982 - 1s - 424us/step - loss: 0.6782 - recall_1: 0.1165


In [216]:
y_pred_prob = model.predict(X)  # Get predicted probabilities for each class
y_pred = np.argmax(y_pred_prob, axis=1)  # Convert probabilities to class labels
pd.DataFrame(confusion_matrix(y, y_pred), columns=["Predicted No Diabetes", "Predicted Diabetes"], index = ["Actual No Diabetes", "Actual Diabetes"])

[1m7928/7928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 479us/step


Unnamed: 0,Predicted No Diabetes,Predicted Diabetes
Actual No Diabetes,218334,0
Actual Diabetes,35346,0


In this first neural network, we can see extreme overfitting. This is likely a result of the not employing any sampling techniques, however we will attempt to fix this by making some changes to the neural network first.

### Neural Network 2
With this neural network, we will add in dropout layers and change the activation function of the network.

In [154]:
inputs = keras.Input(shape = (21, ))
x = layers.Dense(21, activation="linear")(inputs)
x = layers.Dropout(rate=.1)(x)
x = layers.Dense(21, activation="linear")(x)
x = layers.Dropout(rate=.1)(x)
outputs = layers.Dense(1, activation = "relu")(x)
model = keras.Model(inputs=inputs, outputs=outputs, name="diabetes_model_1")
model.summary()

In [158]:
model.compile(
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=[keras.metrics.binary_accuracy],
)

history = model.fit(X_train, y_train, batch_size = 64, epochs=10, callbacks=keras.callbacks.EarlyStopping(start_from_epoch=5), validation_split=.2)

Epoch 1/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 778us/step - binary_accuracy: 0.8610 - loss: 0.6932 - val_binary_accuracy: 0.8613 - val_loss: 0.6931
Epoch 2/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 702us/step - binary_accuracy: 0.8614 - loss: 0.6932 - val_binary_accuracy: 0.8613 - val_loss: 0.6931
Epoch 3/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 711us/step - binary_accuracy: 0.8609 - loss: 0.6932 - val_binary_accuracy: 0.8613 - val_loss: 0.6931
Epoch 4/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 705us/step - binary_accuracy: 0.8610 - loss: 0.6932 - val_binary_accuracy: 0.8613 - val_loss: 0.6931
Epoch 5/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 715us/step - binary_accuracy: 0.8601 - loss: 0.6932 - val_binary_accuracy: 0.8613 - val_loss: 0.6931
Epoch 6/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 703us/step - binary_a

In [160]:
scores = model.evaluate(X_test, y_test, verbose=2)

1982/1982 - 1s - 417us/step - binary_accuracy: 0.8608 - loss: 0.6931


In [162]:
y_pred_prob = model.predict(X)  # Get predicted probabilities for each class
y_pred = np.argmax(y_pred_prob, axis=1)  # Convert probabilities to class labels
pd.DataFrame(confusion_matrix(y, y_pred), columns=["Predicted No Diabetes", "Predicted Diabetes"], index = ["Actual No Diabetes", "Actual Diabetes"])

[1m7928/7928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 453us/step


Unnamed: 0,Predicted No Diabetes,Predicted Diabetes
Actual No Diabetes,218334,0
Actual Diabetes,35346,0


Adding the dropout layers and changing the activation function did not seem to fix anything, now I will try to change the shape of the network.

### Neural Network 3

In [188]:
inputs = keras.Input(shape = (21, ))
x = layers.Dense(20, activation="relu")(inputs)
x = layers.Dropout(rate=.1)(x)
x = layers.Dense(10, activation="relu")(x)
x = layers.Dropout(rate=.1)(x)
x = layers.Dense(6, activation="relu")(x)
x = layers.Dropout(rate=.1)(x)
outputs = layers.Dense(1, activation = "sigmoid")(x)
model = keras.Model(inputs=inputs, outputs=outputs, name="diabetes_model_1")
model.summary()

In [192]:
model.compile(
    loss=keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer=keras.optimizers.RMSprop(),
    metrics=[keras.metrics.binary_accuracy],
)

history = model.fit(X_train, y_train, batch_size = 64, epochs=10, callbacks=keras.callbacks.EarlyStopping(start_from_epoch=5), validation_split=.2)

Epoch 1/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 790us/step - binary_accuracy: 0.8588 - loss: 0.3319 - val_binary_accuracy: 0.8613 - val_loss: 0.3186
Epoch 2/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 758us/step - binary_accuracy: 0.8600 - loss: 0.3295 - val_binary_accuracy: 0.8613 - val_loss: 0.3187
Epoch 3/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 765us/step - binary_accuracy: 0.8591 - loss: 0.3291 - val_binary_accuracy: 0.8613 - val_loss: 0.3182
Epoch 4/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 755us/step - binary_accuracy: 0.8602 - loss: 0.3276 - val_binary_accuracy: 0.8613 - val_loss: 0.3206
Epoch 5/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 764us/step - binary_accuracy: 0.8602 - loss: 0.3282 - val_binary_accuracy: 0.8613 - val_loss: 0.3178
Epoch 6/10
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 762us/step - binary_a

In [180]:
scores = model.evaluate(X_test, y_test, verbose=2)

1982/1982 - 1s - 442us/step - binary_accuracy: 0.8608 - loss: 0.6931


In [182]:
y_pred_prob = model.predict(X)  # Get predicted probabilities for each class
y_pred = np.argmax(y_pred_prob, axis=1)  # Convert probabilities to class labels
pd.DataFrame(confusion_matrix(y, y_pred), columns=["Predicted No Diabetes", "Predicted Diabetes"], index = ["Actual No Diabetes", "Actual Diabetes"])

[1m7928/7928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 519us/step


Unnamed: 0,Predicted No Diabetes,Predicted Diabetes
Actual No Diabetes,218334,0
Actual Diabetes,35346,0
