In this script, the prediction models from cvd_prediction_models have been revisited for further optimisations. Four out of the five initial prediction models were successfully optimised.

In [None]:
# Import dependencies
import pandas as pd
import tensorflow as tf
import numpy as np
import plotly.graph_objs as go

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from google.colab import files

!pip install -q -U keras-tuner
import keras_tuner as kt

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/127.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/127.9 kB[0m [31m785.7 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m122.9/127.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/950.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m942.1/950.8 kB[0m [31m33.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hUsing TensorFlow backend


In [None]:
# Load the cardiovascular dataset from its published Google Sheets CSV export
CVD_DATA_URL = (
    "https://docs.google.com/spreadsheets/d/e/"
    "2PACX-1vSDchXr1EhgCSsxlxJ3lWPhh1kT5EJS3yv4DJ2YLeMIC3y4uq-Pp4EQknrs9zAiaI3ulne2Jyi6gR6G/"
    "pub?gid=602879552&single=true&output=csv"
)
cvd_df = pd.read_csv(CVD_DATA_URL)

# Preview the first rows of the raw data
cvd_df.head()

Unnamed: 0,general_health,checkup,exercise,heart_disease,skin_cancer,other_cancer,depression,diabetes,arthritis,sex,age_category,height_cm,weight_kg,bmi,smoking_history,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150,32.66,14.54,Yes,0,30,16,12
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165,77.11,28.29,No,0,30,0,4
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163,88.45,33.47,No,4,12,3,16
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180,93.44,28.73,No,0,30,30,8
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191,88.45,24.37,Yes,0,8,4,0


# Data Preprocessing

In [None]:
# Partition the columns by dtype: object columns are one-hot encoded later,
# int64/float64 columns are scaled later
categorical_cols = [col for col, dtype in cvd_df.dtypes.items() if dtype == 'object']
numeric_cols = cvd_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Per-column mean and standard deviation of the numeric features
numeric_values = cvd_df[numeric_cols]
means = numeric_values.mean()
stds = numeric_values.std()

# A value counts as an outlier when it lies more than this many standard
# deviations away from its column mean
threshold_std = 1

# A row is an outlier as soon as ANY of its numeric values is an outlier
deviation = (numeric_values - means).abs()
outliers = deviation.gt(threshold_std * stds).any(axis=1)

# Keep every 'Yes' row, but only the 'No' rows that are not outliers.
# NOTE(review): this filter depends on the label, so after filtering any row
# with a numeric value beyond the threshold is guaranteed to be 'Yes'. A model
# can exploit that, which may inflate the scores reported later - confirm
# this is intended.
has_heart_disease = cvd_df['heart_disease'].eq('Yes')
filtered_rows = has_heart_disease | ~outliers

# Apply the row mask
cvd_df_filtered = cvd_df.loc[filtered_rows]

# Class balance of the target after filtering
cvd_df_filtered['heart_disease'].value_counts()

No     55566
Yes    24081
Name: heart_disease, dtype: int64

In [None]:
# One-hot encode every categorical column, keeping all levels of each column
encoded_df = pd.get_dummies(cvd_df_filtered, drop_first=False, columns=categorical_cols)

# Standardise the numeric columns with StandardScaler
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling of the training data.
scaler = StandardScaler()
scaler.fit(encoded_df[numeric_cols])
encoded_df[numeric_cols] = scaler.transform(encoded_df[numeric_cols])
encoded_df.head()

Unnamed: 0,height_cm,weight_kg,bmi,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption,general_health_Excellent,general_health_Fair,general_health_Good,...,age_category_45-49,age_category_50-54,age_category_55-59,age_category_60-64,age_category_65-69,age_category_70-74,age_category_75-79,age_category_80+,smoking_history_No,smoking_history_Yes
1,-0.83226,-0.393566,-0.00059,-0.556508,0.413674,-1.25037,-0.166917,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,1.184427,0.70489,0.095604,-0.556508,0.413674,1.515171,0.547281,0,0,0,...,0,0,0,0,0,0,1,0,1,0
6,0.512198,-0.881918,-1.213951,-0.556508,-0.472114,-0.512893,-0.881115,0,1,0,...,0,0,0,1,0,0,0,0,0,1
8,-1.101152,-0.698954,-0.182048,-0.556508,-0.725196,-0.144154,-0.166917,0,1,0,...,0,0,0,0,1,0,0,0,0,1
9,-1.101152,0.583139,1.394228,-0.556508,-0.725196,-0.144154,-0.702566,0,1,0,...,0,0,0,0,0,1,0,0,1,0


In [None]:
# Both one-hot columns derived from 'heart_disease' must be kept out of the features
target_dummy_cols = ['heart_disease_Yes', 'heart_disease_No']

# Target: the 'heart_disease_Yes' indicator column
y = encoded_df['heart_disease_Yes']

# Features: everything except the two target indicator columns
X = encoded_df.drop(columns=target_dummy_cols)

# Hold out a test set (seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Oversample the training split only, so that both classes are equally represented;
# the test split is left untouched
ROS = RandomOverSampler(random_state=78)
X_train_resampled, y_train_resampled = ROS.fit_resample(X_train, y_train)

# Confirm that both classes now have the same number of training rows
y_train_resampled.value_counts()

1    41653
0    41653
Name: heart_disease_Yes, dtype: int64

# Model 1: Logistic Regression Optimisation

Hyperparameter tuning was considered for optimising the logistic regression model. The solver adopted in the initial model was 'saga', which is known to work well with larger datasets. Two other solvers were considered for optimisation:

*   lbfgs - this solver did not optimise the model as it is designed....
*   sag - this solver was not used as it is an outdated version of saga and therefore, saga was more suited.

# Model 2: Support Vector Machine Optimisation

The overall accuracy for the SVM model before optimisation was 79% and the model achieved a weighted precision of 81%. One way to attempt to optimise this model is to alter its kernel hyperparameter, as a different kernel may be better suited to the structure of our dataset.

In [None]:
# Initialise an SVM classifier with an RBF kernel and fit it to the
# oversampled (class-balanced) training data
model = SVC(kernel='rbf')
model.fit(X_train_resampled, y_train_resampled)

In [None]:
# Validate with the test data (the held-out split, which was not oversampled)
model.score(X_test, y_test)

0.8918742466854158

In [None]:
# Save model's predictions for the test data
# (despite the `training_` prefix, these are predictions on X_test)
training_predictions = model.predict(X_test)

# Create a confusion matrix comparing the true test labels with the predictions
training_matrix = confusion_matrix(y_test, training_predictions)

# Display the matrix with labelled rows (actual class) and columns (predicted class)
pd.DataFrame(training_matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,12745,1168
Actual 1,985,5014


The findings from the above confusion matrix show that there are now far fewer false negative and false positive cases. This matrix shows that our optimised SVM model has:
*   Correctly predicted 1,900 more individuals who did not have heart disease than in the unoptimised model.
*   Correctly predicted 186 more individuals who did have heart disease than in the unoptimised model.

To showcase the reduction breakdown in false positives and false negatives:
*   186 fewer individuals who actually had heart disease were predicted as not having it in comparison with the unoptimised model.
*   1,900 fewer individuals who did not actually have heart disease were predicted as having it in comparison with the unoptimised model.

The model's ability to minimise false positives and false negatives has improved. It is also still the case that the model is more likely to predict someone who does not have heart disease as having it than the contrary.

In [None]:
# Target prediction categories, listed in label order
# (0 = 'heart_disease_Yes' is 0, 1 = 'heart_disease_Yes' is 1)
target_names = ["No Heart Disease", "Heart Disease"]

# Print a classification report for the test-set predictions
print(classification_report(y_test, training_predictions,
                            target_names=target_names))

                  precision    recall  f1-score   support

No Heart Disease       0.93      0.92      0.92     13913
   Heart Disease       0.81      0.84      0.82      5999

        accuracy                           0.89     19912
       macro avg       0.87      0.88      0.87     19912
    weighted avg       0.89      0.89      0.89     19912



Precision represents the ratio of correctly predicted positive observations to the total predicted positive observations:
* The precision was 2% higher for No Heart Disease in comparison with the unoptimised model, suggesting that the false positive rate has reduced slightly
* The precision was 20% for Heart Disease in comparison with the unoptimised model, suggesting that the false positive rate was higher for those with a heart disease

Recall represents the ratio of correctly predicted positive observations to all actual observations of that class:
* The recall was 14% higher for No Heart Disease in comparison with the unoptimised model, suggesting a significant improvement in the models ability to minimise false negative rates for those without a heart disease
* The recall was 4% higher for Heart Disease in comparison with the unoptimised model, suggesting that the false negative rate has reduced further than in the initial model for those with a heart disease

Considering the harmonic mean of the precision and recall, the overall model's f1 score has increased by 10% compared with our unoptimised model, which shows the model has significantly improved at avoiding false negatives and positives. This has provided assurance that with further tweaking (perhaps during preprocessing) there is potential for this model to become reliable.

# Model 3: Decision Tree Optimisation

The **optimization** process in this case involved tuning the hyperparameters of the Decision Tree model to improve its performance on the test data. Here's how the optimization process occurred:

*Manual Hyperparameter Tuning*:

In the optimisation model, the **hyperparameters** max_depth, min_samples_split, and min_samples_leaf were explicitly set to specific values. This is known as manual hyperparameter tuning.
By setting a maximum depth, the depth of the tree can be limited, which can help avoid overfitting. A tree that is too deep may memorize the CVD training data and perform poorly on new data.
By setting minimum samples for splitting and leaf nodes, the granularity of the splits are controlled, preventing the tree from making very small and potentially noisy splits.

In [None]:
# Create a Decision Tree Classifier with explicitly chosen hyperparameters
# NOTE(review): min_samples_split=2 and min_samples_leaf=1 appear to match
# scikit-learn's defaults, so max_depth=10 would be the only effective change - confirm
clf = DecisionTreeClassifier(
    random_state=42,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1
)

In [None]:
# Fit the decision tree to the oversampled training data
clf.fit(X_train_resampled, y_train_resampled)


In [None]:
# Predict class labels for the held-out test data
y_pred = clf.predict(X_test)

In [None]:
# Evaluate the decision tree on the test data: overall accuracy,
# confusion matrix and per-class precision/recall/f1 report
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

**Results:**

The optimized model (second model) achieved a **higher accuracy** (92.84%) compared to the first model (89.35%).
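The optimized model also had excellent recall for class 0 (1.00) and a higher precision for class 1 (1.00), indicating it correctly predicted all instances of class 0 and improved precision for class 1. Note that recall for class 1 is lower (0.76): 1,426 individuals with heart disease were still predicted as not having it. The complete absence of false positives should also be treated with caution, as the preprocessing step removed numeric outliers only from the "No" heart disease class, which may make some heart disease rows trivially separable.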

In [None]:
# Print the decision tree's test-set accuracy, confusion matrix and classification report
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Accuracy: 0.9283848935315387
Confusion Matrix:
 [[13913     0]
 [ 1426  4573]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95     13913
           1       1.00      0.76      0.87      5999

    accuracy                           0.93     19912
   macro avg       0.95      0.88      0.91     19912
weighted avg       0.94      0.93      0.93     19912



# Model 4: Random Forest Optimisation

Distribution of hyperparameters was defined in the param_dist dictionary. These hyperparameters include:

n_estimators: The number of trees in the forest (100, 200, or 300).

max_depth: The maximum depth of each tree (None, 10, 20, or 30).

min_samples_split: The minimum number of samples required to split a node (2, 5, or 10).

min_samples_leaf: The minimum number of samples required to be a leaf node (1, 2, or 4).

max_features: The number of features to consider for the best split ('sqrt' or 'log2'; the legacy value 'auto' is a deprecated alias of 'sqrt' for classifiers).

In [None]:
# Create the base Random Forest Classifier that the hyperparameter search will tune
# (seeded for reproducibility)
rf_classifier = RandomForestClassifier(random_state=42)


In [None]:
# Define a distribution of hyperparameters for RandomizedSearchCV.
# 'auto' is deliberately not offered for max_features: for classifiers it was
# only an alias of 'sqrt', it has been deprecated since scikit-learn 1.1
# (the source of the warnings during fitting) and newer releases reject it,
# so sampling it either duplicates 'sqrt' or makes the fit fail.
param_dist = {
    'n_estimators': [100, 200, 300],          # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],         # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],        # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],          # Minimum samples required to be a leaf node
    'max_features': ['sqrt', 'log2']         # Number of features to consider for the best split
}

A RandomizedSearchCV object named random_search was created to search for the best hyperparameters using random sampling. This process involves:

Using cross-validation (cv=5) to evaluate the model's performance.
Specifying the number of iterations (n_iter=10) to randomly sample hyperparameters.
Setting n_jobs=-1 to utilize all available CPU cores for parallel processing.
Defining scoring='accuracy' as the evaluation metric.
Setting a random state (random_state=42) for reproducibility.

In [None]:
# Create RandomizedSearchCV to search for the best hyperparameters:
# 10 randomly sampled candidates, each scored on accuracy with 5-fold
# cross-validation, using all available CPU cores
# NOTE(review): the search is later fitted on the oversampled training data, so
# duplicated minority rows can land in both the training and validation folds -
# cross-validated scores may be optimistic.
random_search = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    random_state=42,
)


In [None]:
# Run the randomized search on the oversampled training data; every sampled
# hyperparameter combination is cross-validated
random_search.fit(X_train_resampled, y_train_resampled)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  warn(


The final Random Forest model (final_rf_classifier) using the best hyperparameters obtained from the search was created.



In [None]:
# Get the best hyperparameters found by the randomized search
best_params = random_search.best_params_

In [None]:
# Use the best hyperparameters to create the final Random Forest model
# (same seed as the base classifier used in the search)
final_rf_classifier = RandomForestClassifier(random_state=42, **best_params)


In [None]:
# Fit the final model to the oversampled training data
final_rf_classifier.fit(X_train_resampled, y_train_resampled)

  warn(


In [None]:
# Predict on the test data using the final model
# (this reuses, and therefore overwrites, the decision tree's `y_pred`)
y_pred = final_rf_classifier.predict(X_test)

In [None]:
# Evaluate the final Random Forest on the test data: overall accuracy and
# per-class precision/recall/f1 report
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

The accuracy of the optimized Random Forest Classifier is approximately 92.64%. This indicates that the model correctly predicted the class labels for about 92.64% of the samples in the test dataset.

**Classification Report**:

The classification report provides additional performance metrics beyond accuracy:

Precision measures the proportion of true positive predictions out of all positive predictions. For class 0, precision is 0.93, and for class 1, it's 0.92. This indicates that the model has a high precision for both classes, meaning it makes relatively few false positive errors.

Recall (or sensitivity) measures the proportion of true positive predictions out of all actual positives. For class 0, recall is 0.97, and for class 1, it's 0.83. This suggests that the model is better at identifying true negatives (class 0) than true positives (class 1).

F1-score is the harmonic mean of precision and recall. It provides a balance between precision and recall. For class 0, the F1-score is 0.95, and for class 1, it's 0.87. A high F1-score indicates a good balance between precision and recall.

Support indicates the number of samples in each class.

Macro Avg and Weighted Avg:

The macro average (macro avg) computes the average of metrics across both classes without considering class imbalance. In this case, the macro avg F1-score is approximately 0.91.
The weighted average (weighted avg) computes the average of metrics, weighted by the number of samples in each class. This accounts for class imbalance, and the weighted avg F1-score is approximately 0.93.

In [None]:
# Print the selected hyperparameters together with the final model's
# test-set accuracy and classification report
print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)
print("Classification Report:\n", class_report)

Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None}
Accuracy: 0.9264262756126959
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95     13913
           1       0.92      0.83      0.87      5999

    accuracy                           0.93     19912
   macro avg       0.92      0.90      0.91     19912
weighted avg       0.93      0.93      0.93     19912



# Model 5: Neural Network Optimisation

### Optimisation 1

In [None]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    """Build and compile a Sequential binary classifier from tuner-chosen hyperparameters.

    The tuner chooses:
      * 'activation'  - 'relu' or 'tanh', shared by every hidden layer
      * 'first_units' - width of the first Dense layer
      * 'num_layers'  - how many further Dense layers follow (1 to 5)
      * 'units_<i>'   - width of the i-th of those further layers
    All widths are drawn from the range 1 to 30 in steps of 5.

    Args:
        hp: keras-tuner hyperparameter container, queried through
            ``hp.Choice`` and ``hp.Int``.

    Returns:
        A compiled Sequential model with a single sigmoid output unit, trained
        with binary cross-entropy and the Adam optimizer, tracking accuracy.
    """
    # The input width is taken from the oversampled training features
    n_features = X_train_resampled.shape[1]

    # One activation function is shared by all hidden layers
    hidden_activation = hp.Choice('activation', ['relu', 'tanh'])

    nn = tf.keras.models.Sequential()

    # First layer: tuner-chosen width; it also declares the input dimension
    first_width = hp.Int('first_units', min_value=1, max_value=30, step=5)
    nn.add(tf.keras.layers.Dense(units=first_width,
                                 activation=hidden_activation,
                                 input_dim=n_features))

    # Further hidden layers: tuner-chosen count, each with its own tuner-chosen width
    extra_layer_count = hp.Int('num_layers', 1, 5)
    for layer_idx in range(extra_layer_count):
        layer_width = hp.Int(f'units_{layer_idx}', min_value=1, max_value=30, step=5)
        nn.add(tf.keras.layers.Dense(units=layer_width, activation=hidden_activation))

    # Single sigmoid unit for the binary target
    nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    nn.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    return nn

In [None]:
# Create a `Hyperband()` tuner instance that builds candidates with create_model
# and ranks them by validation accuracy
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

In [None]:
# Run the kerastuner search for best hyperparameters
# NOTE(review): the test split doubles as the validation set here, so the
# hyperparameters are selected on the same data that is later used to report
# test accuracy - consider carving a separate validation split out of the
# training data.
tuner.search(X_train_resampled,y_train_resampled,epochs=20,validation_data=(X_test,y_test))

Trial 60 Complete [00h 02m 06s]
val_accuracy: 0.9128665924072266

Best val_accuracy So Far: 0.9196464419364929
Total elapsed time: 01h 06m 18s


In [None]:
# Get top 3 model hyperparameters and print the values
# (each entry's `.values` is a dict of hyperparameter name -> chosen value)
top_hyper = tuner.get_best_hyperparameters(3)
for param in top_hyper:
  print(param.values)

{'activation': 'relu', 'first_units': 26, 'num_layers': 5, 'units_0': 26, 'units_1': 1, 'units_2': 11, 'units_3': 26, 'units_4': 11, 'tuner/epochs': 20, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}
{'activation': 'relu', 'first_units': 16, 'num_layers': 5, 'units_0': 26, 'units_1': 1, 'units_2': 26, 'units_3': 6, 'units_4': 11, 'tuner/epochs': 20, 'tuner/initial_epoch': 7, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0049'}
{'activation': 'relu', 'first_units': 26, 'num_layers': 3, 'units_0': 21, 'units_1': 6, 'units_2': 21, 'units_3': 16, 'units_4': 11, 'tuner/epochs': 20, 'tuner/initial_epoch': 7, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0012'}


In [None]:
# Evaluate the top 3 models against the test dataset
top_model = tuner.get_best_models(3)
# Use a dedicated loop name: `model` already refers to the fitted SVM
# classifier, and looping over `model` here would silently replace it
for tuned_model in top_model:
  # verbose=2 makes evaluate() print one loss/accuracy line per model
  model_loss, model_accuracy = tuned_model.evaluate(X_test,y_test,verbose=2)

623/623 - 1s - loss: 0.2055 - accuracy: 0.9196 - 992ms/epoch - 2ms/step
623/623 - 1s - loss: 0.2065 - accuracy: 0.9185 - 988ms/epoch - 2ms/step
623/623 - 1s - loss: 0.2079 - accuracy: 0.9167 - 928ms/epoch - 1ms/step


{'activation': 'relu', 'first_units': 26, 'num_layers': 5, 'units_0': 21, 'units_1': 16, 'units_2': 1, 'units_3': 16, 'units_4': 21, 'tuner/epochs': 20, 'tuner/initial_epoch': 7, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0044'}

In [None]:
# Get the best hyperparameters (the top-ranked entry returned by the tuner) as a plain dict
best_hyperparameters = top_hyper[0].values

# Display them
best_hyperparameters

{'activation': 'relu',
 'first_units': 26,
 'num_layers': 5,
 'units_0': 26,
 'units_1': 1,
 'units_2': 11,
 'units_3': 26,
 'units_4': 11,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0}

In [None]:
# Hardcoding output for best hyperparameters to access later
# (this replaces the dict read from the tuner in the previous cell)
# NOTE(review): these values (trial '0044') are not the ones shown for
# top_hyper[0] in the previous cell's displayed output - presumably they were
# copied from an earlier tuner run; confirm which configuration is intended.
best_hyperparameters = {'activation': 'relu',
 'first_units': 26,
 'num_layers': 5,
 'units_0': 21,
 'units_1': 16,
 'units_2': 1,
 'units_3': 16,
 'units_4': 21,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 7,
 'tuner/bracket': 2,
 'tuner/round': 2,
 'tuner/trial_id': '0044'}

In [None]:
# Defining new model for the best model
def build_best_model(input_shape, activation, num_layers, units):
    best_nn = tf.keras.models.Sequential()

    # Input layer
    best_nn.add(tf.keras.layers.Input(shape=(input_shape,)))

    # Hidden layers
    for _ in range(num_layers):
        best_nn.add(tf.keras.layers.Dense(units, activation=activation))

    # Output layer (assuming binary classification with sigmoid activation)
    best_nn.add(tf.keras.layers.Dense(1, activation="sigmoid"))

    # Compile the model
    best_nn.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"]
    )

    return best_nn

In [None]:
# Defining best hyperparameters
input_shape = X_train_resampled.shape[1]
activation = best_hyperparameters['activation']
num_layers = best_hyperparameters['num_layers']
# Calculate the total number of units across all layers
total_units = sum([best_hyperparameters[f'units_{i}'] for i in range(num_layers)])
# Implement early stopping to reduce overfitting
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

In [None]:
# Build the best model for retraining
best_model = build_best_model(input_shape, activation, num_layers, total_units)

In [None]:
# Retrain the best model on the oversampled training dataset (original rows plus resampled rows).
# Early stopping watches val_loss on validation_data, which here is the test split.
best_model.fit(X_train_resampled, y_train_resampled, epochs=20, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20


<keras.src.callbacks.History at 0x7b6ed0f1a410>

The training accuracy appears to be increasing and the training loss is decreasing, suggesting that the model is learning well on the training data. However, the validation accuracy and loss are unstable and are not improving as the model learns. This, along with the fact that early stopping was triggered, suggests that overfitting may still be occurring.

In [None]:
# Evaluate the retrained network using the test data and report its loss and accuracy
model_loss, model_accuracy = best_model.evaluate(X_test, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

623/623 - 1s - loss: 0.2269 - accuracy: 0.9011 - 806ms/epoch - 1ms/step
Loss: 0.22688905894756317, Accuracy: 0.9010646939277649


In [None]:
# Print model summary (layers, output shapes and parameter counts) to verify the architecture
best_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 75)                3525      
                                                                 
 dense_6 (Dense)             (None, 75)                5700      
                                                                 
 dense_7 (Dense)             (None, 75)                5700      
                                                                 
 dense_8 (Dense)             (None, 75)                5700      
                                                                 
 dense_9 (Dense)             (None, 75)                5700      
                                                                 
 dense_10 (Dense)            (None, 1)                 76        
                                                                 
Total params: 26401 (103.13 KB)
Trainable params: 2640

The complexity of this model (5 layers) may be contributing to the overfitting. In the next optimisation, the 3rd best model found by kerastuner will be utilised as it contains only 2 layers.

In [None]:
# Predict the probability of heart disease for every test row
probabilities = best_model.predict(X_test)

# Convert probabilities to class labels: the sigmoid output is thresholded at
# 0.5, the usual cut-off for a binary classification problem
y_pred = (probabilities > 0.5).astype(np.int32)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.92      0.93     13913
           1       0.83      0.85      0.84      5999

    accuracy                           0.90     19912
   macro avg       0.88      0.89      0.88     19912
weighted avg       0.90      0.90      0.90     19912



### Optimisation 2

In [None]:
# Defining hyperparameters - the 3rd best performing according to kerastuner with only 2 layers to reduce complexity
# NOTE(review): the third configuration printed by the tuner cell earlier in the
# notebook has num_layers=3 and different unit counts - presumably these values
# were copied from a different tuner run; confirm which configuration is intended.
hyperparameters = {
    'activation': 'relu',
    'first_units': 26,
    'num_layers': 2,
    'units_0': 16,
    'units_1': 6,
    'units_2': 11,
    'units_3': 6,
    'units_4': 26,
    'tuner/epochs': 20,
    'tuner/initial_epoch': 0,
    'tuner/bracket': 0,
    'tuner/round': 0}

In [None]:
# Defining new model for the best model
def build_best_model(input_shape, activation, num_layers, units):
    best_nn = tf.keras.models.Sequential()

    # Input layer
    best_nn.add(tf.keras.layers.Input(shape=(input_shape,)))

    # Hidden layers
    for _ in range(num_layers):
        best_nn.add(tf.keras.layers.Dense(units, activation=activation))

    # Output layer (assuming binary classification with sigmoid activation)
    best_nn.add(tf.keras.layers.Dense(1, activation="sigmoid"))

    # Compile the model
    best_nn.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"]
    )

    return best_nn

In [None]:
# Defining best hyperparameters
input_shape = X_train_resampled.shape[1]
activation = hyperparameters['activation']
num_layers = hyperparameters['num_layers']
# Calculate the total number of units across all layers
total_units = sum([hyperparameters[f'units_{i}'] for i in range(num_layers)])
# Implement early stopping to prevent overfitting
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)


In [None]:
# Build the best model for retraining
best_model_2 = build_best_model(input_shape, activation, num_layers, total_units)



In [None]:
# Retrain the second model on the oversampled training dataset (original rows plus resampled rows).
# Early stopping watches val_loss on validation_data, which here is the test split.
best_model_2.fit(X_train_resampled, y_train_resampled, epochs=20, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7b6ea5d8f6a0>

In [None]:
# Evaluate the second retrained network using the test data and report its loss and accuracy
model_loss, model_accuracy = best_model_2.evaluate(X_test, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

623/623 - 1s - loss: 0.2313 - accuracy: 0.9000 - 723ms/epoch - 1ms/step
Loss: 0.23126213252544403, Accuracy: 0.8999598026275635


In [None]:
# Print model summary (layers, output shapes and parameter counts) to verify the architecture
best_model_2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_11 (Dense)            (None, 22)                1034      
                                                                 
 dense_12 (Dense)            (None, 22)                506       
                                                                 
 dense_13 (Dense)            (None, 1)                 23        
                                                                 
Total params: 1563 (6.11 KB)
Trainable params: 1563 (6.11 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Save the second optimised network to an HDF5 (.h5) file, then download that
# file from the Colab runtime to the local machine
best_model_2.save('cvd_nn_model2.h5')
files.download("cvd_nn_model2.h5")

  saving_api.save_model(


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>