In this script, the Random Forest prediction model from cvd_prediction_models has been revisited for further optimisations.

In [None]:
# Import dependencies
import pandas as pd
import tensorflow as tf
import numpy as np
import plotly.graph_objs as go

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from google.colab import files

!pip install -q -U keras-tuner
import keras_tuner as kt

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/127.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/127.9 kB[0m [31m785.7 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m122.9/127.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/950.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m942.1/950.8 kB[0m [31m33.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hUsing TensorFlow backend


In [None]:
# Load the cardiovascular disease dataset from its published Google Sheets
# CSV export, then preview the first rows.
cvd_csv_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSDchXr1EhgCSsxlxJ3lWPhh1kT5EJS3yv4DJ2YLeMIC3y4uq-Pp4EQknrs9zAiaI3ulne2Jyi6gR6G/pub?gid=602879552&single=true&output=csv"
cvd_df = pd.read_csv(cvd_csv_url)
cvd_df.head()

Unnamed: 0,general_health,checkup,exercise,heart_disease,skin_cancer,other_cancer,depression,diabetes,arthritis,sex,age_category,height_cm,weight_kg,bmi,smoking_history,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150,32.66,14.54,Yes,0,30,16,12
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165,77.11,28.29,No,0,30,0,4
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163,88.45,33.47,No,4,12,3,16
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180,93.44,28.73,No,0,30,30,8
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191,88.45,24.37,Yes,0,8,4,0


# Data Preprocessing

In [None]:
# Partition the columns by dtype: object columns are treated as categorical
# (to be one-hot encoded) and int64/float64 columns as numeric (to be scaled).
column_dtypes = cvd_df.dtypes
categorical_cols = column_dtypes[column_dtypes == 'object'].index.tolist()
numeric_cols = cvd_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Per-column mean and standard deviation of the numeric features
numeric_values = cvd_df[numeric_cols]
means = numeric_values.mean()
stds = numeric_values.std()

# Half-width of the accepted band around each column mean, expressed in
# standard deviations
threshold_std = 1

# A row is flagged as an outlier when at least one of its numeric values lies
# more than threshold_std standard deviations away from that column's mean.
abs_deviation = (numeric_values - means).abs()
outliers = (abs_deviation > threshold_std * stds).any(axis=1)

# Row mask: every 'Yes' (heart disease) row is kept regardless of outlier
# status, while a 'No' row is kept only when it is not an outlier.
# NOTE(review): outliers are removed from the 'No' class only, so after this
# step any row with an out-of-band numeric value is guaranteed to be 'Yes'.
# The classifier may exploit that as a proxy for the label and the reported
# test scores may be optimistic -- confirm this is the intended design.
has_heart_disease = cvd_df['heart_disease'] == 'Yes'
filtered_rows = has_heart_disease | (~outliers)

# Apply the mask to obtain the filtered DataFrame
cvd_df_filtered = cvd_df[filtered_rows]

# Inspect the class balance of the target after filtering
cvd_df_filtered['heart_disease'].value_counts()

No     55566
Yes    24081
Name: heart_disease, dtype: int64

In [None]:
# One-hot encode every categorical column. drop_first=False keeps all levels,
# so the target itself becomes the pair heart_disease_No / heart_disease_Yes.
encoded_df = pd.get_dummies(
    cvd_df_filtered,
    columns=categorical_cols,
    drop_first=False,
)

# Standardise the numeric columns with StandardScaler and write the scaled
# values back into the encoded frame.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed in a later cell, so test-set statistics influence the scaling.
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(encoded_df[numeric_cols])
encoded_df[numeric_cols] = scaled_numeric
encoded_df.head()

Unnamed: 0,height_cm,weight_kg,bmi,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption,general_health_Excellent,general_health_Fair,general_health_Good,...,age_category_45-49,age_category_50-54,age_category_55-59,age_category_60-64,age_category_65-69,age_category_70-74,age_category_75-79,age_category_80+,smoking_history_No,smoking_history_Yes
1,-0.83226,-0.393566,-0.00059,-0.556508,0.413674,-1.25037,-0.166917,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,1.184427,0.70489,0.095604,-0.556508,0.413674,1.515171,0.547281,0,0,0,...,0,0,0,0,0,0,1,0,1,0
6,0.512198,-0.881918,-1.213951,-0.556508,-0.472114,-0.512893,-0.881115,0,1,0,...,0,0,0,1,0,0,0,0,0,1
8,-1.101152,-0.698954,-0.182048,-0.556508,-0.725196,-0.144154,-0.166917,0,1,0,...,0,0,0,0,1,0,0,0,0,1
9,-1.101152,0.583139,1.394228,-0.556508,-0.725196,-0.144154,-0.702566,0,1,0,...,0,0,0,0,0,1,0,0,1,0


In [None]:
# Separate the features from the target. Both one-hot target columns are
# removed from X so that neither encoding of the label remains among the
# features; y is the positive-class indicator.
target_columns = ['heart_disease_Yes', 'heart_disease_No']
X = encoded_df.drop(columns=target_columns)
y = encoded_df['heart_disease_Yes']

# Hold out a test set (train_test_split's default proportions), seeded so the
# split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Randomly oversample the training split only, so that both classes are
# equally represented during fitting; the test split is left untouched.
ROS = RandomOverSampler(random_state=78)
X_train_resampled, y_train_resampled = ROS.fit_resample(X_train, y_train)

# Confirm that the two classes now have the same number of training samples
y_train_resampled.value_counts()

1    41653
0    41653
Name: heart_disease_Yes, dtype: int64

**RANDOM FOREST OPTIMISATION**

The hyperparameter search space is defined in the param_dist dictionary. It covers the following hyperparameters:

n_estimators: The number of trees in the forest (100, 200, or 300).

max_depth: The maximum depth of each tree (None, 10, 20, or 30).

min_samples_split: The minimum number of samples required to split a node (2, 5, or 10).

min_samples_leaf: The minimum number of samples required to be a leaf node (1, 2, or 4).

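max_features: The number of features to consider for the best split ('auto', 'sqrt', or 'log2'). Note that for a Random Forest classifier 'auto' is equivalent to 'sqrt'; it is deprecated in scikit-learn 1.1 and removed in 1.3.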

In [None]:
# Create the base Random Forest classifier that is passed to the
# hyperparameter search as its estimator. random_state is fixed so that
# repeated runs are reproducible.
rf_classifier = RandomForestClassifier(random_state=42)


In [None]:
# Define the hyperparameter search space sampled by RandomizedSearchCV.
# Every entry is a list of discrete candidate values.
#
# 'auto' is intentionally not offered for max_features: for
# RandomForestClassifier it was only an alias of 'sqrt', it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it makes the fit fail
# with an invalid-parameter error. Dropping it loses no distinct candidate.
param_dist = {
    'n_estimators': [100, 200, 300],          # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],         # Maximum depth of each tree (None = unlimited)
    'min_samples_split': [2, 5, 10],        # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],          # Minimum samples required to be a leaf node
    'max_features': ['sqrt', 'log2'],       # Number of features to consider for the best split
}

A RandomizedSearchCV object named random_search is created to search for the best hyperparameters using random sampling. This process involves:

Using cross-validation (cv=5) to evaluate the model's performance.
Specifying the number of iterations (n_iter=10) to randomly sample hyperparameters.
Setting n_jobs=-1 to utilize all available CPU cores for parallel processing.
Defining scoring='accuracy' as the evaluation metric.
Setting a random state (random_state=42) for reproducibility.

In [None]:
# Configure the randomised hyperparameter search: 10 candidates are sampled
# from param_dist and each is scored by accuracy under 5-fold cross-validation,
# using all available CPU cores (n_jobs=-1).
# NOTE(review): the search is later fitted on the oversampled training set, so
# duplicated minority rows can land in both the training and validation folds
# and inflate the cross-validation scores -- consider resampling inside each
# fold (e.g. an imblearn Pipeline) and confirm.
random_search = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='accuracy',
    random_state=42,
)


In [None]:
# Run the hyperparameter search on the oversampled training data; each sampled
# candidate is cross-validated as configured on random_search.
random_search.fit(X_train_resampled, y_train_resampled)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  warn(


The final Random Forest model (final_rf_classifier) is then created using the best hyperparameters obtained from the search.



In [None]:
# Retrieve the hyperparameter combination that the search selected as best
# (a dict of parameter name -> value).
best_params = random_search.best_params_

In [None]:
# Build the final Random Forest model from the best hyperparameters, with the
# same fixed random_state as the base classifier.
# NOTE(review): RandomizedSearchCV refits the best candidate by default
# (refit=True), so random_search.best_estimator_ is presumably an equivalent,
# already-fitted model -- confirm before relying on it instead.
final_rf_classifier = RandomForestClassifier(random_state=42, **best_params)


In [None]:
# Fit the final model on the full oversampled training set
final_rf_classifier.fit(X_train_resampled, y_train_resampled)

  warn(


In [None]:
# Predict class labels for the held-out test features (which were not
# oversampled) using the final model
y_pred = final_rf_classifier.predict(X_test)

In [None]:
# Evaluate the final model on the held-out test split: overall accuracy plus
# the per-class precision / recall / F1-score text report.
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

The accuracy of the optimized Random Forest Classifier is approximately 92.64%. This indicates that the model correctly predicted the class labels for about 92.64% of the samples in the test dataset.

**Classification Report**:

The classification report provides additional performance metrics beyond accuracy:

Precision measures the proportion of true positive predictions out of all positive predictions. For class 0, precision is 0.93, and for class 1, it's 0.92. This indicates that the model has a high precision for both classes, meaning it makes relatively few false positive errors.

Recall (or sensitivity) measures the proportion of true positive predictions out of all actual positives. For class 0, recall is 0.97, and for class 1, it's 0.83. This suggests that the model is better at identifying true negatives (class 0) than true positives (class 1).

F1-score is the harmonic mean of precision and recall. It provides a balance between precision and recall. For class 0, the F1-score is 0.95, and for class 1, it's 0.87. A high F1-score indicates a good balance between precision and recall.

Support indicates the number of samples in each class.

Macro Avg and Weighted Avg:

The macro average (macro avg) computes the average of metrics across both classes without considering class imbalance. In this case, the macro avg F1-score is approximately 0.91.
The weighted average (weighted avg) computes the average of metrics, weighted by the number of samples in each class. This accounts for class imbalance; here the weighted avg F1-score is approximately 0.93, slightly higher than the macro average because the majority class (class 0) scores better.

In [None]:
# Report the selected hyperparameters together with the test-set metrics,
# one labelled entry per print call.
report_entries = (
    ("Best Hyperparameters:", best_params),
    ("Accuracy:", accuracy),
    ("Classification Report:\n", class_report),
)
for label, value in report_entries:
    print(label, value)

Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None}
Accuracy: 0.9264262756126959
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95     13913
           1       0.92      0.83      0.87      5999

    accuracy                           0.93     19912
   macro avg       0.92      0.90      0.91     19912
weighted avg       0.93      0.93      0.93     19912

