In [1]:
# Import dependencies
import pandas as pd
import plotly.graph_objects as go
import tensorflow as tf

from imblearn.over_sampling import RandomOverSampler
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [3]:
# Load the cardiovascular dataset from the CSV export published via Google Sheets
cvd_csv_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSDchXr1EhgCSsxlxJ3lWPhh1kT5EJS3yv4DJ2YLeMIC3y4uq-Pp4EQknrs9zAiaI3ulne2Jyi6gR6G/pub?gid=602879552&single=true&output=csv"
cvd_df = pd.read_csv(cvd_csv_url)

# Preview the first rows
cvd_df.head()

Unnamed: 0,general_health,checkup,exercise,heart_disease,skin_cancer,other_cancer,depression,diabetes,arthritis,sex,age_category,height_cm,weight_kg,bmi,smoking_history,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150,32.66,14.54,Yes,0,30,16,12
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165,77.11,28.29,No,0,30,0,4
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163,88.45,33.47,No,4,12,3,16
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180,93.44,28.73,No,0,30,30,8
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191,88.45,24.37,Yes,0,8,4,0


# Data Preprocessing

In [4]:
# Check datatypes to see which columns hold text (object) and which hold numbers (int64 / float64)
cvd_df.dtypes

general_health                   object
checkup                          object
exercise                         object
heart_disease                    object
skin_cancer                      object
other_cancer                     object
depression                       object
diabetes                         object
arthritis                        object
sex                              object
age_category                     object
height_cm                         int64
weight_kg                       float64
bmi                             float64
smoking_history                  object
alcohol_consumption               int64
fruit_consumption                 int64
green_vegetables_consumption      int64
friedpotato_consumption           int64
dtype: object

In [5]:
# Define categorical columns for encoding and numeric columns for scaling.
# Both lists are derived the same way (select_dtypes on the numeric dtype kind), so together
# they cover every column exactly once and do not depend on exact dtype names such as
# 'int64' or 'object'.
# Note: the target column 'heart_disease' is non-numeric, so it is part of categorical_cols.
categorical_cols = cvd_df.select_dtypes(exclude='number').columns.tolist()

numeric_cols = cvd_df.select_dtypes(include='number').columns.tolist()

In [6]:
# Check value counts of the target variable to see how balanced the two classes are
cvd_df['heart_disease'].value_counts()

No     275231
Yes     24081
Name: heart_disease, dtype: int64

The ratio of positive instances of heart disease ('Yes') to negative instances ('No') is heavily imbalanced. Initial testing revealed that this would result in a model that can accurately predict that a patient does not have heart disease, but cannot accurately predict that a patient does have heart disease. For this reason, we have decided to cut down the majority class, in this instance the 'No' category, to make the classes more balanced.

We have chosen to remove all 'No' datapoints that lie more than 1 standard deviation from the mean in any numeric column; every 'Yes' datapoint is kept.

In [7]:
# Number of standard deviations from the mean beyond which a value counts as an outlier
threshold_std = 1

# Per-column mean and standard deviation of the numeric features
numeric_data = cvd_df[numeric_cols]
means = numeric_data.mean()
stds = numeric_data.std()

# A row is an outlier if ANY numeric column deviates from its mean by more than the threshold
deviations = numeric_data.sub(means).abs()
outliers = deviations.gt(threshold_std * stds).any(axis=1)

# Keep every 'Yes' row plus the rows that are not outliers,
# i.e. drop only the rows that are outliers and are not labelled 'Yes'.
# NOTE(review): only the majority class is thinned by feature values, so every remaining
# outlier row is a 'Yes' row — confirm this selection bias is acceptable for evaluation.
filtered_rows = ~(outliers & (cvd_df['heart_disease'] != 'Yes'))

# Create the filtered DataFrame
cvd_df_filtered = cvd_df.loc[filtered_rows]

In [8]:
# Re-check value counts of the target variable after thinning the majority ('No') class
cvd_df_filtered['heart_disease'].value_counts()

No     55566
Yes    24081
Name: heart_disease, dtype: int64

The classes in the target variable are now significantly closer to being balanced than previously.

In [9]:
# One-hot encode the categorical columns using get_dummies.
# drop_first=False keeps every level, so the target yields both 'heart_disease_Yes'
# and 'heart_disease_No'.
# dtype is pinned to uint8 (the pre-2.0 pandas default) so the indicator columns are 0/1
# on every pandas version; pandas >= 2.0 would otherwise return True/False.
encoded_df = pd.get_dummies(
    cvd_df_filtered,
    columns=categorical_cols,
    drop_first=False,
    dtype='uint8',
)
encoded_df.head()

Unnamed: 0,height_cm,weight_kg,bmi,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption,general_health_Excellent,general_health_Fair,general_health_Good,...,age_category_45-49,age_category_50-54,age_category_55-59,age_category_60-64,age_category_65-69,age_category_70-74,age_category_75-79,age_category_80+,smoking_history_No,smoking_history_Yes
1,165,77.11,28.29,0,30,0,4,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,180,93.44,28.73,0,30,30,8,0,0,0,...,0,0,0,0,0,0,1,0,1,0
6,175,69.85,22.74,0,16,8,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
8,163,72.57,27.46,0,12,12,4,0,1,0,...,0,0,0,0,1,0,0,0,0,1
9,163,91.63,34.67,0,12,12,1,0,1,0,...,0,0,0,0,0,1,0,0,1,0


In [10]:
# Standardise the numeric columns with StandardScaler (fit, then transform the same columns).
# NOTE(review): the scaler is fitted on the full filtered dataset before the train/test split,
# so test-set statistics inform the scaling — consider fitting on the training split only.
scaler = StandardScaler()
scaler.fit(encoded_df[numeric_cols])
encoded_df[numeric_cols] = scaler.transform(encoded_df[numeric_cols])
encoded_df.head()

Unnamed: 0,height_cm,weight_kg,bmi,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption,general_health_Excellent,general_health_Fair,general_health_Good,...,age_category_45-49,age_category_50-54,age_category_55-59,age_category_60-64,age_category_65-69,age_category_70-74,age_category_75-79,age_category_80+,smoking_history_No,smoking_history_Yes
1,-0.83226,-0.393566,-0.00059,-0.556508,0.413674,-1.25037,-0.166917,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,1.184427,0.70489,0.095604,-0.556508,0.413674,1.515171,0.547281,0,0,0,...,0,0,0,0,0,0,1,0,1,0
6,0.512198,-0.881918,-1.213951,-0.556508,-0.472114,-0.512893,-0.881115,0,1,0,...,0,0,0,1,0,0,0,0,0,1
8,-1.101152,-0.698954,-0.182048,-0.556508,-0.725196,-0.144154,-0.166917,0,1,0,...,0,0,0,0,1,0,0,0,0,1
9,-1.101152,0.583139,1.394228,-0.556508,-0.725196,-0.144154,-0.702566,0,1,0,...,0,0,0,0,0,1,0,0,1,0


In [11]:
# Indicator columns produced by one-hot encoding the target 'heart_disease'
target_dummy_cols = ['heart_disease_Yes', 'heart_disease_No']

# Features (X): every column except the two target indicator columns
X = encoded_df.drop(columns=target_dummy_cols)

# Target (y): the 'Yes' indicator of 'heart_disease'
y = encoded_df['heart_disease_Yes']

In [12]:
# Split the preprocessed data into training and testing datasets
# (test_size=0.25 is scikit-learn's default, written out here for clarity)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=78,
)

In [13]:
# Check the class counts of the training target before any resampling
y_train.value_counts()

0    41653
1    18082
Name: heart_disease_Yes, dtype: int64

In [14]:
# Fit a random over sampler to the training data to balance out the classes fully.
# Only the training split is resampled; X_test / y_test are left untouched.
ROS = RandomOverSampler(random_state=78)
X_train_resampled, y_train_resampled = ROS.fit_resample(X_train, y_train)

In [15]:
# Re-check the value counts to ensure oversampling has worked (both classes should now be equal in size)
y_train_resampled.value_counts()

1    41653
0    41653
Name: heart_disease_Yes, dtype: int64

# Model 1: Logistic Regression

# Model 2: Support Vector Machine

Support Vector Machine (SVM) algorithms are renowned for being accurate, but they can be difficult to design. In comparison with logistic regression, SVMs are appropriate for large datasets and can be more accurate as they can employ a non-linear approach.

The model below uses an SVM for classification: the datapoints are placed in a multi-dimensional space in which the classes are separated by a hyperplane.

In [16]:
# Initialise a linear-kernel support vector classifier and fit it to the oversampled training data
model = SVC(kernel='linear')
model.fit(X_train_resampled, y_train_resampled)

In [17]:
# Validate with the test data: fraction of test labels the model predicts correctly
accuracy_score(y_test, model.predict(X_test))

0.7871132985134592

In the validation step, a held-out subset of the labelled data was used to test how well the model is able to predict labels. The above outcome shows that the SVM model correctly predicts 78.71% (2 d.p.) of all individuals in the test dataset (15,673 out of 19,912 individuals). This is a positive outcome. However, as human health is a highly sensitive topic of great importance, an accuracy of 99% or above would be ideal for this model to become respected.

The 4,239 individuals in the test dataset who would have been informed incorrectly would either not receive the medication and lifestyle changes they need or would be making these changes unnecessarily.

It should be noted that accuracy is very susceptible to imbalanced classes. If the model had been trained on the initial dataset without first taking a representative sample and then oversampling the minority class, it would have been very easy for the model to only care about people without cardiovascular disease, because they would have had the biggest impact on accuracy. However, we also care about the people with heart disease, which highlights the importance of the steps taken during preprocessing.

In [19]:
# Save the model's predictions for the test data
training_predictions = model.predict(X_test)

# Build the confusion matrix (rows = actual class, columns = predicted class)
training_matrix = confusion_matrix(y_test, training_predictions)

# Label the rows and columns so the matrix can be read as a table
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]
pd.DataFrame(data=training_matrix, index=row_labels, columns=column_labels)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,10845,3068
Actual 1,1171,4828


The findings from the above confusion matrix are very insightful, it has informed that our SVM model has:
*   Correctly predicted 10,845 individuals who did not have heart disease.
*   Correctly predicted 4,828 individuals who did have heart disease.
However, what is of more value to be aware of is:
*   1,171 individuals who actually had heart disease were predicted as not having one by this model.
*   3,068 individuals who did not actually have heart disease were predicted as having one by this model.


In [20]:
# Display names for the two target classes, in label order (0, then 1)
target_names = ["No Heart Disease", "Heart Disease"]

# Build the classification report for the test predictions, then print it
report = classification_report(
    y_true=y_test,
    y_pred=training_predictions,
    target_names=target_names,
)
print(report)

                  precision    recall  f1-score   support

No Heart Disease       0.90      0.78      0.84     13913
   Heart Disease       0.61      0.80      0.69      5999

        accuracy                           0.79     19912
       macro avg       0.76      0.79      0.77     19912
    weighted avg       0.81      0.79      0.79     19912



Precision represents the ratio of correctly predicted positive observations to the total predicted positive observations:
* The precision was 90% for No Heart Disease, suggesting that there is a low false positive rate
* The precision was 61% for Heart Disease, suggesting that the false positive rate was higher for those with a heart disease

Recall represents the ratio of correctly predicted positive observations to all actual observations of that class:
* The recall was 78% for No Heart Disease, suggesting that there is a low false negative rate for those without a heart disease
* The recall was 80% for Heart Disease, suggesting that the false negative rate was slightly lower for those with a heart disease

Combining precision and recall (the F1 score is their harmonic mean), the model has a weighted-average F1 score of 79%. This shows the model is fairly good at avoiding false negatives and false positives, but there is room for improvement before it would be fit to deal with such a sensitive topic, given the repercussions of the false positives and negatives that are still present.

# Model 3: Decision Tree

# Model 4: Random Forest

# Model 5: Neural Network

# Model Comparison

In [31]:
# Store the model accuracy and precision scores in a dictionary.
# TODO: these are placeholder values used to test the chart; replace them with each
# model's measured test-set scores.
scores = {
    'Model': ['Logistic Regression', 'SVM', 'Decision Tree', 'Random Forests', 'Neural Network'],
    'Accuracy': [0.8, 0.8, 0.8, 0.8, 0.8],
    'Precision': [0.8, 0.8, 0.8, 0.8, 0.8],
}

# Convert dictionary to a DataFrame
comparison_df = pd.DataFrame(scores)

# Define colour codes for the bars (one per metric, in trace order)
colors = ['#FF6961', '#DEA5A4']

# Create a grouped bar chart with one trace per metric: Accuracy first, then Precision
metric_names = ['Accuracy', 'Precision']
fig = go.Figure(data=[
    go.Bar(
        name=metric,
        x=comparison_df['Model'],
        y=comparison_df[metric],
        marker_color=color,
    )
    for metric, color in zip(metric_names, colors)
])

# Chart layout
fig.update_layout(
    barmode='group',
    title=dict(text='Machine Learning Model Performance Comparison', font=dict(color='black')),
    xaxis=dict(tickfont=dict(color='black')),
    yaxis=dict(tickfont=dict(color='black')),
    xaxis_title=dict(text='Model', font=dict(color='black')),
    yaxis_title=dict(text='Score', font=dict(color='black')),
    legend=dict(font=dict(color='black')),
    template='simple_white',
)


def _toggle_button(label, visible, title_text):
    """Build one dropdown button that shows/hides traces and retitles the chart.

    `visible` holds one boolean per trace, in trace order (Accuracy, Precision).
    The title is changed through 'title.text' so that only the text is updated and
    the title font set in the layout is kept; passing a bare string as 'title'
    would replace the whole title object.
    """
    return {
        'label': label,
        'method': 'update',
        'args': [{'visible': visible}, {'title.text': title_text}],
    }


# Add interactivity for toggling between Accuracy, Precision, and both
fig.update_layout(
    updatemenus=[
        {
            'buttons': [
                _toggle_button('Accuracy and Precision', [True, True], 'Both Accuracy and Precision'),
                _toggle_button('Accuracy', [True, False], 'Accuracy Comparison'),
                _toggle_button('Precision', [False, True], 'Precision Comparison'),
            ],
            'direction': 'down',
            'showactive': True,
        }
    ]
)

# Show the chart
fig.show()
