# Cardiovascular Disease (CVD) Risk Prediction Notebook

# Installing dependencies

In [29]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
from IPython.display import Markdown, display
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from keras import layers, callbacks
from keras.layers import Dense, Dropout

# Loading the dataset

In [4]:
df = pd.read_csv('./CVD_cleaned.csv')
df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


# Exploring the dataset

Here we can see each column (feature) in the dataframe.



In [5]:
df.isnull().sum()

General_Health                  0
Checkup                         0
Exercise                        0
Heart_Disease                   0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Smoking_History                 0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
dtype: int64

Here we can see that there is no null content within any of the columns in the dataframe.

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   General_Health                308854 non-null  object 
 1   Checkup                       308854 non-null  object 
 2   Exercise                      308854 non-null  object 
 3   Heart_Disease                 308854 non-null  object 
 4   Skin_Cancer                   308854 non-null  object 
 5   Other_Cancer                  308854 non-null  object 
 6   Depression                    308854 non-null  object 
 7   Diabetes                      308854 non-null  object 
 8   Arthritis                     308854 non-null  object 
 9   Sex                           308854 non-null  object 
 10  Age_Category                  308854 non-null  object 
 11  Height_(cm)                   308854 non-null  float64
 12  Weight_(kg)                   308854 non-nul

Here we can see the complete range of responses provided for each column within the dataframe.

In [7]:
for i in range(len(df.columns)):
  print(df.columns[i])
  print(df[df.columns[i]].unique())
  display(Markdown('---'))


General_Health
['Poor' 'Very Good' 'Good' 'Fair' 'Excellent']


---

Checkup
['Within the past 2 years' 'Within the past year' '5 or more years ago'
 'Within the past 5 years' 'Never']


---

Exercise
['No' 'Yes']


---

Heart_Disease
['No' 'Yes']


---

Skin_Cancer
['No' 'Yes']


---

Other_Cancer
['No' 'Yes']


---

Depression
['No' 'Yes']


---

Diabetes
['No' 'Yes' 'No, pre-diabetes or borderline diabetes'
 'Yes, but female told only during pregnancy']


---

Arthritis
['Yes' 'No']


---

Sex
['Female' 'Male']


---

Age_Category
['70-74' '60-64' '75-79' '80+' '65-69' '50-54' '45-49' '18-24' '30-34'
 '55-59' '35-39' '40-44' '25-29']


---

Height_(cm)
[150. 165. 163. 180. 191. 183. 175. 160. 168. 178. 152. 157. 188. 185.
 170. 173. 155. 193. 196. 206. 198. 140. 135. 145. 147. 142. 201. 218.
 124. 203. 137. 122. 216. 224. 229. 151. 177. 164. 162. 156. 153. 169.
 167. 172. 106. 190. 143. 171. 154. 176. 200. 146. 148. 158. 159. 187.
 104. 120. 107. 211. 226. 182. 213.  97. 184. 125. 127. 234. 130. 119.
 132. 105. 166. 181. 186.  91. 174. 208. 149.  96. 197. 161.  94. 103.
 221. 134. 144. 189. 100. 179. 117.  99. 102. 110. 241. 115. 205. 195.
 108.]


---

Weight_(kg)
[ 32.66  77.11  88.45  93.44 154.22  69.85 108.86  72.57  91.63  74.84
  73.48  83.91 113.4   52.16 116.12  99.79  81.65 104.33  79.38  55.79
 124.74  81.19  70.31 112.49 147.42  84.82 102.06  64.41  60.78  61.23
  88.    90.72  49.9   85.28 120.2   69.4   62.14  65.77  89.81  66.68
  86.18  72.12  87.54  62.6   75.75  88.9   92.08  56.7   68.04  79.83
  63.5   58.97 114.76  45.36  73.94  54.43 125.19  77.56  96.16  95.25
 115.67  82.55 136.08  78.93  70.76  95.71  53.52  87.09  55.34  83.01
 123.38  98.88  73.03  76.66  97.52  71.67  83.46 122.47  58.06  74.39
  67.13  82.1   47.63  99.34  85.73 108.41  91.17  57.61  63.05  45.81
  94.35  44.45 117.93 107.5  127.01 106.59 107.95  89.36  92.99  53.07
  78.02 131.09  97.98  84.37 111.13  50.8   57.15  64.86  80.29  76.2
 114.31  65.32  97.07  67.59  75.3  105.69 110.68  86.64  51.26  61.69
 107.05  42.64  40.82 101.6   90.26 131.54  98.43  78.47  59.87  68.95
  60.33  94.8   48.53  96.62 117.48 102.51  46.27 109.77  58.51  6

---

BMI
[14.54 28.29 33.47 ... 63.83 19.09 56.32]


---

Smoking_History
['Yes' 'No']


---

Alcohol_Consumption
[ 0.  4.  3.  8. 30.  2. 12.  1.  5. 10. 20. 17. 16.  6. 25. 28. 15.  7.
  9. 24. 11. 29. 27. 14. 21. 23. 18. 26. 22. 13. 19.]


---

Fruit_Consumption
[ 30.  12.   8.  16.   2.   1.  60.   0.   7.   5.   3.   6.  90.  28.
  20.   4.  80.  24.  15.  10.  25.  14. 120.  32.  40.  17.  45. 100.
   9.  99.  96.  35.  50.  56.  48.  27.  72.  36.  84.  26.  23.  18.
  21.  42.  22.  11. 112.  29.  64.  70.  33.  76.  44.  39.  75.  31.
  92. 104.  88.  65.  55.  13.  38.  63.  97. 108.  19.  52.  98.  37.
  68.  34.  41. 116.  54.  62.  85.]


---

Green_Vegetables_Consumption
[ 16.   0.   3.  30.   4.  12.   8.  20.   1.  10.   5.   2.   6.  60.
  28.  25.  14.  40.   7.  22.  24.  15. 120.  90.  19.  13.  11.  80.
  27.  17.  56.  18.   9.  21.  99.  29.  31.  45.  23. 100. 104.  32.
  48.  75.  36.  35. 112.  26.  50.  33.  96.  52.  76.  84.  34.  97.
  88.  98.  68.  92.  55.  95.  64. 124.  61.  65.  77.  85.  44.  39.
  70.  93. 128.  37.  53.]


---

FriedPotato_Consumption
[ 12.   4.  16.   8.   0.   1.   2.  30.  20.  15.  10.   3.   7.  28.
   5.   9.   6. 120.  32.  14.  60.  33.  48.  25.  24.  21.  90.  13.
  99.  17.  18.  40.  56.  34.  36.  44. 100.  11.  64.  45.  80.  29.
  68.  26.  50.  22.  95.  23.  27. 112.  35.  31.  98.  96.  88.  92.
  19.  76.  49.  97. 128.  41.  37.  42.  52.  72.  46. 124.  84.]


---

Here we separate the columns into numerical and categorical.

In [8]:
numerical = df.select_dtypes(include=['float64']).columns.sort_values()
categorical = df.select_dtypes(include=['object']).columns.sort_values()

# Exploratory Data Analysis

We perform data analysis to get an ideal dataset to train in order to get the best accuracy.

In [9]:
print(f'There are {len(categorical)} Categorical variables')
print(f'There are {len(numerical)} Numerical variables')

There are 12 Categorical variables
There are 7 Numerical variables


Here we can see all the categorical columns listed.

In [10]:
categorical

Index(['Age_Category', 'Arthritis', 'Checkup', 'Depression', 'Diabetes',
       'Exercise', 'General_Health', 'Heart_Disease', 'Other_Cancer', 'Sex',
       'Skin_Cancer', 'Smoking_History'],
      dtype='object')

Here we can see all the numerical columns listed.

In [11]:
numerical

Index(['Alcohol_Consumption', 'BMI', 'FriedPotato_Consumption',
       'Fruit_Consumption', 'Green_Vegetables_Consumption', 'Height_(cm)',
       'Weight_(kg)'],
      dtype='object')

Here we perform label encoding on the categorical data. It's done to convert categorical variables into numerical format, as machine learning algorithms and statistical models require numerical input.

In [12]:
label_encoder = preprocessing.LabelEncoder()

for i in (df[categorical]):
  df[i] = label_encoder.fit_transform(df[i])

df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,3,2,0,0,0,0,0,0,1,0,10,150.0,32.66,14.54,1,0.0,30.0,16.0,12.0
1,4,4,0,1,0,0,0,2,0,0,10,165.0,77.11,28.29,0,0.0,30.0,0.0,4.0
2,4,4,1,0,0,0,0,2,0,0,8,163.0,88.45,33.47,0,4.0,12.0,3.0,16.0
3,3,4,1,1,0,0,0,2,0,1,11,180.0,93.44,28.73,0,0.0,30.0,30.0,8.0
4,2,4,0,0,0,0,0,0,0,1,12,191.0,88.45,24.37,1,0.0,8.0,4.0,0.0


Here we can see now that every column within the dataframe contains numerical data.

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   General_Health                308854 non-null  int64  
 1   Checkup                       308854 non-null  int64  
 2   Exercise                      308854 non-null  int64  
 3   Heart_Disease                 308854 non-null  int64  
 4   Skin_Cancer                   308854 non-null  int64  
 5   Other_Cancer                  308854 non-null  int64  
 6   Depression                    308854 non-null  int64  
 7   Diabetes                      308854 non-null  int64  
 8   Arthritis                     308854 non-null  int64  
 9   Sex                           308854 non-null  int64  
 10  Age_Category                  308854 non-null  int64  
 11  Height_(cm)                   308854 non-null  float64
 12  Weight_(kg)                   308854 non-nul

# Finding the highly correlated columns

Here we set the target column as 'Heart_Disease' and view the correlation between each column and the target column.

In [14]:
correlation_matrix = df.corr()['Heart_Disease']
correlation_matrix

General_Health                 -0.020645
Checkup                         0.083480
Exercise                       -0.096347
Heart_Disease                   1.000000
Skin_Cancer                     0.090848
Other_Cancer                    0.092387
Depression                      0.032526
Diabetes                        0.166241
Arthritis                       0.153913
Sex                             0.072595
Age_Category                    0.229011
Height_(cm)                     0.015780
Weight_(kg)                     0.045875
BMI                             0.042666
Smoking_History                 0.107797
Alcohol_Consumption            -0.036569
Fruit_Consumption              -0.020055
Green_Vegetables_Consumption   -0.024043
FriedPotato_Consumption        -0.009227
Name: Heart_Disease, dtype: float64

# Splitting the training and testing data

Here we split the dataframe into training and testing data. We drop the target variable (Heart_Disease) from the feature variables (X) and make it the target variable (y).

In [15]:
X = df.drop('Heart_Disease', axis=1)
y = df['Heart_Disease']

Here the dataset is split into training and testing sets (75% train, 25% test).


In [16]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

Here we view the lengths of each of the variables from the split by printing the lengths of the training and testing sets.

In [17]:
print("Length of training set (x_train):", len(x_train))
print("Length of testing set (x_test):", len(x_test))
print("Length of training labels (y_train):", len(y_train))
print("Length of testing labels (y_test):", len(y_test))

Length of training set (x_train): 231640
Length of testing set (x_test): 77214
Length of training labels (y_train): 231640
Length of testing labels (y_test): 77214


# Creating the Multilayer Perceptron (MLP)
Here we create a multilayer perceptron to train our model.

## Defining the early stopping callback



*   min_delta=0.001:

  * **Justification**: Early stopping will only occur if the improvement in the monitored metric (in this case, validation loss) is greater than or equal to 0.001.
  * **Reason**: Setting a small min_delta ensures that the model continues training until there is a relatively small but meaningful improvement in performance, preventing premature stopping due to minor fluctuations.
* patience=5:

  * **Justification**: Early stopping will wait for 5 epochs without improvement in the monitored metric before stopping.
  * **Reason**: Allowing some patience ensures that the model doesn't stop too early due to temporary fluctuations in the validation loss. It helps capture more significant trends in the model's performance.
* restore_best_weights=True:

  * **Justification**: If early stopping occurs, it will restore the model's weights to the configuration that had the lowest monitored metric (validation loss) during training.
  * **Reason**: This is crucial because it prevents the model from potentially settling into a suboptimal state after training for many epochs. Restoring the best weights ensures the model retains the best performance achieved during training.



In [18]:
early_stopping = EarlyStopping(
    min_delta=0.001,  # Minimum amount of change to count as an improvement
    patience=5,       # How many epochs to wait before stopping
    restore_best_weights=True  # Restore the best weights when stopping
)

## Initialising the MLP

In [19]:
model = Sequential()

## Defining the layers of the MLP

In [20]:
model.add(Dense(units=128, kernel_initializer='uniform', activation='relu', input_dim=18))
model.add(Dense(units=64, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=32, kernel_initializer='uniform', activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(units=16, kernel_initializer='uniform', activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=4, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

## Compiling the MLP

In [21]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

## Training the MLP

In [22]:
history = model.fit(x_train, y_train, batch_size=25, epochs=80, callbacks=[early_stopping], validation_split=0.25)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80


# Validating the Model

In [27]:
# Predict on training and test sets
train_preds = model.predict(x_train)
train_pred = (train_preds > 0.5).astype(int)
test_preds = model.predict(x_test)
test_pred = (test_preds > 0.5).astype(int)

# Calculate the accuracy scores
train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)

print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Training Accuracy: 0.9193274045933345
Test Accuracy: 0.9186157950630714


## Testing an individual's CVD risk using the model

In [31]:
def predict_cvd(config, model):

    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config

    y_pred = model.predict(df)

    return y_pred

In [57]:
config = {
    'General_Health': [4],  # Encoding for "Excellent" or very healthy
    'Checkup': [0],  # Encoding for "Within the past 2 years" or regular checkups
    'Exercise': [1],  # Encoding for "Yes" or regular exercise
    'Skin_Cancer': [0],  # Encoding for "No" (no skin cancer)
    'Other_Cancer': [0],  # Encoding for "No" (no other cancer)
    'Depression': [0],  # Encoding for "No" (no depression)
    'Diabetes': [0],  # Encoding for "No" (no diabetes)
    'Arthritis': [0],  # Encoding for "No" (no arthritis)
    'Sex': [1],  # Encoding for "Male"
    'Age_Category': [7],  # Encoding for "18-24" or a relatively young age
    'Height_(cm)': [175.0],  # Average height in centimeters
    'Weight_(kg)': [70.0],  # Healthy weight in kilograms
    'BMI': [22.86],  # Normal BMI value
    'Smoking_History': [0],  # Encoding for "No" (non-smoker)
    'Alcohol_Consumption': [2.5],  # Moderate alcohol consumption in grams per day
    'Fruit_Consumption': [3.0],  # Moderate fruit consumption in servings per day
    'Green_Vegetables_Consumption': [2.0],  # Moderate green vegetables consumption in servings per day
    'FriedPotato_Consumption': [0.5]  # Low fried potato consumption in servings per day
}

predict_cvd(config, model)



array([[0.01588038]], dtype=float32)

## Creating model file

Here, the model file is created in order to be used by the API later on.

In [55]:
# Save to file in the current working directory
pkl_filename = "model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)

##loading the model from the saved file
pkl_filename = "model.pkl"
with open(pkl_filename, 'rb') as f_in:
    model = pickle.load(f_in)