# Importing dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import recall_score, accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
import pickle
from IPython.display import Markdown, display
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
    auc,
    confusion_matrix,
)


# Creating functions

In [5]:
## printing the shape and head
def head(df,shape_only=False):
    print(df.shape)

    if shape_only:
        return
    else:
        return df.head()


# Reading dataset 

In [7]:
df = pd.read_csv('../data/raw/dataset.csv')

## Viewing the dataframe and shape
head(df,shape_only=False)

(308854, 19)


Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


In [8]:
## Setting the target variable
target = 'Heart_Disease'

In [9]:
## Creating numerical and categorical columns
numerical = df.select_dtypes(include=['float64']).columns.sort_values()
categorical = df.select_dtypes(include=['object']).columns.sort_values()

## Printing the length of numerical and categorical. The total length should have
## the same length as our dataframe
print(f'There are {len(categorical)} Categorical variables')
print(f'There are {len(numerical)} Numerical variables')

There are 12 Categorical variables
There are 7 Numerical variables


In [10]:
## Showing the columns in alphabetical order
df.columns.sort_values()

## Showing the descriptions of numerical variables
print('')
num_describe = df.describe().T
num_describe_table = num_describe.loc[:,['mean', 'std', '25%', '50%', '75%']]
print(num_describe_table)

## Showing the descriptions of categorical variables
print('')
object_describe_table = df.describe(include=object)
print(object_describe_table)
## print it to latex
# print(object_describe_table.to_latex())


                                    mean        std     25%     50%     75%
Height_(cm)                   170.615249  10.658026  163.00  170.00  178.00
Weight_(kg)                    83.588655  21.343210   68.04   81.65   95.25
BMI                            28.626211   6.522323   24.21   27.44   31.85
Alcohol_Consumption             5.096366   8.199763    0.00    1.00    6.00
Fruit_Consumption              29.835200  24.875735   12.00   30.00   30.00
Green_Vegetables_Consumption   15.110441  14.926238    4.00   12.00   20.00
FriedPotato_Consumption         6.296616   8.582954    2.00    4.00    8.00

       General_Health               Checkup Exercise Heart_Disease  \
count          308854                308854   308854        308854   
unique              5                     5        2             2   
top         Very Good  Within the past year      Yes            No   
freq           110395                239371   239381        283883   

       Skin_Cancer Other_Cancer Depress

# Feature Engineering (Preprocessing)

Changing the values of Heart Disease to 0 and 1 for preprocessing steps

In [11]:
df['Heart_Disease'] = df['Heart_Disease'].map({'No':0,'Yes':1})
print('')
print(df['Heart_Disease'].value_counts())


Heart_Disease
0    283883
1     24971
Name: count, dtype: int64


Splitting the train and test set. Using stratify to keep the ratio between two classes be the same

In [12]:
from sklearn.model_selection import train_test_split

train,test = train_test_split(df, test_size=0.2,random_state=22,stratify=df['Heart_Disease'])

print(train.shape)
print(test.shape)

(247083, 19)
(61771, 19)


Showing the ratio of the target variable from train and test set

In [13]:
yes = train['Heart_Disease'].value_counts()[0]/len(train['Heart_Disease'])*100
no = train['Heart_Disease'].value_counts()[1]/len(train['Heart_Disease'])*100
print('Train Set')
print(f'ratio of people with heart disease to total is {yes}')
print(f'ratio of people that dont have heart disease to total is {no}')
print('')

yes = test['Heart_Disease'].value_counts()[0]/len(test['Heart_Disease'])*100
no = test['Heart_Disease'].value_counts()[1]/len(test['Heart_Disease'])*100
print('Test Set')
print(f'ratio of people with heart disease to total is {yes}')
print(f'ratio of people that dont have heart disease to total is {no}')

Train Set
ratio of people with heart disease to total is 91.91486261701533
ratio of people that dont have heart disease to total is 8.085137382984666

Test Set
ratio of people with heart disease to total is 91.91530005989866
ratio of people that dont have heart disease to total is 8.084699940101341


In [14]:
## Splitting the X and y variables in the train set
X_train = train.drop("Heart_Disease", axis=1)
y_train = train["Heart_Disease"].copy()

## Splitting the X and y variables in the test set
X_test = test.drop("Heart_Disease", axis=1)
y_test = test["Heart_Disease"].copy()

Printing the number of unique values per each column

In [15]:
X_train.nunique()

General_Health                     5
Checkup                            5
Exercise                           2
Skin_Cancer                        2
Other_Cancer                       2
Depression                         2
Diabetes                           4
Arthritis                          2
Sex                                2
Age_Category                      13
Height_(cm)                       98
Weight_(kg)                      502
BMI                             3502
Smoking_History                    2
Alcohol_Consumption               31
Fruit_Consumption                 73
Green_Vegetables_Consumption      73
FriedPotato_Consumption           67
dtype: int64

Notes:
* There are 8 categorical variables. Variables that are not in order.
* There are 7 numerical variables.
* There are 3 ordinal variables. The General Health, Age Category, and the Checkup variable, The data from this can be represented with an order.

# Creating Pipelines

## Categorical Pipeline

In [16]:
cat_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore',drop='first'))

* For categorical pipeline, only OneHotEncoder will be implemented. Since this dataset has been cleaned and there are no missing values.

## Numerical Pipeline

In [17]:
num_pipeline = make_pipeline(
                             FunctionTransformer(np.log1p,feature_names_out='one-to-one'),
                             StandardScaler()
                            )

For numerical pipeline, two methods are used:
1. Log Transform: From the EDA, most of the numerical functions are skewed right.
2. Standard Scaler: The numerical variable will be scaled to put them all on the same scale.

## Ordinal Pipeline

In [18]:
## Age Category Pipeline
agecat_pipeline = make_pipeline(
        OrdinalEncoder()
)

## General Health Pipeline
genhealth_pipeline = make_pipeline(
        OrdinalEncoder(categories=[['Poor','Fair','Good','Very Good','Excellent']])
)

## Checkup Pipeline
checkup_pipeline = make_pipeline(
        OrdinalEncoder(categories=[['Within the past year','Within the past 2 years','Within the past 5 years','5 or more years ago','Never']])
)

* For the ordinal variable, the variables are transformed based on their order. The values with in the lowest order will start with 0 and increases by 1.

## Creating the pipeline lists

In [19]:
## Setting each column to the pipeline where they will be used
num_pipe_col = numerical

cat_pipe_col = ['Arthritis', 'Depression', 'Diabetes',
       'Exercise', 'Other_Cancer', 'Sex',
       'Skin_Cancer', 'Smoking_History']

## Finalizing the preprocessing pipeline for the testing dataset

In [20]:
## Combining all the pipelines and creating a main pipeline to enter all the data
preprocessing = ColumnTransformer([
    ('Categorical', cat_pipeline,   cat_pipe_col),
    ('Age_Category',agecat_pipeline,['Age_Category']),
    ('Checkup',checkup_pipeline,['Checkup']),
    ('Gen_health',genhealth_pipeline,['General_Health']),
    ('Numerical',   num_pipeline,  num_pipe_col),
],remainder='passthrough')
preprocessing

In [21]:
## Using preprocessing pipeline
print('Shape before the preprocessing:')
print(X_test.shape)

test_preprocessed = preprocessing.fit_transform(X_train)

print('Shape after the preprocessing:')
print(test_preprocessed.shape)

Shape before the preprocessing:
(61771, 18)
Shape after the preprocessing:
(247083, 20)


In [22]:
test_preprocessed[:5]

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  7.00000000e+00,  0.00000000e+00,
         4.00000000e+00,  1.87007552e+00, -1.52461967e+00,
        -4.87091678e-01,  6.13004916e-03, -9.53748411e-01,
        -2.13824879e-01, -1.45449652e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.20000000e+01,  0.00000000e+00,
         3.00000000e+00, -9.10705045e-01, -7.75562687e-01,
        -1.66354705e+00, -2.04183373e-01, -7.34162436e-01,
         8.86072283e-01, -2.05255774e-01],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  9.0

# Model Testing

## Loading the Model

In [24]:
# Load the MLP model from the pickle file
with open('../models/mlp_model_rfe_94_v3.pkl', 'rb') as f:
    mlp_model_rfe = pickle.load(f)

In [31]:
max_iterations = 10

## Initialise Lists to Store Evaluation Metrics After Each Iteration

In [25]:
# Initialize lists to store evaluation metrics after each iteration
accuracy_scores = []
precision_scores = []
recall_scores = []
roc_auc_scores = []
confusion_matrices = []
loss_scores = []

## Plot/Output Testing Progress

In [38]:
# Transform the test set using the same preprocessing pipeline
test_preprocessed = preprocessing.transform(X_test)

# Generate predictions on the test set
test_predictions = mlp_model_rfe.predict(test_preprocessed)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, test_predictions)
precision = precision_score(y_test, test_predictions)
recall = recall_score(y_test, test_predictions)
roc_auc = roc_auc_score(y_test, test_predictions)
conf_matrix = confusion_matrix(y_test, test_predictions)

# Print or visualize the metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)

# Display classification report
classification_rep = classification_report(y_test, test_predictions)
print("Classification Report:\n", classification_rep)

Accuracy: 0.8887018180052128
Precision: 0.23424696241876236
Recall: 0.1659991990388466
ROC AUC: 0.5591343019517462
Confusion Matrix:
 [[54067  2710]
 [ 4165   829]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94     56777
           1       0.23      0.17      0.19      4994

    accuracy                           0.89     61771
   macro avg       0.58      0.56      0.57     61771
weighted avg       0.87      0.89      0.88     61771



# Testing a Single Entry

In [49]:
# Assuming single_entry is a pandas DataFrame with the same features as your model expects
single_entry = pd.DataFrame({
    'General_Health': ['Poor'],
    'Checkup': ['Within the past 2 years'],
    'Exercise': ['Yes'],
    'Skin_Cancer': ['Yes'],
    'Other_Cancer': ['Yes'],
    'Depression': ['Yes'],
    'Diabetes': ['Yes'],
    'Arthritis': ['Yes'],
    'Sex': ['Female'],
    'Age_Category': ['70-74'],
    'Height_(cm)': [150.0],
    'Weight_(kg)': [32.66],
    'BMI': [20004],
    'Smoking_History': ['Yes'],
    'Alcohol_Consumption': [0.0],
    'Fruit_Consumption': [30.0],
    'Green_Vegetables_Consumption': [16.0],
    'FriedPotato_Consumption': [12.0]
})

# Apply the same preprocessing steps
single_entry_preprocessed = preprocessing.transform(single_entry)

# Use the trained model to make predictions
prediction = mlp_model_rfe.predict(single_entry_preprocessed)

# Print or use the prediction as needed
print("Prediction:", prediction[0])


Prediction: 1


In [50]:
# Assuming single_entry is a pandas DataFrame with the same features as your model expects
single_entry = pd.DataFrame({
    'General_Health': ['Poor'],
    'Checkup': ['Within the past 2 years'],
    'Exercise': ['Yes'],
    'Skin_Cancer': ['Yes'],
    'Other_Cancer': ['Yes'],
    'Depression': ['No'],
    'Diabetes': ['No'],
    'Arthritis': ['No'],
    'Sex': ['Female'],
    'Age_Category': ['70-74'],
    'Height_(cm)': [150.0],
    'Weight_(kg)': [32.66],
    'BMI': [14.54],
    'Smoking_History': ['Yes'],
    'Alcohol_Consumption': [0.0],
    'Fruit_Consumption': [30.0],
    'Green_Vegetables_Consumption': [16.0],
    'FriedPotato_Consumption': [12.0]
})

# Apply the same preprocessing steps
single_entry_preprocessed = preprocessing.transform(single_entry)

# Use the trained model to make predictions
prediction = mlp_model_rfe.predict(single_entry_preprocessed)

# Print or use the prediction as needed
print("Prediction:", prediction[0])


Prediction: 0
