In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score

#importing csv
df = pd.read_csv('data/Heart_Disease_Prediction.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      270 non-null    int64  
 1   Sex                      270 non-null    int64  
 2   Chest pain type          270 non-null    int64  
 3   BP                       270 non-null    int64  
 4   Cholesterol              270 non-null    int64  
 5   FBS over 120             270 non-null    int64  
 6   EKG results              270 non-null    int64  
 7   Max HR                   270 non-null    int64  
 8   Exercise angina          270 non-null    int64  
 9   ST depression            270 non-null    float64
 10  Slope of ST              270 non-null    int64  
 11  Number of vessels fluro  270 non-null    int64  
 12  Thallium                 270 non-null    int64  
 13  Heart Disease            270 non-null    object 
dtypes: float64(1), int64(12), 

In [29]:
#B1 Data Preprocessing
#Convert categorical variables, data cleaning, handle missing values -> Upload preprocessed dataset to GitLab
#Converting Absence/Presnce to Binary Presence Flag
df['Heart Disease'] = (df['Heart Disease'] == 'Presence').astype(int)

df['Heart Disease'].value_counts() # 0 = Absence of HD; 1 = Presene of HD

Heart Disease
0    150
1    120
Name: count, dtype: int64

In [30]:
#Check for null values
df.isnull().sum()

Age                        0
Sex                        0
Chest pain type            0
BP                         0
Cholesterol                0
FBS over 120               0
EKG results                0
Max HR                     0
Exercise angina            0
ST depression              0
Slope of ST                0
Number of vessels fluro    0
Thallium                   0
Heart Disease              0
dtype: int64

In [None]:
#Set 'Heart Disease' as the target column
#B2 Build the algorithm
X = df.drop(columns=['Heart Disease'])
y = df['Heart Disease']

#Split data to training/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#B3 Assigning and train model
rfc_model = RandomForestClassifier(random_state=42)
rfc_model.fit(X_train, y_train)
y_pred = rfc_model.predict(X_test)

#B4 Evaluate model accuracy
#Evaluation metrics - f1score, precision, recall, auc-roc
rfc_model_f1_score = f1_score(y_test, y_pred)
rfc_precision = precision_score(y_test, y_pred)
rfc_recall = recall_score(y_test, y_pred)
rfc_auc_roc = roc_auc_score(y_test, y_pred)

#Display Base Metrics
print("Base Model Evaluation Metrics")
print("Precision: {:.5f}".format(rfc_precision))
print("Recall: {:.5f}".format(rfc_recall))
print("AUC ROC: {:.5f}".format(rfc_auc_roc))
print("F1 Score: {:.5f}".format(rfc_model_f1_score))

Base Model Evaluation Metrics
Precision: 0.77778
Recall: 0.66667
AUC ROC: 0.77273
F1 Score: 0.71795
