In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from pathlib import Path
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler,OneHotEncoder


In [3]:
columns = [
    "HeartDisease", "BMI", "Smoking", "AlcoholDrinking",
    "Stroke", "PhysicalHealth", "MentalHealth", "DiffWalking",
    "Sex", "AgeCategory", "Race", "Diabetic",
    "PhysicalActivity", "GenHealth", "SleepTime", "Asthma",
    "KidneyDisease","SkinCancer"
]

target = ["HeartDisease"]

In [4]:
healthy_hearts_df = pd.read_csv("heart_2020_cleaned.csv")
healthy_hearts_df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [5]:
# Convert Yes and No to 1 and 0 respectively 
x = {'Yes': 1}   
healthy_hearts_df = healthy_hearts_df.replace(x)

x = {'No': 0}    
healthy_hearts_df = healthy_hearts_df.replace(x)

# Convert numeric Diabetic values back to Yes and No for encoding 
healthy_hearts_df.loc[(healthy_hearts_df.HeartDisease == 0), 'HeartDisease'] = "No"
healthy_hearts_df.loc[(healthy_hearts_df.HeartDisease == 1), 'HeartDisease'] = "Yes"

healthy_hearts_df.reset_index(inplace=True, drop=True)

In [6]:
healthy_hearts_df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,1,0,0,3.0,30.0,0,Female,55-59,White,1,1,Very good,5.0,1,0,1
1,No,20.34,0,0,1,0.0,0.0,0,Female,80 or older,White,0,1,Very good,7.0,0,0,0
2,No,26.58,1,0,0,20.0,30.0,0,Male,65-69,White,1,1,Fair,8.0,1,0,0
3,No,24.21,0,0,0,0.0,0.0,0,Female,75-79,White,0,0,Good,6.0,0,0,1
4,No,23.71,0,0,0,28.0,0.0,1,Female,40-44,White,0,1,Very good,8.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,1,0,0,7.0,0.0,1,Male,60-64,Hispanic,1,0,Fair,6.0,1,0,0
319791,No,29.84,1,0,0,0.0,0.0,0,Male,35-39,Hispanic,0,1,Very good,5.0,1,0,0
319792,No,24.24,0,0,0,0.0,0.0,0,Female,45-49,Hispanic,0,1,Good,6.0,0,0,0
319793,No,32.81,0,0,0,0.0,0.0,0,Female,25-29,Hispanic,0,0,Good,12.0,0,0,0


In [7]:
# Determine the number of unique values in each column.
healthy_hearts_df.nunique()


HeartDisease           2
BMI                 3604
Smoking                2
AlcoholDrinking        2
Stroke                 2
PhysicalHealth        31
MentalHealth          31
DiffWalking            2
Sex                    2
AgeCategory           13
Race                   6
Diabetic               4
PhysicalActivity       2
GenHealth              5
SleepTime             24
Asthma                 2
KidneyDisease          2
SkinCancer             2
dtype: int64

In [8]:
#Check the data types of each column 
healthy_hearts_df.dtypes

HeartDisease         object
BMI                 float64
Smoking               int64
AlcoholDrinking       int64
Stroke                int64
PhysicalHealth      float64
MentalHealth        float64
DiffWalking           int64
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity      int64
GenHealth            object
SleepTime           float64
Asthma                int64
KidneyDisease         int64
SkinCancer            int64
dtype: object

In [9]:
# Convert numeric Diabetic values back to Yes and No for encoding 
healthy_hearts_df.loc[(healthy_hearts_df.Diabetic == 0), 'Diabetic'] = "No"
healthy_hearts_df.loc[(healthy_hearts_df.Diabetic == 1), 'Diabetic'] = "Yes"

healthy_hearts_df['Diabetic'].value_counts()

No                         269653
Yes                         40802
No, borderline diabetes      6781
Yes (during pregnancy)       2559
Name: Diabetic, dtype: int64

In [10]:
# Create the OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit the encoder and produce encoded DataFrame for Sex
encode_df = pd.DataFrame(enc.fit_transform(healthy_hearts_df.Sex.values.reshape(-1,1)))

# Rename encoded columns
encode_df.columns = enc.get_feature_names(['Sex'])

# Fit the encoder and produce encoded DataFrame for Age Category 
encode_df2 = pd.DataFrame(enc.fit_transform(healthy_hearts_df.AgeCategory.values.reshape(-1,1)))

# Rename encoded columns
encode_df2.columns = enc.get_feature_names(['AgeCategory'])
encode_df = encode_df.merge(encode_df2,left_index=True,right_index=True)

# Fit the encoder and produce encoded DataFrame for Race
encode_df3 = pd.DataFrame(enc.fit_transform(healthy_hearts_df.Race.values.reshape(-1,1)))

# Rename encoded columns
encode_df3.columns = enc.get_feature_names(['Race'])
encode_df3.head()
encode_df = encode_df.merge(encode_df3,left_index=True,right_index=True)

# Fit the encoder and produce encoded DataFrame for General Health
encode_df4 = pd.DataFrame(enc.fit_transform(healthy_hearts_df.GenHealth.values.reshape(-1,1)))

# Rename encoded columns
encode_df4.columns = enc.get_feature_names(['GenHealth'])
encode_df = encode_df.merge(encode_df4,left_index=True,right_index=True)

# Fit the encoder and produce encoded DataFrame for Diabetes
encode_df5 = pd.DataFrame(enc.fit_transform(healthy_hearts_df.Diabetic.values.reshape(-1,1)))

# Rename encoded columns
encode_df5.columns = enc.get_feature_names(['Diabetic'])
encode_df = encode_df.merge(encode_df5,left_index=True,right_index=True)
encode_df

Unnamed: 0,Sex_Female,Sex_Male,AgeCategory_18-24,AgeCategory_25-29,AgeCategory_30-34,AgeCategory_35-39,AgeCategory_40-44,AgeCategory_45-49,AgeCategory_50-54,AgeCategory_55-59,...,Race_White,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
319791,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
319792,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
319793,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [11]:
#Merge the dataframes and drop the original columns that were encoded 
healthy_hearts_df = healthy_hearts_df.merge(encode_df,left_index=True,right_index=True).drop(["GenHealth","Sex","Race","Diabetic","AgeCategory"],axis=1)
healthy_hearts_df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,PhysicalActivity,SleepTime,...,Race_White,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
0,No,16.60,1,0,0,3.0,30.0,0,1,5.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,No,20.34,0,0,1,0.0,0.0,0,1,7.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,No,26.58,1,0,0,20.0,30.0,0,1,8.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,No,24.21,0,0,0,0.0,0.0,0,0,6.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,No,23.71,0,0,0,28.0,0.0,1,1,8.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,1,0,0,7.0,0.0,1,0,6.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
319791,No,29.84,1,0,0,0.0,0.0,0,1,5.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
319792,No,24.24,0,0,0,0.0,0.0,0,1,6.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
319793,No,32.81,0,0,0,0.0,0.0,0,0,12.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [12]:
# Create our features
x_cols = [i for i in healthy_hearts_df.columns if i not in ('HeartDisease')]
X = healthy_hearts_df[x_cols]
X=pd.get_dummies(X)

# Create our target
y = healthy_hearts_df['HeartDisease']

In [13]:
# Check the amount of points for each result in the Data frame 
y.value_counts()

No     292422
Yes     27373
Name: HeartDisease, dtype: int64

In [14]:
# Split into testing and training sets 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)
Counter(y_train)

Counter({'No': 219316, 'Yes': 20530})

In [18]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)
Counter(y_resampled)

Counter({'No': 219316, 'Yes': 219316})

In [19]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [20]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7295732549469375

In [21]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)


array([[55136, 17970],
       [ 2019,  4824]], dtype=int64)

In [22]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.96      0.75      0.70      0.85      0.73      0.53     73106
        Yes       0.21      0.70      0.75      0.33      0.73      0.53      6843

avg / total       0.90      0.75      0.71      0.80      0.73      0.53     79949



In [27]:
# Resample the training data with the BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=50, random_state=78) 
rf_model = rf_model.fit(X_train, y_train)
predictions = rf_model.predict(X_test)

In [28]:
# Calculated the balanced accuracy score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5523725388007754

In [29]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[71469,  1637],
       [ 5973,   870]], dtype=int64)

In [30]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.92      0.98      0.13      0.95      0.35      0.13     73106
        Yes       0.35      0.13      0.98      0.19      0.35      0.11      6843

avg / total       0.87      0.90      0.20      0.88      0.35      0.13     79949

