In [129]:
# Uncomment and run once if seaborn is not installed in this kernel's environment:
# %pip install seaborn

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw dataset. The path is relative to the notebook's working
# directory, so the CSV must live in a `data/` sub-folder next to it.
df = pd.read_csv('data/heart_2020_cleaned.csv')

# dropna() returns a new frame rather than modifying in place; the result has
# to be assigned back, otherwise rows with missing values are silently kept.
df = df.dropna()

# Show every column when a frame is displayed (no "..." truncation).
pd.set_option('display.max_columns', None)

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [4]:
# Print the column labels as a plain Python list.
print(df.columns.tolist())

['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke', 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime', 'Asthma', 'KidneyDisease', 'SkinCancer']


In [5]:
# One-hot encode the multi-level categorical columns and replace each original
# column with its indicator columns (appended on the right, in the order below).
categorical_columns = ['AgeCategory', 'Diabetic', 'Race', 'GenHealth']

# Raw level names that are ambiguous once they become column headers.
# Age-bracket and race levels keep their original names.
dummy_column_names = {
    # Diabetic
    'No': 'Diabetic_No',
    'Yes': 'Diabetic_Yes',
    'Yes (during pregnancy)': 'Diabetic_During_Pregnancy',
    'No, borderline diabetes': 'Borderline_Diabetes',
    # GenHealth
    'Excellent': 'Excellent_Health',
    'Fair': 'Fair_Health',
    'Good': 'Good_Health',
    'Poor': 'Poor_Health',
    'Very good': 'Very_Good_Health',
}

# dtype=int pins the indicators to 0/1 integers. Without it the dtype depends
# on the installed pandas (bool on pandas >= 2.0), which would turn the
# indicator columns into True/False in displays and in any CSV export.
dummy_frames = [pd.get_dummies(df[cat], dtype=int) for cat in categorical_columns]

encoded = pd.concat(
    [df.drop(columns=categorical_columns), *dummy_frames],
    axis=1,
).rename(columns=dummy_column_names)

# Level names coming from different source columns must not collide, otherwise
# selecting a column by name would return several columns.
assert encoded.columns.is_unique, "duplicate column names after one-hot encoding"

df = encoded

In [6]:
# Preview the first five rows after one-hot encoding.
df.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer,18-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80 or older,Diabetic_No,Borderline_Diabetes,Diabetic_Yes,Diabetic_During_Pregnancy,American Indian/Alaskan Native,Asian,Black,Hispanic,Other,White,Excellent_Health,Fair_Health,Good_Health,Poor_Health,Very_Good_Health
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,Yes,5.0,Yes,No,Yes,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,Yes,7.0,No,No,No,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,Yes,8.0,Yes,No,No,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0
3,No,24.21,No,No,No,0.0,0.0,No,Female,No,6.0,No,No,Yes,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,Yes,8.0,No,No,No,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [7]:
# Columns whose values are the strings 'Yes' / 'No'.
yes_no_columns = [
    'HeartDisease',
    'Smoking',
    'AlcoholDrinking',
    'Stroke',
    'DiffWalking',
    'PhysicalActivity',
    'Asthma',
    'KidneyDisease',
    'SkinCancer',
]

# Lookup tables: text level -> numeric code.
yes_no_map = {'Yes': 1, 'No': 0}
male_female_map = {'Male': 1, 'Female': 0}

# Pair every two-level column with its lookup table and recode it.
# Series.map turns any value that is missing from the table into NaN.
binary_encodings = {**dict.fromkeys(yes_no_columns, yes_no_map), 'Sex': male_female_map}
for column, encoding in binary_encodings.items():
    df[column] = df[column].map(encoding)

In [None]:
# Preview the first five rows after the yes/no and sex columns were recoded.
df.head(n=5)

In [None]:
# Print the column labels of the transformed frame as a plain Python list.
print(df.columns.tolist())

In [None]:
# Row subsets by condition; the condition columns hold 1 for yes and 0 for no.
row_heart_yes = df[df['HeartDisease'].eq(1)]
row_heart_no = df[df['HeartDisease'].eq(0)]
row_kidney_yes = df[df['KidneyDisease'].eq(1)]
row_kidney_no = df[df['KidneyDisease'].eq(0)]

# Unlike the subsets above, these two are whole indicator columns (one value
# per row of df), not filtered rows.
row_diabetes_yes = df['Diabetic_Yes']
row_diabetes_no = df['Diabetic_No']

# Pie chart: number of rows with vs. without heart disease.
heart_labels = ['Has Heart Disease', 'Does Not Have Heart Disease']
heart_counts = [len(row_heart_yes), len(row_heart_no)]
plt.pie(heart_counts, labels=heart_labels)

In [None]:
# Pie chart: number of rows with vs. without kidney disease.
kidney_labels = ['Has Kidney Disease', 'Does Not Have Kidney Disease']
kidney_counts = [len(row_kidney_yes), len(row_kidney_no)]
plt.pie(kidney_counts, labels=kidney_labels)

In [None]:
# Pie chart: rows flagged Diabetic_Yes vs. rows flagged Diabetic_No.
# NOTE: rows in the 'Borderline_Diabetes' and 'Diabetic_During_Pregnancy'
# indicator columns fall into neither slice, so the shares shown are relative
# to the yes + no rows only, not to the whole dataset.
diabetes_labels = ['Has Diabetes', 'Does Not Have Diabetes']

# Series.sum() adds the indicator column in one vectorised call; the builtin
# sum() would iterate over every row in Python to reach the same total.
diabetes_counts = [row_diabetes_yes.sum(), row_diabetes_no.sum()]
plt.pie(diabetes_counts, labels=diabetes_labels)

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all columns.
# The canvas is 40x40 inches so the per-cell annotations have room.
fig, ax = plt.subplots(figsize=(40, 40))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, ax=ax)

In [None]:
target = 'HeartDisease'

# Full correlation matrix per method, reduced to the target's column and
# ranked from the highest to the lowest coefficient.
pearson_matrix = df.corr(method='pearson')
spearman_matrix = df.corr(method='spearman')
heart_disease_pearson = pearson_matrix[[target]].sort_values(by=target, ascending=False)
heart_disease_spearman = spearman_matrix[[target]].sort_values(by=target, ascending=False)

# Positions 1..9 of each ranking. Position 0 is skipped on the expectation
# that it is the target's correlation with itself.
top_pearson = heart_disease_pearson.iloc[1:10]
top_spearman = heart_disease_spearman.iloc[1:10]

print(f"Pearson:\n{top_pearson}")
print(f"Spearman:\n{top_spearman}")

In [None]:
# Persist the fully numeric frame next to the raw data.
# index=True is the to_csv default, spelled out here: the row index is written
# as an extra, unnamed first column.
# NOTE(review): if nothing downstream relies on that column, index=False would
# keep the export free of it - confirm with the consumers of this file.
df.to_csv('data/heart_2020_cleaned_binary.csv', index=True)