In [27]:
# After speaking with my mentor, I decided to use the Heart Failure Prediction data set from Kaggle
# to see whether heart disease is predictable from sex, age, and 9 other variables

import os
import pandas as pd
import matplotlib.pyplot as plt



In [89]:
# Data collection: the data set comes from Kaggle
# https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction

# Load the CSV into a DataFrame, then preview the first few rows
heart_csv_path = 'data/heart.csv'
df = pd.read_csv(heart_csv_path)
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,0,0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,1,1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,2,2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,3,3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,4,4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [29]:
# The preview above looks good: the column names match up and make sense
# Everything lives in one CSV file, and all of it has been loaded into the DataFrame

# Data cleanup
# Count the NA values in each column of the DataFrame

na_per_column = df.isna().sum()
na_per_column

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [30]:
# No NA values were found in the data
# Duplicates are not checked: the rows have no key (such as a name),
# and multiple rows can share the same values


numeric_summary = df.describe()
numeric_summary

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [31]:
# Max HR should never exceed 202 according to the data provider, and the describe table above agrees
# No column has missing fields
# Age, RestingBP, Cholesterol, FastingBS and MaxHR contain no negative numbers, which is also accurate


# Next, look at the fields that do not hold int/float values

angina_values = df['ExerciseAngina'].unique()
angina_values



array(['N', 'Y'], dtype=object)

In [10]:
# ExerciseAngina only contains 'Y' and 'N' values, which is accurate

chest_pain_values = df['ChestPainType'].unique()
chest_pain_values

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [11]:
# ChestPainType has only 4 unique values, and all of them are accurate

resting_ecg_values = df['RestingECG'].unique()
resting_ecg_values

array(['Normal', 'ST', 'LVH'], dtype=object)

In [12]:
# RestingECG has 3 unique values, and all of them are accurate

sex_values = df['Sex'].unique()
sex_values

array(['M', 'F'], dtype=object)

In [13]:
# Sex has only 2 unique values, which is accurate according to the data provider

st_slope_values = df['ST_Slope'].unique()
st_slope_values

array(['Up', 'Flat', 'Down'], dtype=object)

In [10]:
# ST_Slope has only 3 unique values, all of which are mentioned by the data provider

# The categorical fields look clean; finish by checking the dtype of every column

column_dtypes = df.dtypes
print(column_dtypes)

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object


In [33]:
# Take another look at the summary statistics of the numeric columns
summary_stats = df.describe()
summary_stats

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [81]:
# The minimum of RestingBP and Cholesterol should not be 0, so such values are either
# replaced with the median or, if only a single row is affected, removed from the data set completely
zero_bp_rows = df[df['RestingBP'] == 0]
fix_restingBP = zero_bp_rows.value_counts()
# Remove the rows with a RestingBP of 0 from df itself
df.drop(index=zero_bp_rows.index, inplace=True)
df.describe()

Unnamed: 0.1,Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,458.51036,53.509269,132.540894,243.211559,0.23337,136.789531,0.886696,0.55289
std,265.292578,9.437636,17.999749,53.430044,0.423206,25.467129,1.06696,0.497466
min,0.0,28.0,80.0,85.0,0.0,60.0,-2.6,0.0
25%,229.0,47.0,120.0,214.0,0.0,120.0,0.0,0.0
50%,459.0,54.0,130.0,237.0,0.0,138.0,0.6,1.0
75%,688.0,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,917.0,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [95]:
# Cholesterol should never be 0, so every 0 reading is replaced with the median
# of the valid (non-zero) readings.
fix_cholesterol = df[df['Cholesterol'] == 0].value_counts()

# Take the median of the Cholesterol column only. Calling .median() on the whole
# DataFrame also tries to aggregate the text columns, which triggers a warning on
# older pandas and fails on pandas 2.x.
# The result is rounded to an int so the column keeps its integer dtype.
cholesterol_med = int(round(df.loc[df['Cholesterol'] > 0, 'Cholesterol'].median()))
df.loc[df['Cholesterol'] <= 0, 'Cholesterol'] = cholesterol_med

# Drop leftover index columns ("Unnamed: 0", "Unnamed: 0.1", ...) by name.
# Dropping df.columns[0] by position would remove a real feature whenever the
# CSV has no such column, and would leave extras behind when it has several.
stale_index_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
df.drop(columns=stale_index_cols, inplace=True)
df.describe()


  cholesterol_med = df[df['Cholesterol'] > 0].median()


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,53.509269,132.540894,243.211559,0.23337,136.789531,0.886696,0.55289
std,9.437636,17.999749,53.430044,0.423206,25.467129,1.06696,0.497466
min,28.0,80.0,85.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,214.0,0.0,120.0,0.0,0.0
50%,54.0,130.0,237.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [96]:
# Save the cleaned data for the next step.
# index=False: the row index is not data, and writing it adds an extra
# "Unnamed: 0" column every time the file is read back with pd.read_csv.
# The result goes to its own file so the raw data/heart.csv is never overwritten
# and re-running this notebook from the top always starts from the same input.
df.to_csv('data/heart_clean.csv', index=False)

# All columns have accurate datatypes. The columns holding strings are the ones reported as 'object' in the dtypes listing


# This data is clean and ready for the Exploratory Data Analysis step

In [97]:
# Final look at the summary statistics of the numeric columns
final_summary = df.describe()
final_summary

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,53.509269,132.540894,243.211559,0.23337,136.789531,0.886696,0.55289
std,9.437636,17.999749,53.430044,0.423206,25.467129,1.06696,0.497466
min,28.0,80.0,85.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,214.0,0.0,120.0,0.0,0.0
50%,54.0,130.0,237.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0
