# Heart Disease Dataset


In [1]:
# import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# read the heart-disease CSV into a dataframe and preview its first ten rows
df = pd.read_csv(filepath_or_buffer="heart.csv")
df.head(n=10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


## Data Cleaning

In [3]:
# inspect the column labels of the freshly loaded dataframe
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

Most of the columns have names that don't make intuitive sense to me, as I am not familiar with the cardiovascular system and heart diseases. As a result, I will rename the columns to names that make more sense to me.

In [4]:
# abbreviated label -> descriptive label; columns not listed here
# ("age", "sex", "thal", "target") keep their original names
columns = {
    "cp": "chest_pain_type",
    "trestbps": "rest_bp",
    "chol": "cholesterol",
    "fbs": "fasting_blood_sugar",
    "restecg": "rest_ecg",
    "thalach": "max_heart_rate",
    "exang": "exercise_angina",
    "oldpeak": "exercise_st_depression",
    "slope": "exercise_st_slope",
    "ca": "n_vessels_fluoro",
}
df = df.rename(columns=columns)
df.columns

Index(['age', 'sex', 'chest_pain_type', 'rest_bp', 'cholesterol',
       'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate', 'exercise_angina',
       'exercise_st_depression', 'exercise_st_slope', 'n_vessels_fluoro',
       'thal', 'target'],
      dtype='object')

In [5]:
# count the missing entries in each column
df.isna().sum()

age                       0
sex                       0
chest_pain_type           0
rest_bp                   0
cholesterol               0
fasting_blood_sugar       0
rest_ecg                  0
max_heart_rate            0
exercise_angina           0
exercise_st_depression    0
exercise_st_slope         0
n_vessels_fluoro          0
thal                      0
target                    0
dtype: int64

In [6]:
# check the datatype of each column
df.dtypes

age                         int64
sex                         int64
chest_pain_type             int64
rest_bp                     int64
cholesterol                 int64
fasting_blood_sugar         int64
rest_ecg                    int64
max_heart_rate              int64
exercise_angina             int64
exercise_st_depression    float64
exercise_st_slope           int64
n_vessels_fluoro            int64
thal                        int64
target                      int64
dtype: object

It seems that there are no missing values and all values are the correct datatype.

In [7]:
# tally rows that exactly repeat an earlier row (True) versus rows that do not (False)
df.duplicated(keep="first").value_counts()

False    302
True       1
dtype: int64

In [8]:
# At least one row is flagged as a duplicate, so list every copy involved
# (keep=False marks all members of a duplicate group, not just the later ones)
df.loc[df.duplicated(keep=False)]

Unnamed: 0,age,sex,chest_pain_type,rest_bp,cholesterol,fasting_blood_sugar,rest_ecg,max_heart_rate,exercise_angina,exercise_st_depression,exercise_st_slope,n_vessels_fluoro,thal,target
163,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1
164,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1


So it seems that rows 163 and 164 are duplicates of each other. I think it is highly unlikely that two different patients would have identical values in all 14 columns, so I will treat this as a duplicated entry and remove one of the copies.

In [12]:
# remove duplicate rows, keeping the first occurrence of each
df = df.drop_duplicates(keep="first")
# confirm that no row is flagged as a duplicate any more (an empty frame is expected)
df.loc[df.duplicated(keep=False)]

Unnamed: 0,age,sex,chest_pain_type,rest_bp,cholesterol,fasting_blood_sugar,rest_ecg,max_heart_rate,exercise_angina,exercise_st_depression,exercise_st_slope,n_vessels_fluoro,thal,target


For now, I don't think there are any problems with the data.
## Data Exploration

In [9]:
# count how many rows carry each value of the target column
# NOTE(review): the saved output sums to 303 rows, but 302 are expected once the
# duplicate row has been dropped; this cell was executed before the dedupe cell
# (see the execution counts) -- restart the kernel and run all cells to refresh it
df["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

In [10]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
# NOTE(review): the saved output reports count 303.0, i.e. it predates the removal of
# the duplicate row -- restart the kernel and run all cells to refresh it
df.describe()

Unnamed: 0,age,sex,chest_pain_type,rest_bp,cholesterol,fasting_blood_sugar,rest_ecg,max_heart_rate,exercise_angina,exercise_st_depression,exercise_st_slope,n_vessels_fluoro,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0
