In [1]:
import pandas as pd

#Read heart.csv from the working directory into a DataFrame and preview its first five rows
dataset = pd.read_csv(filepath_or_buffer="./heart.csv")
dataset.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [2]:
#Check Dimensions: shape is a (number of rows, number of columns) tuple
dataset.shape

(918, 12)

In [3]:
#Pairwise Pearson correlation matrix; numeric_only=True leaves the text (categorical) columns out
dataset.corr(method="pearson", numeric_only=True)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
Age,1.0,0.254399,-0.095282,0.198039,-0.382045,0.258612,0.282039
RestingBP,0.254399,1.0,0.100893,0.070193,-0.112135,0.164803,0.107589
Cholesterol,-0.095282,0.100893,1.0,-0.260974,0.235792,0.050148,-0.232741
FastingBS,0.198039,0.070193,-0.260974,1.0,-0.131438,0.052698,0.267291
MaxHR,-0.382045,-0.112135,0.235792,-0.131438,1.0,-0.160691,-0.400421
Oldpeak,0.258612,0.164803,0.050148,0.052698,-0.160691,1.0,0.403951
HeartDisease,0.282039,0.107589,-0.232741,0.267291,-0.400421,0.403951,1.0


In [4]:
#Check number of occurrences of each category in the categorical columns
#sep="\n\n" puts a blank line between the summaries; with print's default single-space
#separator each Series started on the same line as the previous one's "Name: ..., dtype: ..." footer
print(
    dataset['RestingECG'].value_counts(), #Resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
    dataset['ChestPainType'].value_counts(), #[TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
    dataset['ExerciseAngina'].value_counts(), #exercise-induced angina [Y: Yes, N: No]
    dataset['ST_Slope'].value_counts(), #the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
    dataset['Sex'].value_counts(), #sex of the patient [M: Male, F: Female]
    sep="\n\n"
)

Normal    552
LVH       188
ST        178
Name: RestingECG, dtype: int64 ASY    496
NAP    203
ATA    173
TA      46
Name: ChestPainType, dtype: int64 N    547
Y    371
Name: ExerciseAngina, dtype: int64 Flat    460
Up      395
Down     63
Name: ST_Slope, dtype: int64 M    725
F    193
Name: Sex, dtype: int64


In [5]:
#Convert the categorical columns into numerical 0/1 indicator columns
#drop_first=True omits the indicator for the first category level of each column,
#so that level is represented by zeros in all of the column's remaining indicators
#dtype=int pins the indicators to integers: the default dtype differs between pandas
#versions (boolean from pandas 2.0 on), which would change the previewed values from 0/1 to False/True
restingECG = pd.get_dummies(dataset['RestingECG'], drop_first=True, dtype=int)
chestPainType = pd.get_dummies(dataset['ChestPainType'], drop_first=True, dtype=int)
exerciseAngina = pd.get_dummies(dataset['ExerciseAngina'], drop_first=True, dtype=int)
st_slope = pd.get_dummies(dataset['ST_Slope'], drop_first=True, dtype=int)
sex = pd.get_dummies(dataset['Sex'], drop_first=True, dtype=int)

In [6]:
#Swap the original categorical columns for their indicator-encoded replacements
#NOTE: this rebinds `dataset`, so the cell is meant to be run once per kernel session
dataset = pd.concat(
    [
        dataset.drop(columns=["Sex","ST_Slope","ExerciseAngina","RestingECG","ChestPainType"]),
        restingECG,
        chestPainType,
        exerciseAngina,
        st_slope,
        sex,
    ],
    axis="columns",
)
dataset.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Normal,ST,ATA,NAP,TA,Y,Flat,Up,M
0,40,140,289,0,172,0.0,0,1,0,1,0,0,0,0,1,1
1,49,160,180,0,156,1.0,1,1,0,0,1,0,0,1,0,0
2,37,130,283,0,98,0.0,0,0,1,1,0,0,0,0,1,1
3,48,138,214,0,108,1.5,1,1,0,0,0,0,1,1,0,0
4,54,150,195,0,122,0.0,0,1,0,0,1,0,0,0,1,1


In [7]:
#Separate the target column (Y) from the remaining feature columns (X)
Y = dataset.loc[:, "HeartDisease"]
X = dataset.drop(columns="HeartDisease")

In [8]:
#Split input and output into training and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,   #hold out 20% of the dataset for testing accuracy
    random_state=0,  #fixed seed so the same split is produced on every run
)

In [9]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler

#Fit the scaler on the training split only, then apply that same fitted scaler to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
#Wrap every element of each split in its own one-element list (extra bracket level),
#which is the layout needed to make the data work in the neural network module
#NOTE: re-running this cell adds another level of nesting each time
X_train, Y_train, X_test, Y_test = (
    [[entry] for entry in split]
    for split in (X_train, Y_train, X_test, Y_test)
)

In [11]:
#Save variables data for machine learning using pickle
import pickle

def save_file(name, var):
    """Serialize ``var`` with pickle into the file ``<name>.pckl``.

    Args:
        name: Output path without extension; ".pckl" is appended. An existing
            file at that path is overwritten (the file is opened in "wb" mode).
        var: The object to serialize; it must be picklable.

    Returns:
        None.

    Raises:
        OSError: If the file cannot be opened for writing.
        pickle.PicklingError: If ``var`` cannot be pickled (other exception
            types raised by ``pickle.dump`` propagate unchanged as well).
    """
    # The context manager closes the handle even when pickle.dump raises,
    # so a failed dump no longer leaks an open file descriptor.
    with open(f"{name}.pckl", "wb") as f:
        pickle.dump(var, f)

#Persist the four data splits and the fitted scaler, one .pckl file per object
save_file(name="X_train", var=X_train)
save_file(name="X_test", var=X_test)
save_file(name="Y_train", var=Y_train)
save_file(name="Y_test", var=Y_test)
save_file(name="scaler", var=scaler)