In [0]:
import os

if not os.path.exists("heart-attack-prediction.csv"):
    !rm -rf sample_data
    from google.colab import files
    files.upload()
    !xz -d *.xz
    print("Uploaded files:")
    !ls
else:
    print("Already have files:")
    !ls

Already have files:
heart-attack-prediction.csv


In [0]:
# Prepare dataframe
import pandas as pd
import numpy as np

df = (pd.read_csv("heart-attack-prediction.csv", na_values="?")
#         .replace("?", np.nan) # change ? to nan
        
        .drop(["ca", "thal", "slope"], axis=1)) # drop columns w/ too many nulls

df["restecg"] = df["restecg"].astype(pd.Int64Dtype())

# Encode feature cp using one-hot encoding
df = pd.concat([df, pd.get_dummies(df["cp"], prefix="cp", drop_first=True)], axis=1) # Don't drop first if using SVM
df = pd.concat([df, pd.get_dummies(df["restecg"], prefix="restecg", drop_first=True)], axis=1)


# Reorder and drop columns
df = df[["age", "sex", "cp_2", "cp_3", "cp_4", "trestbps", "chol", "fbs", "restecg_1", "restecg_2", "thalach", "exang", "oldpeak", "num"]]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 14 columns):
age          294 non-null int64
sex          294 non-null int64
cp_2         294 non-null uint8
cp_3         294 non-null uint8
cp_4         294 non-null uint8
trestbps     293 non-null float64
chol         271 non-null float64
fbs          286 non-null float64
restecg_1    294 non-null uint8
restecg_2    294 non-null uint8
thalach      293 non-null float64
exang        293 non-null float64
oldpeak      294 non-null float64
num          294 non-null int64
dtypes: float64(6), int64(3), uint8(5)
memory usage: 22.2 KB


In [0]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")

imputer.fit(df)

df = pd.DataFrame(imputer.transform(df), columns = df.columns)

df.head()

Unnamed: 0,age,sex,cp_2,cp_3,cp_4,trestbps,chol,fbs,restecg_1,restecg_2,thalach,exang,oldpeak,num
0,28.0,1.0,1.0,0.0,0.0,130.0,132.0,0.0,0.0,1.0,185.0,0.0,0.0,0.0
1,29.0,1.0,1.0,0.0,0.0,120.0,243.0,0.0,0.0,0.0,160.0,0.0,0.0,0.0
2,29.0,1.0,1.0,0.0,0.0,140.0,250.848708,0.0,0.0,0.0,170.0,0.0,0.0,0.0
3,30.0,0.0,0.0,0.0,0.0,170.0,237.0,0.0,1.0,0.0,170.0,0.0,0.0,0.0
4,31.0,0.0,1.0,0.0,0.0,100.0,219.0,0.0,1.0,0.0,150.0,0.0,0.0,0.0


In [0]:
from sklearn.model_selection import train_test_split

x = df.drop("num", axis=1)
y = df["num"]


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

print(len(x_train), 'train examples')
print(len(x_test), 'test examples')

235 train examples
59 test examples


In [0]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=10, max_depth=5)

model.fit(x_train, y_train)

y_predict = model.predict(x_test)

In [0]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_predict, y_test)

cm

array([[31,  5],
       [ 2, 21]])

In [0]:
acc = (cm[0][0]+cm[1][1])/(cm.sum().sum())
prec = (cm[1][1])/(cm[1][0]+cm[1][1])
rec = (cm[1][1])/(cm[0][1]+cm[1][1])
f1 = 2*prec*rec/(prec+rec)
f2 = 5*prec*rec/(4*prec+rec)

metrics = pd.DataFrame({"Metric": ["f1", "f2", "precision", "recall", "accuracy"], "Score": [f1, f2, prec, rec, acc]})
metrics

Unnamed: 0,Metric,Score
0,f1,0.857143
1,f2,0.826772
2,precision,0.913043
3,recall,0.807692
4,accuracy,0.881356
