In [0]:
import os

# Fetch the dataset only when the CSV is not already in the working directory,
# so re-running this cell does not prompt for another upload.
if not os.path.exists("heart-attack-prediction.csv"):
    # Delete the sample_data directory if present (presumably Colab's bundled
    # demo data, removed to keep the listing below short -- confirm).
    !rm -rf sample_data
    # Colab-only module: files.upload() prompts for files through the browser.
    from google.colab import files
    files.upload()
    # Decompress every .xz archive in the working directory; presumably the CSV
    # is uploaded xz-compressed -- confirm against the actual upload.
    !xz -d *.xz
    print("Uploaded files:")
    !ls
else:
    print("Already have files:")
    !ls

Already have files:
heart-attack-prediction.csv


In [0]:
# Prepare dataframe
import pandas as pd
import numpy as np

# Final column layout: model features first, target ("num") last.
MODEL_COLUMNS = [
    "age", "sex",
    "cp_2", "cp_3", "cp_4",
    "trestbps", "chol", "fbs",
    "restecg_1", "restecg_2",
    "thalach", "exang", "oldpeak",
    "num",
]

# "?" marks a missing value in the CSV, so it is parsed as NaN at read time.
heart_raw = pd.read_csv("heart-attack-prediction.csv", na_values="?")

# ca, thal and slope are dropped because they contain too many nulls.
heart = heart_raw.drop(columns=["ca", "thal", "slope"])

# Nullable integer dtype for restecg; presumably this keeps the dummy column
# names integer-valued (restecg_1 rather than restecg_1.0) when values are missing.
heart["restecg"] = heart["restecg"].astype("Int64")

# One-hot encode the categorical features. drop_first removes the first level
# of each to avoid a redundant column (keep the first level if using an SVM).
cp_dummies = pd.get_dummies(heart["cp"], prefix="cp", drop_first=True)
restecg_dummies = pd.get_dummies(heart["restecg"], prefix="restecg", drop_first=True)

# Attach the dummies, then keep only the model columns in their final order
# (this also discards the original cp and restecg columns).
df = pd.concat([heart, cp_dummies, restecg_dummies], axis=1)[MODEL_COLUMNS]

print(df.head())

# Show the distinct values of every column as a quick sanity check.
for name, values in df.items():
    print(name, values.unique(), "\n")

   age  sex  cp_2  cp_3  cp_4  ...  restecg_2  thalach  exang  oldpeak  num
0   28    1     1     0     0  ...          1    185.0    0.0      0.0    0
1   29    1     1     0     0  ...          0    160.0    0.0      0.0    0
2   29    1     1     0     0  ...          0    170.0    0.0      0.0    0
3   30    0     0     0     0  ...          0    170.0    0.0      0.0    0
4   31    0     1     0     0  ...          0    150.0    0.0      0.0    0

[5 rows x 14 columns]
age [28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
 52 53 54 55 56 57 58 59 60 61 62 63 65 66] 

sex [1 0] 

cp_2 [1 0] 

cp_3 [0 1] 

cp_4 [0 1] 

trestbps [130. 120. 140. 170. 100. 105. 110. 125. 150.  98. 112. 145. 190. 160.
 115. 142. 180. 132. 135.  nan 108. 124. 113. 122.  92. 118. 106. 200.
 138. 136. 128. 155.] 

chol [132. 243.  nan 237. 219. 198. 225. 254. 298. 161. 214. 220. 160. 167.
 308. 264. 166. 340. 209. 260. 211. 173. 283. 194. 223. 315. 275. 297.
 292. 182. 200. 204. 241.

In [0]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the target column "num".
x = df.drop("num", axis=1)
y = df["num"]

# Fixed seed so the 80/20 split -- and every score computed from it further
# down -- is identical on each Restart & Run All instead of changing per run.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=SPLIT_SEED
)

print(len(x_train), 'train examples')
print(len(x_test), 'test examples')

235 train examples
59 test examples


In [0]:
from xgboost import XGBClassifier

# Train a gradient-boosted tree classifier with the library's default
# hyperparameters (fit returns the estimator itself), then predict the
# class label for every held-out test row.
model = XGBClassifier().fit(x_train, y_train)
y_predict = model.predict(x_test)

In [0]:
from sklearn.metrics import confusion_matrix

# scikit-learn's signature is confusion_matrix(y_true, y_pred), which yields
# rows = actual class and columns = predicted class. The transpose keeps the
# layout this notebook works with: rows = predicted class, columns = actual class.
cm = confusion_matrix(y_test, y_predict).T

cm

array([[30,  6],
       [ 5, 18]])

In [0]:
# cm is laid out with rows = predicted class and columns = actual class,
# and class 1 is treated as the positive class.
true_neg, false_neg = cm[0][0], cm[0][1]
false_pos, true_pos = cm[1][0], cm[1][1]

acc = (true_neg + true_pos) / cm.sum()
prec = true_pos / (false_pos + true_pos)
rec = true_pos / (false_neg + true_pos)

# F-beta = (1 + beta^2) * P * R / (beta^2 * P + R); beta = 2 weights recall
# more heavily than precision.
f1 = 2 * prec * rec / (prec + rec)
f2 = 5 * prec * rec / (4 * prec + rec)

metrics = pd.DataFrame({
    "Metric": ["f1", "f2", "precision", "recall", "accuracy"],
    "Score": [f1, f2, prec, rec, acc],
})
metrics

Unnamed: 0,Metric,Score
0,f1,0.765957
1,f2,0.756303
2,precision,0.782609
3,recall,0.75
4,accuracy,0.813559
