In [28]:
import os

if not os.path.exists("heart-attack-prediction.csv"):
    !rm -rf sample_data
    from google.colab import files
    files.upload()
    !xz -d *.xz
    print("Uploaded files:")
    !ls
else:
    print("Already have files:")
    !ls

Already have files:
heart-attack-prediction.csv


In [29]:
# Prepare dataframe
import pandas as pd
import numpy as np

df = (pd.read_csv("heart-attack-prediction.csv", na_values="?")
#         .replace("?", np.nan) # change ? to nan
        
        .drop(["ca", "thal", "slope"], axis=1)) # drop columns w/ too many nulls

df["restecg"] = df["restecg"].astype(pd.Int64Dtype())

# Encode feature cp using one-hot encoding
df = pd.concat([df, pd.get_dummies(df["cp"], prefix="cp", drop_first=True)], axis=1) # Don't drop first if using SVM
df = pd.concat([df, pd.get_dummies(df["restecg"], prefix="restecg", drop_first=True)], axis=1)


# Reorder and drop columns
df = df[["age", "sex", "cp_2", "cp_3", "cp_4", "trestbps", "chol", "fbs", "restecg_1", "restecg_2", "thalach", "exang", "oldpeak", "num"]]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 14 columns):
age          294 non-null int64
sex          294 non-null int64
cp_2         294 non-null uint8
cp_3         294 non-null uint8
cp_4         294 non-null uint8
trestbps     293 non-null float64
chol         271 non-null float64
fbs          286 non-null float64
restecg_1    294 non-null uint8
restecg_2    294 non-null uint8
thalach      293 non-null float64
exang        293 non-null float64
oldpeak      294 non-null float64
num          294 non-null int64
dtypes: float64(6), int64(3), uint8(5)
memory usage: 22.2 KB


In [30]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")

imputer.fit(df)

df = pd.DataFrame(imputer.transform(df), columns = df.columns)

df.head()

Unnamed: 0,age,sex,cp_2,cp_3,cp_4,trestbps,chol,fbs,restecg_1,restecg_2,thalach,exang,oldpeak,num
0,28.0,1.0,1.0,0.0,0.0,130.0,132.0,0.0,0.0,1.0,185.0,0.0,0.0,0.0
1,29.0,1.0,1.0,0.0,0.0,120.0,243.0,0.0,0.0,0.0,160.0,0.0,0.0,0.0
2,29.0,1.0,1.0,0.0,0.0,140.0,250.848708,0.0,0.0,0.0,170.0,0.0,0.0,0.0
3,30.0,0.0,0.0,0.0,0.0,170.0,237.0,0.0,1.0,0.0,170.0,0.0,0.0,0.0
4,31.0,0.0,1.0,0.0,0.0,100.0,219.0,0.0,1.0,0.0,150.0,0.0,0.0,0.0


In [31]:
from sklearn.model_selection import train_test_split

x = df.drop("num", axis=1)
y = df["num"]


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

print(len(x_train), 'train examples')
print(len(x_test), 'test examples')

235 train examples
59 test examples


In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [0]:
models = {
    "Logistic Regression": LogisticRegression(random_state = 0),
    "Decision Tree": DecisionTreeClassifier(max_depth=5),
    "Random Forest": RandomForestClassifier(n_estimators=10, max_depth=5),
    "XGBoost": XGBClassifier(),
    "SVM (linear)": SVC(kernel="linear", gamma=0.00001, C=100),
    "SVM (polynomial)": SVC(kernel="poly", degree=2, gamma=0.001, C=10),
    "SVM (rbf)": SVC(kernel="rbf", gamma=0.001, C=10),
    "Naive Bayes": GaussianNB(),
    "KNN (k=1)": KNeighborsClassifier(n_neighbors=1, weights="distance"),
    "KNN (k=3)": KNeighborsClassifier(n_neighbors=3, weights="distance"),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5, weights="distance"),
}

In [0]:
results = pd.DataFrame(columns=["Model", "True Positive", "False Negative", "False Positive", "True Positive", "Precision", "Recall", "Accuracy", "F1", "F2"])

In [0]:
from sklearn.metrics import confusion_matrix
from tqdm import tqdm

In [39]:
for name, model in tqdm(models.items()):
    
    model.fit(x_train, y_train)
    
    y_predict = model.predict(x_test)
    
    cm = confusion_matrix(y_predict, y_test)
    
    tn, fn, fp, tp = cm.ravel()
    
    acc = (tn+tp)/(tn+fn+fp+tp)
    prec = tp/(tp+fp)
    rec = tp/(tp+fn)
    f1 = 2*prec*rec/(prec+rec)
    f2 = 5*prec*rec/(4*prec+rec)
    
#    print("%20s\t[[%2d, %2d],[%2d, %2d]]\tacc: %6.2f%% prec: %6.2f%% rec: %6.2f%% f1: %3.2f f2: %3.2f"
#          %(name, tn, fn, fp, tp, 100*acc, 100*prec, 100*rec, f1, f2))
    
    results = results.append({
        "Model": name,
        "True Positive": tp,
        "False Negative": fn,
        "False Positive": fp,
        "True Positive": tp,
        "Precision": prec,
        "Recall": rec,
        "Accuracy": acc,
        "F1": f1,
        "F2": f2
    }, ignore_index=True)

100%|██████████| 11/11 [00:23<00:00,  2.16s/it]


In [49]:
results.sort_values("Recall", ascending=False, inplace=True)
results

Unnamed: 0,Model,True Positive,False Negative,False Positive,True Positive.1,Precision,Recall,Accuracy,F1,F2
7,Naive Bayes,16,2,11,16,0.592593,0.888889,0.779661,0.711111,0.808081
0,Logistic Regression,13,5,2,13,0.866667,0.722222,0.881356,0.787879,0.747126
5,SVM (polynomial),13,5,2,13,0.866667,0.722222,0.881356,0.787879,0.747126
2,Random Forest,12,6,0,12,1.0,0.666667,0.898305,0.8,0.714286
3,XGBoost,12,6,1,12,0.923077,0.666667,0.881356,0.774194,0.705882
4,SVM (linear),12,6,1,12,0.923077,0.666667,0.881356,0.774194,0.705882
1,Decision Tree,10,8,0,10,1.0,0.555556,0.864407,0.714286,0.609756
8,KNN (k=1),10,8,18,10,0.357143,0.555556,0.559322,0.434783,0.5
6,SVM (rbf),9,9,11,9,0.45,0.5,0.661017,0.473684,0.48913
10,KNN (k=5),8,10,8,8,0.5,0.444444,0.694915,0.470588,0.454545
