**Notes about SVM Hyperparameters**

High gamma means high bias and low variance
Low gamma means low bias and high variance

High C means high variance and low bias

Bias is underfitting and variance is overfitting

In [0]:
# Upload file
import os

if not os.path.exists("heart-attack-prediction.csv"):
    !rm -rf sample_data
    from google.colab import files
    files.upload()
    !xz -d *.xz
    print("Uploaded files:")
    !ls
else:
    print("Already have files:")
    !ls

Already have files:
heart-attack-prediction.csv


In [0]:
# Prepare dataframe
import pandas as pd
import numpy as np

df = (pd.read_csv("heart-attack-prediction.csv")
        .replace("?", np.nan) # change ? to nan
        
        .drop(["ca", "thal", "slope"], axis=1)) # drop columns w/ too many nulls

# Encode feature cp using one-hot encoding
df = pd.concat([df, pd.get_dummies(df["cp"], prefix="cp")], axis=1) # Don't drop first if using SVM
df = pd.concat([df, pd.get_dummies(df["restecg"], prefix="restecg")], axis=1)

# Reorder and drop columns
df = df[["age", "sex", "cp_1", "cp_2", "cp_3", "cp_4", "trestbps", "chol", "fbs", "restecg_0", "restecg_1", "restecg_2", "thalach", "exang", "oldpeak", "num"]]

In [0]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(max_iter=10, random_state=0)

imp.fit(df);

col = df.columns

df = pd.DataFrame(imp.transform(df))

df.columns = col

# for column in df.columns:
#     print (column, df[column].unique(), "\n")
    
df.head()

Unnamed: 0,age,sex,cp_1,cp_2,cp_3,cp_4,trestbps,chol,fbs,restecg_0,restecg_1,restecg_2,thalach,exang,oldpeak,num
0,28.0,1.0,0.0,1.0,0.0,0.0,130.0,132.0,0.0,0.0,0.0,1.0,185.0,0.0,0.0,0.0
1,29.0,1.0,0.0,1.0,0.0,0.0,120.0,243.0,0.0,1.0,0.0,0.0,160.0,0.0,0.0,0.0
2,29.0,1.0,0.0,1.0,0.0,0.0,140.0,243.837881,0.0,1.0,0.0,0.0,170.0,0.0,0.0,0.0
3,30.0,0.0,1.0,0.0,0.0,0.0,170.0,237.0,0.0,0.0,1.0,0.0,170.0,0.0,0.0,0.0
4,31.0,0.0,0.0,1.0,0.0,0.0,100.0,219.0,0.0,0.0,1.0,0.0,150.0,0.0,0.0,0.0


In [0]:
from sklearn.model_selection import train_test_split

x = df.drop("num", axis=1)
y = df["num"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

In [0]:
from sklearn.svm import SVC
svm = SVC(kernel="linear", gamma=0.00001, C=100)

svm.fit(x_train, y_train)

y_pred = svm.predict(x_test)

In [0]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_pred, y_test)
cm

array([[ 9,  5],
       [12,  4]])

In [0]:
acc = (cm[0][0]+cm[1][1])/(cm.sum().sum())
prec = (cm[1][1])/(cm[1][0]+cm[1][1])
rec = (cm[1][1])/(cm[0][1]+cm[1][1])
f1 = 2*prec*rec/(prec+rec)
f2 = 5*prec*rec/(4*prec+rec)

metrics = pd.DataFrame({"Metric": ["f1", "f2", "precision", "recall", "accuracy"], "Score": [f1, f2, prec, rec, acc]})
metrics

Unnamed: 0,Metric,Score
0,f1,0.32
1,f2,0.384615
2,precision,0.25
3,recall,0.444444
4,accuracy,0.433333
