# Loading data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read heart.csv (resolved relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='heart.csv')

# Show the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


# Train-Test Split

In [3]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns.
y = df['target']
X = df.drop('target', axis=1)

# Hold out 20% of the rows for testing; the split is seeded and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
    stratify=y,
)

# Report the number of rows in each split.
print(X_train.shape[0], X_test.shape[0])

242 61


In [4]:
# Display (rows, columns) of the training predictors as the cell output.
X_train.shape

(242, 13)

## One-Hot Encoding

In [5]:
# Columns that will be one-hot encoded.
cat_columns = ['cp', 'exang', 'slope', 'thal']

# Every other training column is passed through unchanged, in its original order.
num_columns = list(filter(lambda name: name not in cat_columns, X_train.columns))

In [6]:
from sklearn.preprocessing import OneHotEncoder

In [7]:
# Create the encoder. With handle_unknown='ignore', a category level that was
# not seen during fit does not raise at transform time.
encoder = OneHotEncoder(handle_unknown = 'ignore')

# Learn the category levels from the TRAINING split only.
encoder.fit(X_train[cat_columns])

# Encode the training split (sparse result) and take a plain ndarray view of
# it; .toarray() avoids the discouraged np.matrix type returned by .todense().
X_train_cat_encoded = encoder.transform(X_train[cat_columns])
X_train_cat_dense = X_train_cat_encoded.toarray()

# get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; use whichever the installed version provides.
_feature_namer = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
column_names = _feature_namer(cat_columns)

print(X_train_cat_dense.shape)
print(column_names)

# Wrap the encoded values in a DataFrame that shares X_train's index so it can
# be concatenated column-wise with the untouched columns.
X_train_encoded_df = pd.DataFrame(X_train_cat_dense,
                                  columns = column_names,
                                  index = X_train.index)

(242, 13)
['cp_0' 'cp_1' 'cp_2' 'cp_3' 'exang_0' 'exang_1' 'slope_0' 'slope_1'
 'slope_2' 'thal_0' 'thal_1' 'thal_2' 'thal_3']


In [8]:
# Place the pass-through columns and the one-hot columns side by side
# (both frames carry X_train's index).
X_train_encoded = pd.concat(
    [
        X_train[num_columns],
        X_train_encoded_df,
    ],
    axis='columns',
)

In [9]:
# Do NOT re-fit the encoder here: it must keep the category levels learned
# from the training split so that the test columns line up with the training
# columns and no information from the test split leaks into preprocessing.

# Transform the test data with the train-fitted encoder.
X_test_cat_encoded = encoder.transform(X_test[cat_columns])
# .toarray() yields a plain ndarray (np.matrix from .todense() is discouraged).
X_test_cat_dense = X_test_cat_encoded.toarray()

# get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; use whichever the installed version provides.
_feature_namer = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
column_names = _feature_namer(cat_columns)

print(X_test_cat_dense.shape)
print(column_names)

# Wrap the encoded values in a DataFrame that shares X_test's index.
X_test_encoded_df = pd.DataFrame(X_test_cat_dense,
                                  columns = column_names,
                                  index = X_test.index)

# Place the pass-through columns and the one-hot columns side by side.
X_test_encoded = pd.concat([X_test[num_columns], X_test_encoded_df], axis = 1)

# The model is fitted on X_train_encoded, so the test frame must expose the
# same columns in the same order.
assert list(X_test_encoded.columns) == list(X_train_encoded.columns), \
    "train/test feature columns are misaligned"

(61, 13)
['cp_0' 'cp_1' 'cp_2' 'cp_3' 'exang_0' 'exang_1' 'slope_0' 'slope_1'
 'slope_2' 'thal_0' 'thal_1' 'thal_2' 'thal_3']


# Model

In [10]:
from sklearn.svm import SVC

# Baseline: an SVC with all-default hyper-parameters fitted on the encoded
# training data. The fitted estimator is the cell output.
model = SVC()
model.fit(X=X_train_encoded, y=y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# Grid Search

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Candidate hyper-parameters: two kernels crossed with two values of C.
grid_param = dict(kernel=('linear', 'rbf'), C=(1, 10))

# Rank candidates by micro-averaged F1.
scorer = make_scorer(f1_score, average='micro')

# Cross-validated search over the grid (default cv), fitted on the encoded
# training split. The fitted search object is the cell output.
clf = GridSearchCV(
    estimator=SVC(),
    param_grid=grid_param,
    scoring=scorer,
)
clf.fit(X=X_train_encoded, y=y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': (1, 10), 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(f1_score, average=micro), verbose=0)

In [12]:
# Show the best cross-validated score and the parameter combination that achieved it.
print(clf.best_score_, clf.best_params_)

0.8262755102040817 {'C': 1, 'kernel': 'linear'}


# Final Model

In [15]:
# Build the final model from the grid-search winner instead of hand-copying
# its parameters, so this cell cannot drift out of sync with the search
# results when the data or the grid changes.
model = SVC(random_state = 1, **clf.best_params_)

# Fit on the encoded training split. The fitted estimator is the cell output.
model.fit(X_train_encoded, y_train)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=1, shrinking=True, tol=0.001,
    verbose=False)

# Evaluation

In [17]:
from sklearn import metrics

# Predict labels for the held-out split with the final model.
y_pred = model.predict(X_test_encoded)

# Per-class precision / recall / F1 plus the averaged summaries.
report = metrics.classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.85      0.82      0.84        28
           1       0.85      0.88      0.87        33

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.85      0.85      0.85        61

