# Preliminaries

In [1]:
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report

In [8]:
data = pd.read_csv("/content/drive/MyDrive/SEM4/Machine Learning/Experiments/healthcare-dataset-stroke-data.csv")

print(data.shape)
data.head()

(5110, 12)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## Data Processing

In [9]:
data = data.drop(columns=['id'])
data = data[data['bmi'].notna()]
data = data[data.smoking_status != "Unknown"]
# data = data[data.gender != "Other"]

In [10]:
x = data.drop(columns=['stroke'])
y = data['stroke']

In [11]:
oneHotEncoding = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [0,4,5,6,9])], remainder='passthrough')
x = np.array(oneHotEncoding.fit_transform(x))
print(x)

[[  0.     1.     0.   ...   1.   228.69  36.6 ]
 [  0.     1.     0.   ...   1.   105.92  32.5 ]
 [  1.     0.     0.   ...   0.   171.23  34.4 ]
 ...
 [  1.     0.     0.   ...   0.   125.2   40.  ]
 [  1.     0.     0.   ...   0.    82.99  30.6 ]
 [  0.     1.     0.   ...   0.   166.29  25.6 ]]


In [12]:
x.shape

(3426, 20)

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

## Training SVM

## Radial basis function kernel

In [14]:
svc_rbf = SVC()
svc_rbf.fit(x_train, y_train)
print(accuracy_score(y_test, svc_rbf.predict(x_test)))

0.9533527696793003


In [19]:
y_pred_rbf = svc_rbf.predict(x_test)

In [21]:
print(classification_report(y_test, y_pred_rbf, zero_division=1))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       654
           1       1.00      0.00      0.00        32

    accuracy                           0.95       686
   macro avg       0.98      0.50      0.49       686
weighted avg       0.96      0.95      0.93       686



In [22]:
y_pred_rbf = svc_rbf.predict(x)

In [23]:
print(classification_report(y, y_pred_rbf, zero_division=1))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3246
           1       1.00      0.00      0.00       180

    accuracy                           0.95      3426
   macro avg       0.97      0.50      0.49      3426
weighted avg       0.95      0.95      0.92      3426



## Linear kernel

In [15]:
svc_linear = SVC(kernel = "linear")
svc_linear.fit(x_train, y_train)
print(accuracy_score(y_test, svc_linear.predict(x_test)))

0.9533527696793003


In [24]:
y_pred_linear = svc_linear.predict(x_test)

In [25]:
print(classification_report(y_test, y_pred_linear, zero_division=1))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       654
           1       1.00      0.00      0.00        32

    accuracy                           0.95       686
   macro avg       0.98      0.50      0.49       686
weighted avg       0.96      0.95      0.93       686



In [26]:
y_pred_linear = svc_linear.predict(x)

In [27]:
print(classification_report(y, y_pred_linear, zero_division=1))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3246
           1       1.00      0.00      0.00       180

    accuracy                           0.95      3426
   macro avg       0.97      0.50      0.49      3426
weighted avg       0.95      0.95      0.92      3426



## Poly kernel

In [16]:
for i in range(6):
  svm_poly = SVC(kernel = "poly", degree = i)
  svm_poly.fit(x_train, y_train)
  print(accuracy_score(y_test, svm_poly.predict(x_test)))

0.9533527696793003
0.9533527696793003
0.9533527696793003
0.9533527696793003
0.9533527696793003
0.9533527696793003


In [28]:
y_pred_poly = svm_poly.predict(x_test)

In [29]:
print(classification_report(y_test, y_pred_poly, zero_division=1))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       654
           1       1.00      0.00      0.00        32

    accuracy                           0.95       686
   macro avg       0.98      0.50      0.49       686
weighted avg       0.96      0.95      0.93       686



In [30]:
y_pred_poly = svm_poly.predict(x)

In [31]:
print(classification_report(y, y_pred_poly, zero_division=1))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3246
           1       1.00      0.00      0.00       180

    accuracy                           0.95      3426
   macro avg       0.97      0.50      0.49      3426
weighted avg       0.95      0.95      0.92      3426

