In [None]:
# Mount Google Drive so the dataset can be read from it
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [None]:
# Library yang akan dipakai
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Load the stroke dataset from the mounted Google Drive folder
DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/ML Project/healthcare-dataset-stroke-data.csv"
data = pd.read_csv(DATASET_PATH)

## Data Processing

In [None]:
# Show the number of samples and features, then preview the first rows
n_rows, n_cols = data.shape
print((n_rows, n_cols))
data.head(5)

(5110, 12)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [None]:
# Data cleaning: drop the identifier column, then keep only the rows that have
# a BMI value, a smoking status other than "Unknown" and a gender other than "Other".
data = data.drop(columns=['id'])
rows_to_keep = (
    data['bmi'].notna()
    & (data['smoking_status'] != "Unknown")
    & (data['gender'] != "Other")
)
data = data[rows_to_keep]

In [None]:
# Separate the target column from the predictors
TARGET_COLUMN = 'stroke'
y = data[TARGET_COLUMN]  # dependent variable (label)
x = data.drop(columns=[TARGET_COLUMN])  # independent variables (features)

In [None]:
# One-hot encode the categorical columns; every other column passes through unchanged.
# The columns are selected by name instead of by position (previously [0, 4, 5, 6, 9]),
# so the encoding does not silently shift if the column layout changes, and re-running
# this cell on the already-encoded array fails loudly instead of encoding it again.
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
oneHotEncoding = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_columns)],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense result so np.array(...) is a 2-D numeric matrix
)
x = np.array(oneHotEncoding.fit_transform(x))
print(x)

[[  0.     1.     0.   ...   1.   228.69  36.6 ]
 [  0.     1.     0.   ...   1.   105.92  32.5 ]
 [  1.     0.     0.   ...   0.   171.23  34.4 ]
 ...
 [  1.     0.     0.   ...   0.   125.2   40.  ]
 [  1.     0.     0.   ...   0.    82.99  30.6 ]
 [  0.     1.     0.   ...   0.   166.29  25.6 ]]


In [None]:
# Number of samples and features after encoding
np.shape(x)

(3425, 19)

In [None]:
# Split into training and test sets (80% / 20%).
# random_state makes the split reproducible across kernel restarts; stratify=y keeps
# the stroke / no-stroke ratio the same in both subsets, which matters because the
# positive class is rare in this dataset.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Standardize the features.
# The scaler is fitted on the training set only. The test set is transformed with the
# training mean and standard deviation, so no test-set statistics leak into the
# preprocessing and both sets end up on the same scale.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Training Model

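## Without StandardScaler()

> **Note:** when the notebook is run top to bottom, `x_train` and `x_test` have already been standardized by the scaler cell above, so the models in this section are in fact trained on scaled data. To compare against unscaled features, run this section before the scaler cell or keep an unscaled copy of the split.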

In [None]:
# SVC with its default kernel (RBF): fit on the training set, report test accuracy
svc_rbf = SVC().fit(x_train, y_train)
rbf_predictions = svc_rbf.predict(x_test)
print(accuracy_score(y_test, rbf_predictions))

0.9416058394160584


In [None]:
# SVC with a linear kernel: fit on the training set, report test accuracy
svc_linear = SVC(kernel="linear").fit(x_train, y_train)
linear_predictions = svc_linear.predict(x_test)
print(accuracy_score(y_test, linear_predictions))

0.9416058394160584


In [None]:
# SVC with a polynomial kernel, degree 1 through 6.
# The loop previously used range(6), i.e. degrees 0..5: degree 0 turns the kernel into
# a constant and degree 6 was never evaluated. The degree is printed next to each score
# so the output lines can be told apart.
for degree in range(1, 7):
  svm_poly = SVC(kernel = "poly", degree = degree)
  svm_poly.fit(x_train, y_train)
  print(f"degree={degree}: {accuracy_score(y_test, svm_poly.predict(x_test))}")

0.9416058394160584
0.9416058394160584
0.9416058394160584
0.9416058394160584
0.9416058394160584
0.9416058394160584


In [None]:
# 10-fold cross-validation accuracy of the linear-kernel SVC on the full data set
scores = cross_val_score(estimator=svc_linear, X=x, y=y, cv=10)
scores

array([0.94752187, 0.94752187, 0.94752187, 0.94752187, 0.94752187,
       0.94736842, 0.94736842, 0.94736842, 0.94736842, 0.94736842])

In [None]:
# Show per-class precision, recall and F1 of the linear SVC on the test set.
# zero_division=0 reports 0.0, without emitting a warning, for a class that receives
# no predicted samples; that situation is what triggered the repeated _warn_prf
# warnings under this cell. `metrics` is imported in the imports cell at the top.
print(metrics.classification_report(y_test, svc_linear.predict(x_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       645
           1       0.00      0.00      0.00        40

    accuracy                           0.94       685
   macro avg       0.47      0.50      0.48       685
weighted avg       0.89      0.94      0.91       685



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Using StandardScaler()

In [None]:
# RBF-kernel SVC trained on the standardized training set, scored on the test set
svc_rbf = SVC()
svc_rbf.fit(x_train, y_train)
rbf_test_accuracy = accuracy_score(y_test, svc_rbf.predict(x_test))
print(rbf_test_accuracy)

0.9416058394160584


In [None]:
# Linear-kernel SVC trained on the standardized training set, scored on the test set
svc_linear = SVC(kernel = "linear")
svc_linear.fit(x_train, y_train)
linear_test_accuracy = accuracy_score(y_test, svc_linear.predict(x_test))
print(linear_test_accuracy)

0.9416058394160584


In [None]:
# Polynomial-kernel SVC on the standardized split, degree 1 through 6.
# The loop previously used range(6), i.e. degrees 0..5: degree 0 turns the kernel into
# a constant and degree 6 was never evaluated. The degree is printed next to each score
# so the output lines can be told apart.
for degree in range(1, 7):
  svm_poly = SVC(kernel = "poly", degree = degree)
  svm_poly.fit(x_train, y_train)
  print(f"degree={degree}: {accuracy_score(y_test, svm_poly.predict(x_test))}")

0.9416058394160584
0.9416058394160584
0.9416058394160584
0.9416058394160584
0.9430656934306569
0.9401459854014599


In [None]:
# 10-fold cross-validation of a linear-kernel SVC with standardization.
# cross_val_score receives the full, unscaled x, so the scaler is wrapped in a pipeline
# with the classifier: it is re-fitted on the training part of every fold and never
# sees the held-out part.
scaled_svc_linear = make_pipeline(StandardScaler(), SVC(kernel = "linear"))
scores = cross_val_score(scaled_svc_linear, x, y, cv=10)
scores

array([0.94752187, 0.94752187, 0.94752187, 0.94752187, 0.94752187,
       0.94736842, 0.94736842, 0.94736842, 0.94736842, 0.94736842])

In [None]:
# Per-class precision, recall and F1 of the linear SVC on the test set.
# zero_division=0 reports 0.0, without emitting a warning, for a class that receives
# no predicted samples; that situation is what triggered the repeated _warn_prf
# warnings under this cell. `metrics` is imported in the imports cell at the top.
print(metrics.classification_report(y_test, svc_linear.predict(x_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       645
           1       0.00      0.00      0.00        40

    accuracy                           0.94       685
   macro avg       0.47      0.50      0.48       685
weighted avg       0.89      0.94      0.91       685



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## All Samples Using StandardScaler() (models fitted and evaluated on the full dataset)

In [None]:
# RBF-kernel SVC fitted and scored on the full data set.
# x holds the raw (unscaled) encoded features: only x_train / x_test were standardized
# earlier. The scaler is therefore placed in a pipeline in front of the classifier so
# the inputs are actually standardized, as this section's heading states.
svc_rbf = make_pipeline(StandardScaler(), SVC())
svc_rbf.fit(x, y)
print(accuracy_score(y, svc_rbf.predict(x)))

0.9474452554744526


In [None]:
# Linear-kernel SVC fitted and scored on the full data set.
# x holds the raw (unscaled) encoded features: only x_train / x_test were standardized
# earlier. The scaler is therefore placed in a pipeline in front of the classifier so
# the inputs are actually standardized, as this section's heading states.
svc_linear = make_pipeline(StandardScaler(), SVC(kernel = "linear"))
svc_linear.fit(x, y)
print(accuracy_score(y, svc_linear.predict(x)))

0.9474452554744526


In [None]:
# Polynomial-kernel SVC, degree 1 through 6, fitted and scored on the full data set.
# The loop previously used range(6), i.e. degrees 0..5: degree 0 turns the kernel into
# a constant and degree 6 was never evaluated. x is unscaled here, so the scaler is
# placed in a pipeline in front of the classifier.
for degree in range(1, 7):
  svm_poly = make_pipeline(StandardScaler(), SVC(kernel = "poly", degree = degree))
  svm_poly.fit(x, y)
  print(f"degree={degree}: {accuracy_score(y, svm_poly.predict(x))}")

0.9474452554744526
0.9474452554744526
0.9474452554744526
0.9474452554744526
0.9474452554744526
0.9474452554744526


In [None]:
# 10-fold cross-validation of a linear-kernel SVC with standardization on all samples.
# A fresh scaler + classifier pipeline is built here so the result does not depend on
# what svc_linear currently holds, and so the scaler is re-fitted on the training part
# of every fold only.
scaled_svc_linear = make_pipeline(StandardScaler(), SVC(kernel = "linear"))
scores = cross_val_score(scaled_svc_linear, x, y, cv=10)
scores

array([0.94752187, 0.94752187, 0.94752187, 0.94752187, 0.94752187,
       0.94736842, 0.94736842, 0.94736842, 0.94736842, 0.94736842])

In [None]:
# Per-class precision, recall and F1 of the linear SVC, evaluated on the same full
# data set it was fitted on.
# zero_division=0 reports 0.0, without emitting a warning, for a class that receives
# no predicted samples; that situation is what triggered the repeated _warn_prf
# warnings under this cell. `metrics` is imported in the imports cell at the top.
print(metrics.classification_report(y, svc_linear.predict(x), zero_division=0))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3245
           1       0.00      0.00      0.00       180

    accuracy                           0.95      3425
   macro avg       0.47      0.50      0.49      3425
weighted avg       0.90      0.95      0.92      3425



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Per-class precision, recall and F1 of the last polynomial SVC left by the degree loop,
# evaluated on the same full data set it was fitted on.
# zero_division=0 reports 0.0, without emitting a warning, for a class that receives
# no predicted samples; that situation is what triggered the repeated _warn_prf
# warnings under this cell.
print(metrics.classification_report(y, svm_poly.predict(x), zero_division=0))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3245
           1       0.00      0.00      0.00       180

    accuracy                           0.95      3425
   macro avg       0.47      0.50      0.49      3425
weighted avg       0.90      0.95      0.92      3425



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
