# **RAW DATA**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv('brain_stroke.csv')
data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
4976,Male,41.0,0,0,No,Private,Rural,70.15,29.8,formerly smoked,0
4977,Male,40.0,0,0,Yes,Private,Urban,191.15,31.1,smokes,0
4978,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,31.8,smokes,0
4979,Male,40.0,0,0,Yes,Private,Rural,83.94,30.0,smokes,0


In [3]:
data.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [4]:
data.iloc[0]

gender                          Male
age                             67.0
hypertension                       0
heart_disease                      1
ever_married                     Yes
work_type                    Private
Residence_type                 Urban
avg_glucose_level             228.69
bmi                             36.6
smoking_status       formerly smoked
stroke                             1
Name: 0, dtype: object

In [5]:
data.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0
mean,43.419859,0.096165,0.05521,105.943562,28.498173,0.049789
std,22.662755,0.294848,0.228412,45.075373,6.790464,0.217531
min,0.08,0.0,0.0,55.12,14.0,0.0
25%,25.0,0.0,0.0,77.23,23.7,0.0
50%,45.0,0.0,0.0,91.85,28.1,0.0
75%,61.0,0.0,0.0,113.86,32.6,0.0
max,82.0,1.0,1.0,271.74,48.9,1.0


In [6]:
data.shape

(4981, 11)

In [7]:
data.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [8]:
data.select_dtypes(include=np.number).columns.tolist()

['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke']

In [9]:
encoder = LabelEncoder()
data['gender'] = encoder.fit_transform(data['gender'])
gender = {index : label for index, label in enumerate(encoder.classes_)}
gender

{0: 'Female', 1: 'Male'}

In [10]:
data['ever_married'] = encoder.fit_transform(data['ever_married'])
ever_married = {index : label for index, label in enumerate(encoder.classes_)}
ever_married

{0: 'No', 1: 'Yes'}

In [11]:
data['work_type'] = encoder.fit_transform(data['work_type'])
work_type = {index : label for index, label in enumerate(encoder.classes_)}
work_type

{0: 'Govt_job', 1: 'Private', 2: 'Self-employed', 3: 'children'}

In [12]:
data['Residence_type'] = encoder.fit_transform(data['Residence_type'])
Residence_type = {index : label for index, label in enumerate(encoder.classes_)}
Residence_type

{0: 'Rural', 1: 'Urban'}

In [13]:
data['smoking_status'] = encoder.fit_transform(data['smoking_status'])
smoking_status = {index : label for index, label in enumerate(encoder.classes_)}
smoking_status

{0: 'Unknown', 1: 'formerly smoked', 2: 'never smoked', 3: 'smokes'}

In [14]:
Y =data['stroke']
X = data.drop(['stroke'],axis=1)

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2, random_state=42, shuffle = True, stratify = Y)

In [16]:
from sklearn.svm import SVC
svc_model = SVC(kernel='linear')
svc_model.fit(X_train, Y_train)

In [17]:
from sklearn.metrics import accuracy_score, confusion_matrix
predictions= svc_model .predict(X_train)
percentage=svc_model.score(X_train,Y_train)
res=confusion_matrix(Y_train,predictions)
print("Training confusion matrix")
print(res)
predictions= svc_model.predict(X_test)
percentage=svc_model.score(X_test,Y_test)
res=confusion_matrix(Y_test,predictions)
print("validation confusion matrix")
print(res)
# check the accuracy on the training set
print('training accuracy = '+str(svc_model.score(X_train, Y_train)*100))
print('testing accuracy = '+str(svc_model.score(X_test, Y_test)*100))

Training confusion matrix
[[3786    0]
 [ 198    0]]
validation confusion matrix
[[947   0]
 [ 50   0]]
training accuracy = 95.03012048192771
testing accuracy = 94.98495486459379


In [18]:
features = np.array([[0, 11, 1, 1, 1, 2, 0, 70, 29, 3]])
svc_model.predict(features)



array([0], dtype=int64)

In [19]:
if svc_model.predict(features) == 0:
    print("Anda diprediksi bebas Stroke Otak")
else:
    print("Anda terindikasi Stroke Otak")


Anda diprediksi bebas Stroke Otak




In [20]:
print(classification_report(Y_test, svc_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       947
           1       0.00      0.00      0.00        50

    accuracy                           0.95       997
   macro avg       0.47      0.50      0.49       997
weighted avg       0.90      0.95      0.93       997



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# **TRANSFORMATION & BALANCING DATA**

In [21]:
# for column in data.columns:
#     data[['age', 'avg_glucose_level', 'bmi']] = (data[['age', 'avg_glucose_level', 'bmi']] - data[['age', 'avg_glucose_level', 'bmi']].min()) / (data[['age', 'avg_glucose_level', 'bmi']].max() - data[['age', 'avg_glucose_level', 'bmi']].min())

In [21]:
data['age'] = np.log2(data['age'])
data['avg_glucose_level'] = np.log2(data['avg_glucose_level'])
data['bmi'] = np.log2(data['bmi'])

In [22]:
data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,6.066089,0,1,1,1,1,7.837249,5.193772,1,1
1,1,6.321928,0,1,1,1,0,6.726831,5.022368,2,1
2,0,5.614710,0,0,1,1,1,7.419792,5.104337,3,1
3,0,6.303781,1,0,1,2,0,7.443938,4.584963,2,1
4,1,6.339850,0,0,1,1,1,7.540787,4.857981,1,1
...,...,...,...,...,...,...,...,...,...,...,...
4976,1,5.357552,0,0,0,1,0,6.132371,4.897240,1,0
4977,1,5.321928,0,0,1,1,1,7.578561,4.958843,3,0
4978,0,5.491853,1,0,1,0,0,6.570159,4.990955,3,0
4979,1,5.321928,0,0,1,1,0,6.391287,4.906891,3,0


In [23]:
data.dropna()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,6.066089,0,1,1,1,1,7.837249,5.193772,1,1
1,1,6.321928,0,1,1,1,0,6.726831,5.022368,2,1
2,0,5.614710,0,0,1,1,1,7.419792,5.104337,3,1
3,0,6.303781,1,0,1,2,0,7.443938,4.584963,2,1
4,1,6.339850,0,0,1,1,1,7.540787,4.857981,1,1
...,...,...,...,...,...,...,...,...,...,...,...
4976,1,5.357552,0,0,0,1,0,6.132371,4.897240,1,0
4977,1,5.321928,0,0,1,1,1,7.578561,4.958843,3,0
4978,0,5.491853,1,0,1,0,0,6.570159,4.990955,3,0
4979,1,5.321928,0,0,1,1,0,6.391287,4.906891,3,0


In [24]:
print(data['stroke'].value_counts())
df_class_0 = data[data['stroke'] == 0]
df_class_1 = data[data['stroke'] == 1]

0    4733
1     248
Name: stroke, dtype: int64


In [25]:
from sklearn.utils import resample

In [26]:
df_majority = data[data.stroke==0]
df_minority = data[data.stroke==1]

In [27]:
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=245,    # to match minority class
                                 random_state=1)     # reproducible results

In [28]:
data_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [29]:
from sklearn.utils import shuffle
data_downsampled = shuffle(data_downsampled)

In [30]:
data_downsampled

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
144,1,6.285402,1,0,1,2,0,5.810186,4.672425,1,1
94,0,6.087463,0,0,0,0,1,6.359310,4.760221,0,1
181,1,6.339850,1,1,1,1,1,7.970911,4.812498,3,1
2272,1,6.357552,0,0,1,1,1,7.744296,5.083213,1,0
580,0,6.321928,0,1,1,2,1,6.305423,4.510962,2,0
...,...,...,...,...,...,...,...,...,...,...,...
3921,1,5.584963,0,0,1,1,1,6.301039,5.432959,2,0
2227,1,5.727920,0,0,1,0,0,6.426265,4.590961,2,0
46,1,6.189825,1,0,1,2,1,7.607256,5.035624,2,1
66,0,6.303781,0,0,0,1,0,6.474436,4.517276,2,1


In [32]:
#data_downsampled.to_csv('datastrokeclean.csv', index=False)

In [31]:
Y =data_downsampled['stroke']
X = data_downsampled.drop(['stroke'],axis=1)

In [32]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2, random_state=0, shuffle = True, stratify = Y)

In [33]:
from sklearn.svm import SVC
svc_model = SVC(kernel='linear',gamma=8)
svc_model.fit(X_train, Y_train)

In [34]:
from sklearn.metrics import accuracy_score, confusion_matrix
predictions= svc_model .predict(X_train)
percentage=svc_model.score(X_train,Y_train)
res=confusion_matrix(Y_train,predictions)
print("Training confusion matrix")
print(res)
predictions= svc_model.predict(X_test)
percentage=svc_model.score(X_test,Y_test)
res=confusion_matrix(Y_test,predictions)
print("validation confusion matrix")

print(res)
# check the accuracy on the training set
print('training accuracy = '+str(svc_model.score(X_train, Y_train)*100))
print('testing accuracy = '+str(svc_model.score(X_test, Y_test)*100))

Training confusion matrix
[[145  51]
 [ 29 169]]
validation confusion matrix
[[30 19]
 [ 3 47]]
training accuracy = 79.69543147208121
testing accuracy = 77.77777777777779


In [37]:
features = np.array([[0, np.log2(62), 0, 1, 0, 3, 1, np.log2(160), np.log2(53), 0]])
svc_model.predict(features)



array([1], dtype=int64)

In [38]:
if svc_model.predict(features) == 0:
    print("Anda diprediksi bebas Stroke")
else:
    print("Anda terindikasi Stroke")

Anda terindikasi Stroke




In [39]:
print(classification_report(Y_test, svc_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.91      0.61      0.73        49
           1       0.71      0.94      0.81        50

    accuracy                           0.78        99
   macro avg       0.81      0.78      0.77        99
weighted avg       0.81      0.78      0.77        99

