In [47]:
import numpy as np
import pandas as pd

In [48]:
# Read the stroke dataset from its CSV file into a DataFrame and preview the first rows.
# NOTE(review): the location is an absolute, machine-specific path.
csv_path = "/home/adarsh/Downloads/Machine learning/Project/input/brain_stroke.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [49]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


In [50]:
# Count the missing entries in every column.
# All counts are zero for this dataset, so no imputation or row dropping is needed.
data.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [51]:
#Data Preprocessing
# NOTE(review): the triple-quoted text below is a bare string expression, not a comment,
# so the notebook evaluates it and echoes its repr as this cell's output.
"""
Machine learning models do not tend to perform well on data that contains text as parameters. So it is better that we convert it into
numbers using encoding.

There are various methods to achieve this.
Label encoding
One Hot encoding. etc.,

We will use label encoder here to encode the textual data to numbers.
"""

'\nMachine learning models do not tend to perform well on data that contains text as parameters. So it is better that we convert it into\nnumbers using encoding.\n\nThere are various methods to achieve this.\nLabel encoding\nOne Hot encoding. etc.,\n\nWe will use label encoder here to encode the textual data to numbers.\n'

In [52]:
# Encode the text-valued columns ("gender", "ever_married", "work_type",
# "Residence_type", "smoking_status") as integers with scikit-learn's LabelEncoder.
#
# A separate encoder is fitted for every column and kept in `label_encoders`, so the
# text -> integer mapping of each column stays available (through `classes_` and
# `transform`) when new, raw inputs have to be encoded for prediction later on.
# Re-fitting one shared encoder object for all columns would overwrite its mapping
# each time and leave only the mapping of the last column.

from sklearn.preprocessing import LabelEncoder

categorical_columns = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]

label_encoders = {}
for column in categorical_columns:
    # fit_transform learns the categories of the column and encodes it in one step.
    labelencoder = LabelEncoder()
    data[column] = labelencoder.fit_transform(data[column])
    label_encoders[column] = labelencoder
# After the loop `labelencoder` is the encoder fitted on "smoking_status",
# which is the state the name had when a single encoder was reused.


In [53]:
# Confirm that the data is now numeric.
# info() prints the dtypes (the former object columns are int64 now). head() is placed
# last so that the notebook actually renders the preview: a bare expression that is
# not the final statement of a cell is evaluated and then silently discarded.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   int64  
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   int64  
 5   work_type          4981 non-null   int64  
 6   Residence_type     4981 non-null   int64  
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   int64  
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(8)
memory usage: 428.2 KB


In [54]:
# Separate the predictors from the "stroke" target and hold out 20% of the rows
# for testing. The fixed random_state keeps the split reproducible.

from sklearn.model_selection import train_test_split

X = data.drop(columns="stroke")
Y = data["stroke"]

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=10,
)

In [55]:
# Preview the training features; the former text columns now hold integer codes.

X_train.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
4414,0,46.0,0,0,1,1,0,106.47,27.2,0
2751,1,80.0,1,0,1,2,0,232.12,28.8,2
3236,0,37.0,0,0,1,1,0,110.28,22.3,2
2591,1,14.0,0,0,0,1,0,108.65,23.1,2
1404,1,71.0,0,0,1,1,1,108.43,32.8,3


In [56]:
# Feature scaling: bring all feature columns to a uniform range.
# The scaler is fitted on the training split only and then applied to both splits,
# so the test rows do not influence the fitted statistics.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [57]:
# First model: a single decision tree, trained on the scaled training features and
# scored by accuracy on the held-out test split.

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# random_state is fixed so that re-running the cell yields the same tree and the
# same score; an unseeded tree can differ from run to run.
DecisionTree = DecisionTreeClassifier(random_state=0)
DecisionTree.fit(X_train_scaled, Y_train)  # fit on the preprocessed training data
ycap = DecisionTree.predict(X_test_scaled)
accuracy = accuracy_score(Y_test, ycap)
print("The accuracy of the decision tree algorithm for the data is: {}".format(accuracy))

The accuracy of the decision tree algorithm for the data is: 0.9107321965897693


In [58]:
# Bagging ensemble: 100 decision trees, each fitted on a bootstrap sample sized at
# 80% of the training rows. The out-of-bag score printed at the end is computed
# from the rows the individual trees were not trained on.

from sklearn.ensemble import BaggingClassifier

bag_model = BaggingClassifier(
    # Passed positionally on purpose: the keyword for this argument was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was later
    # removed, whereas the first positional slot works with both spellings.
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,
    oob_score=True,
    random_state=0,
)
bag_model.fit(X_train_scaled, Y_train)
print(bag_model.oob_score_)

0.946285140562249


In [59]:
# k-nearest-neighbours classifier with default settings: fit on the scaled training
# features, predict the test split and report the accuracy.

from sklearn.neighbors import KNeighborsClassifier

KNN = KNeighborsClassifier().fit(X_train_scaled, Y_train)
ycap = KNN.predict(X_test_scaled)
accuracy = accuracy_score(y_true=Y_test, y_pred=ycap)
print("The accuracy of the KNN algorithm for the data is: {}".format(accuracy))

The accuracy of the KNN algorithm for the data is: 0.9518555667001003


In [60]:
# Random forest classifier: fit on the scaled training features, predict the test
# split and report the accuracy.

from sklearn.ensemble import RandomForestClassifier

# random_state is fixed because a random forest draws bootstrap samples and feature
# subsets at random; without a seed the reported accuracy changes between runs.
RF = RandomForestClassifier(random_state=0)

RF.fit(X_train_scaled, Y_train)
ycap = RF.predict(X_test_scaled)
accuracy = accuracy_score(Y_test, ycap)
print("The accuracy of the Random forest algorithm for the data is: {}".format(accuracy))

The accuracy of the Random forest algorithm for the data is: 0.9478435305917753


In [61]:
# Support vector classifier with default settings: fit on the scaled training
# features, predict the test split and report the accuracy.

from sklearn.svm import SVC

SVM = SVC().fit(X_train_scaled, Y_train)
ycap = SVM.predict(X_test_scaled)
accuracy = accuracy_score(y_true=Y_test, y_pred=ycap)
print("The accuracy of the SVM algorithm for the data is: {}".format(accuracy))

The accuracy of the SVM algorithm for the data is: 0.950852557673019


In [67]:
# Stack the four classifiers trained above: their predictions feed a
# logistic-regression meta-model, and the combined model is scored on the test split.
# This stacked model is the one that gets saved for the user interface.
# NOTE(review): accuracy is the only metric reported; if the stroke class is rare it
# says little about how well strokes are detected — confirm the class balance.

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

models = list(zip(
    ('DecisionTree', 'KNN', 'RF', 'SVM'),
    (DecisionTree, KNN, RF, SVM),
))

meta_learner = LogisticRegression()
stackmodel = StackingClassifier(estimators=models, final_estimator=meta_learner)
stackmodel.fit(X_train_scaled, Y_train)

ypred = stackmodel.predict(X_test_scaled)
accuracy = accuracy_score(y_true=Y_test, y_pred=ypred)
print("The accuracy of the stacked model for the data is: {}".format(accuracy))

The accuracy of the stacked model for the data is: 0.9498495486459378


In [64]:
# Persist the artefacts needed to reuse the stacked model outside this notebook.

import joblib
import os

model_dir = '/home/adarsh/Downloads/Machine learning/Project/models'
# Make sure the target directory exists before writing into it.
os.makedirs(model_dir, exist_ok=True)

# The stacked model was fitted on scaled features, so the fitted scaler is saved
# next to it: raw inputs must go through scaler.transform before predict.
# NOTE(review): the text columns were label-encoded before scaling — whatever
# consumes these files has to reproduce that encoding as well.
joblib.dump(scaler, os.path.join(model_dir, 'scaler.bin'))

# Kept as the final expression so the cell still displays the saved model path.
joblib.dump(stackmodel, os.path.join(model_dir, 'model.bin'))

['/home/adarsh/Downloads/Machine learning/Project/models/model.bin']