In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler

#getting the data set
data=pd.read_csv('brain_stroke.csv',index_col=0)
data.head(-5)
#print(data)

#checking the missing rows
#print(data.isna().sum())
#checking the type of each rows
#data.info()



# Importing LabelEncoder
from sklearn.preprocessing import LabelEncoder
# Instantiating LabelEncoder
le=LabelEncoder()
# Iterating over all the values of each column and extract their dtypes
for col in data.columns.to_numpy():
    # Comparing if the dtype is object
    if data[col].dtypes in ('object','category'):
    # Using LabelEncoder to do the numeric transformation
        data[col]=le.fit_transform(data[col].astype(str))

#establishing the training set and the test set
x=data.drop(["stroke"],axis=1)
y=data["stroke"]
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42,stratify=y)
x_train.shape, x_test.shape

# Normalize feature data
scaler = MinMaxScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Saving the feature names 
features = x.columns.tolist()

import time
startTime = time.time()
##Decision tree
dt=DecisionTreeClassifier(max_depth=5,random_state=42,criterion="gini",splitter="best",min_samples_split=10,max_leaf_nodes=15)
dt.fit(x_train,y_train)


# Extracting the importances by sklearn 
importances_sk = dt.feature_importances_
feature_importance_sk = {}
for i, feature in enumerate(features):
    feature_importance_sk[feature] = round(importances_sk[i], 3)
    
print(f"Feature importance by sklearn: {feature_importance_sk}")



#Evaluate the accuracy of the model
y_pred = dt.predict(x_test)
predictions = metrics.accuracy_score(y_test, y_pred)
#Calculating the accuracy in percentage
print('The accuracy is: ', predictions * 100, '%')
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

Feature importance by sklearn: {'age': 0.538, 'hypertension': 0.0, 'heart_disease': 0.075, 'ever_married': 0.022, 'work_type': 0.0, 'Residence_type': 0.0, 'avg_glucose_level': 0.164, 'bmi': 0.172, 'smoking_status': 0.03}
The accuracy is:  94.24749163879599 %
Execution time in seconds: 0.01895594596862793


In [10]:
#creating reduced vectors
x=data.drop(columns=['stroke', 'hypertension','work_type','Residence_type','ever_married','smoking_status'])
y=data["stroke"]
print(x)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42,stratify=y)
x_train.shape, x_test.shape

         age  heart_disease  avg_glucose_level   bmi
gender                                              
Male    67.0              1             228.69  36.6
Male    80.0              1             105.92  32.5
Female  49.0              0             171.23  34.4
Female  79.0              0             174.12  24.0
Male    81.0              0             186.21  29.0
...      ...            ...                ...   ...
Male    41.0              0              70.15  29.8
Male    40.0              0             191.15  31.1
Female  45.0              0              95.02  31.8
Male    40.0              0              83.94  30.0
Female  80.0              0              83.75  29.1

[4981 rows x 4 columns]


((3486, 4), (1495, 4))

In [14]:
import time
startTime = time.time()
##Decision tree
dt=DecisionTreeClassifier(max_depth=5,random_state=42,criterion="gini",splitter="best",min_samples_split=10,max_leaf_nodes=15)
dt.fit(x_train,y_train)

#Evaluate the accuracy of the model
y_pred = dt.predict(x_test)
predictions = metrics.accuracy_score(y_test, y_pred)
#Calculating the accuracy in percentage
print('The accuracy is: ', predictions * 100, '%')
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

The accuracy is:  94.11371237458194 %
Execution time in seconds: 0.015624761581420898
