In [96]:
import pandas as pd
import pickle
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [97]:
df = pd.read_csv("Resultant_CSV.csv")

# 1. Selecting Important Features


In [98]:
#Selecting top 6 Important Features using chi2 score

X = df.iloc[:,0:11]
Y = df.iloc[:,-1]

bestfeatures = SelectKBest(score_func=chi2, k=6)
fit = bestfeatures.fit(X,Y)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Columns','Chi-Score']  
print(featureScores)

X_new = fit.fit_transform(X,Y)
cols=featureScores[featureScores['Chi-Score']>10]['Columns']

X_new = pd.DataFrame(X_new,columns=cols)
print(X_new)


              Columns    Chi-Score
0                  id     0.816746
1              gender     0.007337
2                 age  1896.196192
3        hypertension    51.933938
4       heart_disease    33.957709
5        ever_married    14.752847
6           work_type     1.408445
7      Residence_type     0.221088
8   avg_glucose_level    13.515120
9                 bmi   118.055292
10     smoking_status     1.686465
Columns  age  hypertension  heart_disease  ever_married  avg_glucose_level  \
0        101             0              1             1               2444   
1         95             1              1             1                579   
2         90             0              0             0               1919   
3         80             0              0             1                898   
4         99             0              0             1                123   
...      ...           ...            ...           ...                ...   
4385     101             1        

# 1. Splitting Data into Training and Testing Dataset


In [99]:
#Splitting Data into Training and Testing Dataset

train_x,test_x,train_y,test_y  = train_test_split(df[cols],df['stroke'],random_state=1200,test_size=0.25)

train_x.shape,test_x.shape,train_y.shape,test_y.shape


((3292, 6), (1098, 6), (3292,), (1098,))

# 2. Training the classifier model on training dataset


In [100]:
#The model used here is Decision Tree for classification
#Training Phase

model = DecisionTreeClassifier(criterion='entropy')
model.fit(train_x,train_y)

# 3. Testing the classifier model on testing dataset


In [101]:
#Model predicting on Test Dataset
#Testing Phase

pred_y=model.predict(test_x)
pred_y


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# 4. Evaluating the model on various metrics


In [102]:
#Calculating Accuracy of the model

accuracy=accuracy_score(pred_y,test_y)*100
print("Accuracy of the model is {:.2f}".format(accuracy))

Accuracy of the model is 92.26


In [103]:
#Confusion Matrix of the model

confusion_matrix(pred_y,test_y)

array([[1003,   43],
       [  42,   10]], dtype=int64)

In [104]:
#All classes are treated equally to evaluate the overall performance of the classifier. Hence used Macro Average Precision

precision_score(test_y, pred_y, average='macro')

0.5755993528460068

In [105]:
f1_score(test_y, pred_y, average='macro')

0.5749128919860627

# 1. Hyperparameter Tuning


In [106]:
#Creating a model on criterion based on Gini Impurity and random state as 6.

model = DecisionTreeClassifier(criterion='gini', random_state=6)
model.fit(train_x,train_y)

In [107]:
pred_y=model.predict(test_x)

In [108]:
#After setting these hyperparameters, the Accuracy of the model has increased from 92.26 to 93.08

accuracy=accuracy_score(pred_y,test_y)*100
print("Accuracy of the model is {:.2f}".format(accuracy))

Accuracy of the model is 93.08


# 2. Saving Model to a pickle file


In [109]:
pickle.dump(model, open('classifier_model.pkl', 'wb'))

# 3. Predict values for validate set using pickle file


In [110]:
# the trained model from the pickle file
with open('classifier_model.pkl', 'rb') as file:
    model = pickle.load(file)

X_validate = df.drop(columns=['stroke'])  # Assuming 'stroke' is the target column and should be excluded

X_validate = X_validate[model.feature_names_in_]

# Make predictions on the validation data
predictions = model.predict(X_validate)

y_true = df['stroke']

y_true

0       1
1       1
2       1
3       1
4       1
       ..
4385    0
4386    0
4387    0
4388    0
4389    0
Name: stroke, Length: 4390, dtype: int64

# 4. Write validate set along with it's predicted values to csv file.


In [111]:
# The trained model from the pickle file
with open('classifier_model.pkl', 'rb') as file:
    model = pickle.load(file)

X_validate = df.drop(columns=['stroke'])  # Assuming 'stroke' is the target column and should be excluded

X_validate = X_validate[model.feature_names_in_]

predictions = model.predict(X_validate)

y_true = df['stroke']

# Add the predicted values as a new column to the validation DataFrame
df['predicted_stroke'] = predictions

# Write the validation set along with the predicted values to a CSV file
df.to_csv('validation_with_predictions.csv', index=False)

df


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,predicted_stroke
0,1841,1,101,0,1,1,2,0,2444,197,2,1,0
1,3221,1,95,1,1,1,2,0,579,146,2,1,0
2,638,0,90,0,0,0,2,1,1919,100,2,1,1
3,1618,0,80,0,0,1,2,0,898,153,0,1,1
4,3640,0,99,0,0,1,2,1,123,114,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4385,1122,0,101,1,0,1,2,1,1338,153,2,0,0
4386,2655,0,102,0,0,1,3,1,2982,272,2,0,0
4387,1193,0,56,0,0,1,3,0,1294,178,2,0,0
4388,2226,1,72,0,0,1,2,0,3309,128,1,0,0
