In [87]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [88]:
df = pd.read_csv("healthcare-dataset-stroke-data.csv")

#### PROBLEM STATEMENT
 
In this project, we are going to bulid a predictive model to predict whether a patient is likely to get stroke based on the input parameters provided.

# DATA DESCRIPTION

The dataset contains 5110 observations with 12 attributes. 

Target Variable : stroke

Predictor Variable : gender, age, hypertension, heart_disease, ever_married, work_type, Residence_type, avg_glucose_level, bmi, smoking_status.

Attribute Information:

1) id: unique identifier

2) gender: "Male", "Female" or "Other"

3) age: age of the patient

4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension

5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease

6) ever_married: "No" or "Yes"

7) work_type: "children", "Govt_jov", "Never_worked", "Private" or "Self-employed"

8) Residence_type: "Rural" or "Urban"

9) avg_glucose_level: average glucose level in blood

10) bmi: body mass index

11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"

12) stroke: 1 if the patient had a stroke or 0 if not

In [89]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [90]:
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [91]:
df.shape

(5110, 12)

In [92]:
for col in df.columns:
    print(df[col].value_counts())
    print("-"*25)

49152    1
21785    1
25982    1
5500     1
28027    1
        ..
45759    1
4795     1
47802    1
49849    1
16380    1
Name: id, Length: 5110, dtype: int64
-------------------------
Female    2994
Male      2115
Other        1
Name: gender, dtype: int64
-------------------------
78.00    102
57.00     95
52.00     90
54.00     87
51.00     86
        ... 
0.48       3
1.40       3
0.16       3
0.08       2
0.40       2
Name: age, Length: 104, dtype: int64
-------------------------
0    4612
1     498
Name: hypertension, dtype: int64
-------------------------
0    4834
1     276
Name: heart_disease, dtype: int64
-------------------------
Yes    3353
No     1757
Name: ever_married, dtype: int64
-------------------------
Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: work_type, dtype: int64
-------------------------
Urban    2596
Rural    2514
Name: Residence_type, dtype: int64
-------------------------
93.88     6
72.

#### DATA CLEANING

In [93]:
lcn.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [94]:
df.bmi = df.bmi.fillna(df.bmi.mean())

In [95]:
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [165]:
df.gender.replace({"Male":1, "Female":0, "Other":-1} ,inplace = True)
df.ever_married.replace({"No":0 , "Yes":1 }, inplace = True)
df.work_type.replace({"children":0, "Never_worked":1, "Self-employed":2 , "Private" :-1, "Govt_job":-2 }, inplace = True)
df.Residence_type.replace({"Urban":0, "Rural":1}, inplace = True)
df.smoking_status.replace({"Unknown":0, "never smoked":1, "formerly smoked":2, "smokes":-1}, inplace = True)

In [166]:
df.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,0,1,228.69,36.6,2,1
1,51676,0,61.0,0,0,1,1,0,202.21,28.893237,0,1
2,31112,1,80.0,0,1,1,0,0,105.92,32.5,0,1
3,60182,0,49.0,0,0,1,0,1,171.23,34.4,255,1
4,1665,0,79.0,1,0,1,1,0,174.12,24.0,0,1
5,56669,1,81.0,0,0,1,0,1,186.21,29.0,2,1
6,53882,1,74.0,1,1,1,0,0,70.09,27.4,0,1
7,10434,0,69.0,0,0,0,0,1,94.39,22.8,0,1
8,27419,0,59.0,0,0,1,0,0,76.15,28.893237,1,1
9,60491,0,78.0,0,0,1,0,1,58.57,24.2,1,1


#### DATA VISUALIZATION

In [167]:
gender  = df["gender"].tolist()
smokingStatus = df["smoking_status"].tolist()
workType = df["work_type"].tolist()
residenceType = df["Residence_type"].tolist()
stroke = df["stroke"]


Splitting the Data into Train, Test Datasets

In [168]:
df_x = df.iloc[: , 1:11]
df_y = df.iloc[: , -1]   

In [169]:
import sklearn
from sklearn.model_selection import train_test_split
df_x_train, df_x_test, df_y_train , df_y_test  =  train_test_split( df_x , df_y , test_size = .2 , random_state = 101)

In [170]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()

In [171]:
logmodel.fit(df_x_train, df_y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [173]:
pred_value = logmodel.predict(df_x_test)
pred_value

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [175]:
list(pred_value)
np.asarray(list(pred_value))

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

#### MODEL EVALUATION

Confusion matrix

In [179]:
from sklearn.metrics import confusion_matrix
tab1 = confusion_matrix(pred_value , df_y_test)
tab1

array([[966,  53],
       [  2,   1]], dtype=int64)

In [180]:
tab1.diagonal().sum() / tab1.sum() * 100

94.6183953033268

In [182]:
logmodel.predict_proba(df_x_test)

array([[0.98330605, 0.01669395],
       [0.98237917, 0.01762083],
       [0.77454284, 0.22545716],
       ...,
       [0.88144416, 0.11855584],
       [0.88165315, 0.11834685],
       [0.92367796, 0.07632204]])

In [183]:
pred_full_prob = logmodel.predict_proba(df_x)

In [190]:
pred_full_prob

array([[0.83316551, 0.16683449],
       [0.9270004 , 0.0729996 ],
       [0.84165052, 0.15834948],
       ...,
       [0.99035776, 0.00964224],
       [0.97332506, 0.02667494],
       [0.97372443, 0.02627557]])

In [184]:
pd.Series(pred_full_prob[:,1])

0       0.166834
1       0.073000
2       0.158349
3       0.036909
4       0.282481
          ...   
5105    0.235837
5106    0.127933
5107    0.009642
5108    0.026675
5109    0.026276
Length: 5110, dtype: float64

In [186]:
df1 = pd.concat([df.id, pd.Series(pred_full_prob[:,-1])], axis=1)

In [187]:
df1 = df1.rename(columns={df1.columns[1]:"Probs"})

In [188]:
df1.head()

Unnamed: 0,id,Probs
0,9046,0.166834
1,51676,0.073
2,31112,0.158349
3,60182,0.036909
4,1665,0.282481


In [189]:
df1.sort_values(['Probs'], ascending=False)

Unnamed: 0,id,Probs
337,56357,6.802943e-01
187,67895,6.265169e-01
3318,65955,5.703511e-01
215,68025,5.669340e-01
35,712,5.656374e-01
...,...,...
2801,760,1.273674e-03
3705,11658,1.086063e-03
4209,51856,9.738449e-04
2128,56420,4.917972e-04
