In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder

In [2]:
path = 'data/healthcare-dataset-stroke-data.csv'

dataframe = pd.read_csv(path)
dataframe.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


### FP-Growth algoritm 

I'm going to use thia algorithm to find frequent patterns from the data set that can help to identify patternas that characterize the stroke. 
This algoritm work in the next way:
1. List the items of the data set
2. Scan the whole data set, count each item and remove infrequent items.
3. Group the items in order of their frequency from bigger to smaller , and create a tree where each path represents the connection between the items and common combinations.


This Algorithm is more efficient then Apriori algorithm, due to it computation cost. Apriori algorithm each time create the new group of of k items that connect and based on this groups is scan the data set one more time to create thegroups of k+1 items.(Candidate generation and than filter them by support rate).

FP-Growth Algorithm works with categorical values, and can't handle numerical ,in our data set we have the numerical values like: age, glucose level ,hypertension, bmi, stroke . 

For the items like age,glucose level and bmi i'll use the binning  and discretization techniques to convert them to categorical values.

For hypertension ,stroke and heart deciase I'll use discritization.

### PreProcessing Data

In [3]:
cp_dataframe = dataframe.copy()

In [4]:
cp_dataframe = cp_dataframe.drop(axis=1,columns="id")

For this type of algorithm, I will apply binning to the numerical values. The maximum age in the dataset is 82, though it could potentially go up to 100. Therefore, I’ve defined the age ranges as 0–29, 30–49, and 50–82.

In [5]:
bins = [0, 30, 50, 82]
labels = [ 'young adult', 'middle age', 'senior']

cp_dataframe['age_group'] = pd.cut(cp_dataframe['age'],bins = bins,labels = labels,right=True)

In [6]:
cp_dataframe

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_group
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,senior
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1,senior
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,senior
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,middle age
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,senior
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0,senior
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0,senior
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0,middle age
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0,senior


From the dataframe we can observe couple of parameters that also have numerical values is glucose level and bmi. For them will be applied binning , for bmi is ranges of 10

In [70]:
print(max(cp_dataframe['bmi']),min(cp_dataframe['bmi']),max(cp_dataframe['avg_glucose_level']),min(cp_dataframe['avg_glucose_level']))

97.6 10.3 271.74 55.12


The maximum and minimum value for bmi is 10.3 and 97.6.We can create the range by 30.

**BMI Categories:**
  
Underweight is < 18.5  
Normal weight is 18.5 - 24.5  
Overweight is 25 - 29.9  
Obesity (Class I) is 30 - 34.9  
Obesity (Class II) is 35 - 39.9  
Obesity (class III) is 40+  

**Average Glucose Levels:**

Adults and Childern 13+ normal is 70-140 mg/dl  
Children under 13 normal is 70-180 mg/dl


In [7]:
bmi_mean_by_age = cp_dataframe.groupby('age_group')['bmi'].mean().to_dict()

  bmi_mean_by_age = cp_dataframe.groupby('age_group')['bmi'].mean().to_dict()


Create the bins for glucose level and bmi. For glucose levele i have bins of the size 20, and for bmi 10.

In [8]:
for key,val in bmi_mean_by_age.items():
    mask = (cp_dataframe['age_group'] == key) & (cp_dataframe['bmi'].isna())
    cp_dataframe.loc[mask,'bmi'] = val
    
cp_dataframe['bmi'] = cp_dataframe['bmi'].round(2)

#discretization and smoothing for bmi
# Define bin edges from 10 to 100 with a step of 10
bin_edges_bmi = np.arange(10, 100 + 10, 10)  # [10, 20, 30, ..., 100]

bin_edges_glucose = np.arange(55,275+90,90)
# Create BMI bins
cp_dataframe['bmi_bin'] = pd.cut(cp_dataframe['bmi'], bins=bin_edges_bmi, include_lowest=True)
#Create Glucose Bins 
cp_dataframe['avg_glucose_bin']= pd.cut(cp_dataframe['avg_glucose_level'], bins=bin_edges_glucose, include_lowest=True)




In [9]:
cp_dataframe

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_group,bmi_bin,avg_glucose_bin
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.60,formerly smoked,1,senior,"(30.0, 40.0]","(145.0, 235.0]"
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,30.67,never smoked,1,senior,"(30.0, 40.0]","(145.0, 235.0]"
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.50,never smoked,1,senior,"(30.0, 40.0]","(54.999, 145.0]"
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.40,smokes,1,middle age,"(30.0, 40.0]","(145.0, 235.0]"
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.00,never smoked,1,senior,"(20.0, 30.0]","(145.0, 235.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,30.67,never smoked,0,senior,"(30.0, 40.0]","(54.999, 145.0]"
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.00,never smoked,0,senior,"(30.0, 40.0]","(54.999, 145.0]"
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.60,never smoked,0,middle age,"(30.0, 40.0]","(54.999, 145.0]"
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.60,formerly smoked,0,senior,"(20.0, 30.0]","(145.0, 235.0]"


In [10]:
cp_dataframe = cp_dataframe.drop(axis =1, columns=['avg_glucose_level','age','bmi'])

In [11]:
categorical_columns = cp_dataframe.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_columns

['gender',
 'ever_married',
 'work_type',
 'Residence_type',
 'smoking_status',
 'age_group',
 'bmi_bin',
 'avg_glucose_bin']

In [121]:
encoder =OrdinalEncoder()

dataframe_cp = cp_dataframe.copy()
dataframe_cp[categorical_columns] = encoder.fit_transform(cp_dataframe[categorical_columns])

In [122]:
data_no_stroke = dataframe_cp.drop(columns='stroke')
stroke_column  = dataframe_cp['stroke']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(data_no_stroke, stroke_column)

In [123]:
X_resampled[categorical_columns] = encoder.inverse_transform(X_resampled[categorical_columns])

In [124]:
data = X_resampled.join(y_resampled,how='inner')
data['bmi_bin'] = data['bmi_bin'].astype('category')
data['avg_glucose_bin'] = data['avg_glucose_bin'].astype('category')

In [125]:
#lets one-hot encode the data set 
df_encoded = pd.get_dummies(data)

# Step 3: Convert to True/False format (optional, but cleaner for FP-Growth)
df_encoded = df_encoded.astype(bool)

In [126]:
df_encoded

Unnamed: 0,hypertension,heart_disease,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,...,"bmi_bin_(20.0, 30.0]","bmi_bin_(30.0, 40.0]","bmi_bin_(40.0, 50.0]","bmi_bin_(50.0, 60.0]","bmi_bin_(60.0, 70.0]","bmi_bin_(70.0, 80.0]","bmi_bin_(90.0, 100.0]","avg_glucose_bin_(54.999, 145.0]","avg_glucose_bin_(145.0, 235.0]","avg_glucose_bin_(235.0, 325.0]"
0,False,True,True,False,True,False,False,True,False,False,...,False,True,False,False,False,False,False,False,True,False
1,False,False,True,True,False,False,False,True,False,False,...,False,True,False,False,False,False,False,False,True,False
2,False,True,True,False,True,False,False,True,False,False,...,False,True,False,False,False,False,False,True,False,False
3,False,False,True,True,False,False,False,True,False,False,...,False,True,False,False,False,False,False,False,True,False
4,True,False,True,True,False,False,False,True,False,False,...,True,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9717,True,False,True,True,False,False,False,True,False,False,...,True,False,False,False,False,False,False,True,False,False
9718,False,False,True,True,False,False,False,True,True,False,...,True,False,False,False,False,False,False,False,True,False
9719,False,False,True,False,True,False,False,True,False,False,...,False,True,False,False,False,False,False,False,False,True
9720,False,False,True,True,False,False,False,True,False,False,...,True,False,False,False,False,False,False,True,False,False


In [14]:
#lets use the FP-Growth algorithm to check frequent patterns can we observe from the data and them retreive the association rules 
from mlxtend.frequent_patterns import fpgrowth, association_rules

In [15]:
frequent_itemsets = fpgrowth(df_encoded, min_support=0.4, use_colnames=True,verbose=1)


NameError: name 'df_encoded' is not defined

In [None]:
frequent_itemsets 

Unnamed: 0,support,itemsets
0,0.75756,(ever_married_Yes)
1,0.649969,(age_group_senior)
2,0.604814,(work_type_Private)
3,0.5,(stroke)
4,0.48776,(Residence_type_Urban)
5,0.618186,(gender_Female)
6,0.51224,(Residence_type_Rural)
7,0.790064,"(avg_glucose_bin_(54.999, 145.0])"
8,0.509977,"(bmi_bin_(20.0, 30.0])"
9,0.569327,"(ever_married_Yes, avg_glucose_bin_(54.999, 14..."


In [129]:
frequent_itemsets['itemsets']

0                                    (ever_married_Yes)
1                                    (age_group_senior)
2                                   (work_type_Private)
3                                              (stroke)
4                                (Residence_type_Urban)
5                                       (gender_Female)
6                                (Residence_type_Rural)
7                     (avg_glucose_bin_(54.999, 145.0])
8                                (bmi_bin_(20.0, 30.0])
9     (ever_married_Yes, avg_glucose_bin_(54.999, 14...
10                 (ever_married_Yes, age_group_senior)
11    (age_group_senior, avg_glucose_bin_(54.999, 14...
12    (ever_married_Yes, age_group_senior, avg_gluco...
13                (work_type_Private, ever_married_Yes)
14    (work_type_Private, avg_glucose_bin_(54.999, 1...
15                           (stroke, age_group_senior)
16                           (stroke, ever_married_Yes)
17                    (ever_married_Yes, gender_

In [130]:
# Step 5: Generate rules
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.6)

In [131]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(ever_married_Yes),"(avg_glucose_bin_(54.999, 145.0])",0.75756,0.790064,0.569327,0.751527,0.951224,1.0,-0.029194,0.844907,-0.17458,0.581958,-0.183562,0.736068
1,"(avg_glucose_bin_(54.999, 145.0])",(ever_married_Yes),0.790064,0.75756,0.569327,0.720609,0.951224,1.0,-0.029194,0.867745,-0.196304,0.581958,-0.152413,0.736068
2,(ever_married_Yes),(age_group_senior),0.75756,0.649969,0.57807,0.763069,1.174007,1.0,0.08568,1.477351,0.611354,0.696925,0.323113,0.826225
3,(age_group_senior),(ever_married_Yes),0.649969,0.75756,0.57807,0.889381,1.174007,1.0,0.08568,2.19167,0.423439,0.696925,0.543727,0.826225
4,(age_group_senior),"(avg_glucose_bin_(54.999, 145.0])",0.649969,0.790064,0.468422,0.720684,0.912184,1.0,-0.045095,0.751607,-0.215706,0.482109,-0.330482,0.656788
5,"(ever_married_Yes, age_group_senior)","(avg_glucose_bin_(54.999, 145.0])",0.57807,0.790064,0.409072,0.707651,0.895689,1.0,-0.04764,0.718102,-0.216311,0.426534,-0.39256,0.612711
6,"(ever_married_Yes, avg_glucose_bin_(54.999, 14...",(age_group_senior),0.569327,0.649969,0.409072,0.718519,1.105466,1.0,0.039027,1.243531,0.221523,0.504888,0.195838,0.673945
7,"(age_group_senior, avg_glucose_bin_(54.999, 14...",(ever_married_Yes),0.468422,0.75756,0.409072,0.873298,1.152777,1.0,0.054214,1.913468,0.249314,0.500755,0.477389,0.706642
8,(age_group_senior),"(ever_married_Yes, avg_glucose_bin_(54.999, 14...",0.649969,0.569327,0.409072,0.629372,1.105466,1.0,0.039027,1.162007,0.272558,0.504888,0.13942,0.673945
9,(work_type_Private),(ever_married_Yes),0.604814,0.75756,0.487657,0.806293,1.064328,1.0,0.029474,1.251577,0.152941,0.557502,0.201008,0.725006


I initially used the SMOTE technique to generate more data for the minority class (stroke cases) and then used this augmented dataset to create association rules. However, there were only a few association rules specifically related to stroke. Now, I want to take a different approach by using only the original stroke-positive data — before applying SMOTE — to generate association rules and compare the results.

In [12]:
stroke_positive_data = cp_dataframe[cp_dataframe["stroke"]==1]

In [13]:
stroke_positive_data

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,smoking_status,stroke,age_group,bmi_bin,avg_glucose_bin
0,Male,0,1,Yes,Private,Urban,formerly smoked,1,senior,"(30.0, 40.0]","(145.0, 235.0]"
1,Female,0,0,Yes,Self-employed,Rural,never smoked,1,senior,"(30.0, 40.0]","(145.0, 235.0]"
2,Male,0,1,Yes,Private,Rural,never smoked,1,senior,"(30.0, 40.0]","(54.999, 145.0]"
3,Female,0,0,Yes,Private,Urban,smokes,1,middle age,"(30.0, 40.0]","(145.0, 235.0]"
4,Female,1,0,Yes,Self-employed,Rural,never smoked,1,senior,"(20.0, 30.0]","(145.0, 235.0]"
...,...,...,...,...,...,...,...,...,...,...,...
244,Male,0,0,Yes,Private,Rural,Unknown,1,senior,"(30.0, 40.0]","(54.999, 145.0]"
245,Female,0,0,No,children,Rural,Unknown,1,young adult,"(30.0, 40.0]","(54.999, 145.0]"
246,Female,0,0,Yes,Self-employed,Rural,formerly smoked,1,senior,"(20.0, 30.0]","(54.999, 145.0]"
247,Male,1,0,Yes,Self-employed,Rural,Unknown,1,senior,"(30.0, 40.0]","(54.999, 145.0]"


We have got only 249 samples, lets check if there any rules could be retrieved .

In [16]:
#lets one-hot encode the data set 
stroke_encoded = pd.get_dummies(stroke_positive_data)

# Step 3: Convert to True/False format (optional, but cleaner for FP-Growth)
stroke_encoded = stroke_encoded.astype(bool)

In [17]:
frequent_itemsets_stroke = fpgrowth(stroke_encoded, min_support=0.4, use_colnames=True,verbose=1)


11 itemset(s) from tree conditioned on items ()
0 itemset(s) from tree conditioned on items (stroke)
1 itemset(s) from tree conditioned on items (age_group_senior)
3 itemset(s) from tree conditioned on items (ever_married_Yes)
3 itemset(s) from tree conditioned on items (work_type_Private)
0 itemset(s) from tree conditioned on items (work_type_Private, stroke)
1 itemset(s) from tree conditioned on items (work_type_Private, ever_married_Yes)
3 itemset(s) from tree conditioned on items (work_type_Private, age_group_senior)
3 itemset(s) from tree conditioned on items (Residence_type_Urban)
0 itemset(s) from tree conditioned on items (Residence_type_Urban, stroke)
1 itemset(s) from tree conditioned on items (Residence_type_Urban, age_group_senior)
3 itemset(s) from tree conditioned on items (Residence_type_Urban, ever_married_Yes)
3 itemset(s) from tree conditioned on items (bmi_bin_(30.0, 40.0])
0 itemset(s) from tree conditioned on items (bmi_bin_(30.0, 40.0], stroke)
1 itemset(s) from t

In [20]:
frequent_itemsets_stroke['itemsets']

0                                              (stroke)
1                                    (age_group_senior)
2                                    (ever_married_Yes)
3                                   (work_type_Private)
4                                (Residence_type_Urban)
                            ...                        
56    (avg_glucose_bin_(54.999, 145.0], age_group_se...
57    (avg_glucose_bin_(54.999, 145.0], age_group_se...
58                       (bmi_bin_(20.0, 30.0], stroke)
59             (age_group_senior, bmi_bin_(20.0, 30.0])
60     (age_group_senior, bmi_bin_(20.0, 30.0], stroke)
Name: itemsets, Length: 61, dtype: object

In [18]:
# Step 5: Generate rules
rules_stroe = association_rules(frequent_itemsets_stroke, metric='confidence', min_threshold=0.6)

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


In [19]:
rules_stroe

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(age_group_senior),(stroke),0.907631,1.000000,0.907631,1.000000,1.000000,1.0,0.000000,inf,0.000000,0.907631,0.000000,0.953815
1,(stroke),(age_group_senior),1.000000,0.907631,0.907631,0.907631,1.000000,1.0,0.000000,1.000000,0.000000,0.907631,0.000000,0.953815
2,(ever_married_Yes),(stroke),0.883534,1.000000,0.883534,1.000000,1.000000,1.0,0.000000,inf,0.000000,0.883534,0.000000,0.941767
3,(stroke),(ever_married_Yes),1.000000,0.883534,0.883534,0.883534,1.000000,1.0,0.000000,1.000000,0.000000,0.883534,0.000000,0.941767
4,(age_group_senior),(ever_married_Yes),0.907631,0.883534,0.811245,0.893805,1.011625,1.0,0.009322,1.096720,0.124408,0.827869,0.088190,0.905994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,"(bmi_bin_(20.0, 30.0])",(stroke),0.445783,1.000000,0.445783,1.000000,1.000000,1.0,0.000000,inf,0.000000,0.445783,0.000000,0.722892
126,"(bmi_bin_(20.0, 30.0])",(age_group_senior),0.445783,0.907631,0.405622,0.909910,1.002511,1.0,0.001016,1.025301,0.004520,0.427966,0.024677,0.678406
127,"(age_group_senior, bmi_bin_(20.0, 30.0])",(stroke),0.405622,1.000000,0.405622,1.000000,1.000000,1.0,0.000000,inf,0.000000,0.405622,0.000000,0.702811
128,"(bmi_bin_(20.0, 30.0], stroke)",(age_group_senior),0.445783,0.907631,0.405622,0.909910,1.002511,1.0,0.001016,1.025301,0.004520,0.427966,0.024677,0.678406


In [25]:
rules_stroe[rules_stroe['consequents']==frozenset({'stroke'})]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(age_group_senior),(stroke),0.907631,1.0,0.907631,1.0,1.0,1.0,0.0,inf,0.0,0.907631,0.0,0.953815
2,(ever_married_Yes),(stroke),0.883534,1.0,0.883534,1.0,1.0,1.0,0.0,inf,0.0,0.883534,0.0,0.941767
6,"(age_group_senior, ever_married_Yes)",(stroke),0.811245,1.0,0.811245,1.0,1.0,1.0,0.0,inf,0.0,0.811245,0.0,0.905622
12,(work_type_Private),(stroke),0.598394,1.0,0.598394,1.0,1.0,1.0,0.0,inf,0.0,0.598394,0.0,0.799197
16,"(work_type_Private, ever_married_Yes)",(stroke),0.542169,1.0,0.542169,1.0,1.0,1.0,0.0,inf,0.0,0.542169,0.0,0.771084
22,"(work_type_Private, age_group_senior)",(stroke),0.534137,1.0,0.534137,1.0,1.0,1.0,0.0,inf,0.0,0.534137,0.0,0.767068
28,"(work_type_Private, ever_married_Yes, age_grou...",(stroke),0.481928,1.0,0.481928,1.0,1.0,1.0,0.0,inf,0.0,0.481928,0.0,0.740964
34,(Residence_type_Urban),(stroke),0.542169,1.0,0.542169,1.0,1.0,1.0,0.0,inf,0.0,0.542169,0.0,0.771084
37,"(age_group_senior, Residence_type_Urban)",(stroke),0.497992,1.0,0.497992,1.0,1.0,1.0,0.0,inf,0.0,0.497992,0.0,0.748996
40,"(ever_married_Yes, Residence_type_Urban)",(stroke),0.457831,1.0,0.457831,1.0,1.0,1.0,0.0,inf,0.0,0.457831,0.0,0.728916
