In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import itertools
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder
# One-hot encoder instance created up front.
# NOTE(review): not referenced in any of the visible cells — confirm it is still needed.
cat_encoder = OneHotEncoder()
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth

In [2]:
# Read the stroke dataset from a CSV file into a DataFrame

print("loading data")

stroke_dataframe = pd.read_csv("Stroke_Dataset.csv")

loading data


In [3]:

# Balance the two "Stroke" classes by randomly under-sampling the majority class.
#
# The "Stroke" column is used as the resampling target but is deliberately left
# inside the frame, because the later itemset-mining cell mines the whole frame.

# drop(..., errors="ignore") instead of `del` so re-running this cell does not
# raise KeyError once "id" has already been removed.
stroke_dataframe = stroke_dataframe.drop(columns=["id"], errors="ignore")
targets = stroke_dataframe["Stroke"]

# sampling_strategy=1 asks for a 1:1 minority/majority ratio after resampling.
# random_state is fixed so that Restart & Run All reproduces the same sample
# (and therefore the same itemsets and rules downstream).
under = RandomUnderSampler(sampling_strategy=1, random_state=42)

# `k` receives the resampled targets; it is not used afterwards in the visible cells.
stroke_dataframe, k = under.fit_resample(stroke_dataframe, targets)

In [4]:
# Sweep `min_support` for each frequent-itemset miner and gather every result
# into a single frame, `summary_df`, with these columns:
#   support, itemsets  - as returned by the mlxtend miner
#   model              - name of the miner that produced the row
#   min_support        - threshold that was in effect for that run
models = ['apriori', 'fpmax', 'fpgrowth']       # labels written to the 'model' column
param_list = np.linspace(.1, .9, 8)             # min_support values to try
min_support = .1                                # initial value; overwritten inside the sweep
i = 0                                           # index into `models`; rebound by enumerate below

# Store the miners as callables. Calling them here (as the previous version did)
# evaluates each one exactly once with the initial min_support, so the sweep
# below would only re-append the same result for every threshold.
model_set = [apriori, fpmax, fpgrowth]

result_frames = []                              # one frame per (model, min_support) run
for i, model in enumerate(model_set):
    for k in range(len(param_list)):
        min_support = param_list[k]             # next threshold from the parameter grid
        # Actually run the miner with the current threshold.
        model_df = model(stroke_dataframe, min_support=min_support, max_len=3, use_colnames=True)
        if model_df.empty:
            # Nothing reached this threshold; skipping keeps empty frames out of the concat.
            continue
        model_df['model'] = models[i]           # tag rows with the miner's name
        model_df['min_support'] = min_support   # tag rows with the threshold used
        result_frames.append(model_df)

# Concatenate once (instead of growing the frame inside the loop) and renumber
# the rows so the index has no duplicates.
if result_frames:
    summary_df = pd.concat(result_frames, axis=0, ignore_index=True)
else:
    summary_df = pd.DataFrame(columns=['support', 'itemsets', 'model', 'min_support'])




In [5]:
# Show the current contents of stroke_dataframe as this cell's output.
stroke_dataframe

Unnamed: 0,Young,Middle Aged,Old,Hypertension,Heart Disease,Married,Diabetic,Not Diabetic,Underweight,Normal Weight,Overweight,Obese,Stroke,Male,Female,Employed,Never Smoked,Smokes or Smoked,Urban,Rural
0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0
1,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0
2,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0
3,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,1,1,0,0,1
4,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0,0,1,0,0,1,0,1,0,1,0,0,1,0,1,1,0,1,1,0
414,0,0,1,0,1,1,0,1,0,0,0,1,1,0,1,1,0,1,0,1
415,0,1,0,1,0,1,0,1,0,0,0,1,1,0,1,1,1,0,0,1
416,0,0,1,0,0,1,0,1,0,1,0,0,1,1,0,1,0,0,1,0


In [6]:
from mlxtend.frequent_patterns import association_rules

# Build association rules from the collected itemsets, keeping those whose
# confidence is at least 0.7.
assoc_df = association_rules(summary_df, metric="confidence", min_threshold=0.7)

# Narrow to rules with lift above 0.75 and conviction above 1, then display them.
assoc_df1 = assoc_df.loc[assoc_df['lift'].gt(.75) & assoc_df['conviction'].gt(1)]
assoc_df1

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Young),(Not Diabetic),0.244019,0.715311,0.222488,0.911765,1.274641,0.047938,3.226475
1,(Middle Aged),(Married),0.311005,0.741627,0.287081,0.923077,1.244665,0.056432,3.358852
2,(Middle Aged),(Not Diabetic),0.311005,0.715311,0.236842,0.761538,1.064626,0.014377,1.193857
3,(Middle Aged),(Employed),0.311005,0.918660,0.311005,1.000000,1.088542,0.025297,inf
4,(Old),(Married),0.444976,0.741627,0.401914,0.903226,1.217898,0.071908,2.669856
...,...,...,...,...,...,...,...,...,...
288,"(Urban, Never Smoked)",(Female),0.191388,0.595694,0.138756,0.725000,1.217068,0.024748,1.470204
289,"(Urban, Never Smoked)",(Employed),0.191388,0.918660,0.186603,0.975000,1.061328,0.010783,3.253589
290,"(Rural, Never Smoked)",(Employed),0.181818,0.918660,0.177033,0.973684,1.059896,0.010004,3.090909
291,"(Urban, Smokes or Smoked)",(Employed),0.200957,0.918660,0.200957,1.000000,1.088542,0.016346,inf
