In [1]:
import numpy as np
import pandas as pd
import pyfpgrowth

In [2]:
# Read the preprocessed records and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv -- TODO confirm).
df = pd.read_csv('data/preprocessed_100k.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Optional while iterating: uncomment the next line to work on a 1,000-row
# random sample instead of the full frame.
# df = df.sample(n=1000)
print(df.shape)
df.head()

(12939864, 12)


Unnamed: 0,Resident_Status,Education,Month_Of_Death,Sex,Age_Recode_12,Place_Of_Death,Marital_Status,DOW_of_Death,Injured_At_Work,Manner_Of_Death,Race,Hispanic_Origin_Recode
0,1,99,5,M,11,7,W,1,U,9,1,6
1,1,99,5,M,9,1,M,5,U,9,1,6
2,2,99,5,F,11,7,W,1,U,9,1,6
3,1,99,5,M,11,6,M,6,U,9,1,6
4,2,99,6,M,10,4,M,7,U,9,1,6


In [4]:
# Code -> label lookup tables, keyed by the DataFrame column they decode.
# Built once when the cell runs, so the per-value decoder below does not
# re-create twelve dictionaries on every single call.
# NOTE(review): 'Differetn' and 'asain' are misspelled, but the labels are kept
# verbatim because they are emitted into the data and into the mined rules.
_CODE_LABELS = {
    'Resident_Status': {
        1:'Resident', 2:'Same State Non-Resident',
        3:'Differetn State Non-Resident', 4:'Foreign Residents',
    },
    'Education': {
        0:'No Formal Education',
        1:'1 year of elementary school', 2:'2 years of elementary school',
        3:'3 years of elementary school', 4:'4 years of elementary school',
        5:'5 years of elementary school', 6:'6 years of elementary school',
        7:'7 years of elementary school', 8:'8 years of elementary school',
        9:'1 year of high school', 10:'2 years of high school',
        11:'3 years of high school', 12:'4 years of high school',
        13:'1 year of college', 14:'2 years of college',
        15:'3 years of college', 16:'4 years of college',
        17:'5 or more years of college', 99:'education not stated',
    },
    'Month_Of_Death': {
        1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',
        7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec',
    },
    'Sex': {'M':'Male', 'F':'Female'},
    'Age_Recode_12': {
        1:'under 1 year', 2:'1-4 years', 3:'5-14 years', 4:'15-24 years',
        5:'25-34 years', 6:'35-44 years', 7:'45-54 years', 8:'55-64 years',
        9:'65-74 years', 10:'75-84 years', 11:'85 years and over',
        12:'age not stated',
    },
    'Place_Of_Death': {
        1:'Hospital-inpatient', 2:'Hospital-outpatient',
        3:'Hospital dead on arrival', 4:'home', 5:'hospice facility',
        6:'nursing home', 7:'other place', 9:'place unknown',
    },
    'Marital_Status': {
        'S':'Single', 'M':'Married', 'W':'Widowed', 'D':'Divorced',
        'U':'marital status unknown',
    },
    'DOW_of_Death': {
        1:'Sunday', 2:'Monday', 3:'Tuesday', 4:'Wednesday', 5:'Thursday',
        6:'Friday', 7:'Saturday', 9:'day unknown',
    },
    'Injured_At_Work': {'Y':'Yes', 'N':'No', 'U':'injured unknown'},
    'Manner_Of_Death': {
        1:'accident', 2:'suicide', 3:'homicide', 4:'pending investigation',
        5:'could not determine manner', 6:'self inflicted', 7:'natural',
        9:'manner of death unknown',
    },
    'Race': {
        1:'white', 2:'black', 3:'american indian', 4:'chinese', 5:'japanese',
        6:'hawaiian', 7:'filipino', 18:'asian indian', 28:'korean',
        38:'samoan', 48:'vietnamese', 58:'guamanian',
        68:'other asian or pacific islander',
        78:'combined other asain or pacific islander',
    },
    'Hispanic_Origin_Recode': {
        1:'mexican', 2:'puerto rican', 3:'cuban',
        4:'central or south american', 5:'other or unknown hispanic',
        6:'non-hispanic white', 7:'non-hispanic black',
        8:'non-hispanic other races', 9:'hispanic origin unknown',
    },
}


def convertToStringValues(value, column):
    """Translate one coded cell value into its human-readable label.

    Parameters
    ----------
    value : int or str
        The raw code stored in the cell (e.g. ``1`` or ``'M'``).
    column : str
        Name of the column the value came from; selects the lookup table.

    Returns
    -------
    str or object
        The label for ``value`` when ``column`` has a lookup table; otherwise
        ``value`` is returned unchanged.

    Raises
    ------
    KeyError
        If ``column`` has a lookup table but ``value`` is not one of its codes.
    """
    labels = _CODE_LABELS.get(column)
    if labels is None:
        # Columns without a table pass through untouched.
        return value
    # Deliberately a plain subscript: an unmapped code should fail loudly
    # rather than leak a raw code into the decoded data.
    return labels[value]

In [5]:
# Replace the coded values in every column with their text labels, one column
# at a time. This overwrites df in place, so the cell is not safe to re-run
# without first reloading df: the values are no longer the original codes.
for column_name in df.columns:
    decoded = df[column_name].apply(convertToStringValues, args=(column_name,))
    df[column_name] = decoded
df.head()

Unnamed: 0,Resident_Status,Education,Month_Of_Death,Sex,Age_Recode_12,Place_Of_Death,Marital_Status,DOW_of_Death,Injured_At_Work,Manner_Of_Death,Race,Hispanic_Origin_Recode
0,Resident,education not stated,May,Male,85 years and over,other place,Widowed,Sunday,injured unknown,manner of death unknown,white,non-hispanic white
1,Resident,education not stated,May,Male,65-74 years,Hospital-inpatient,Married,Thursday,injured unknown,manner of death unknown,white,non-hispanic white
2,Same State Non-Resident,education not stated,May,Female,85 years and over,other place,Widowed,Sunday,injured unknown,manner of death unknown,white,non-hispanic white
3,Resident,education not stated,May,Male,85 years and over,nursing home,Married,Friday,injured unknown,manner of death unknown,white,non-hispanic white
4,Same State Non-Resident,education not stated,Jun,Male,75-84 years,home,Married,Saturday,injured unknown,manner of death unknown,white,non-hispanic white


In [6]:
def rulesDF(rules):
    """Tabulate association rules, highest confidence first.

    ``rules`` maps each antecedent to a pair whose first element is the
    consequent and whose second element is the confidence. The result is a
    DataFrame with columns 'Antecedents', 'Consequents' and 'Confidence',
    one row per rule, ordered by descending confidence.
    """
    ranked = sorted(rules.items(), key=lambda item: item[1][1], reverse=True)
    return pd.DataFrame(data={
        'Antecedents': [antecedent for antecedent, _ in ranked],
        'Consequents': [outcome[0] for _, outcome in ranked],
        'Confidence': [outcome[1] for _, outcome in ranked],
    })

In [7]:
# Mine association rules over four of the decoded columns: each row becomes
# one transaction made of that row's four labels.
columns = ['Age_Recode_12', 'Place_Of_Death', 'Manner_Of_Death', 'Sex']
support_threshold = 10000    # passed to find_frequent_patterns
confidence_threshold = 0.8   # passed to generate_association_rules

transactions = df[columns].values.tolist()
patterns = pyfpgrowth.find_frequent_patterns(transactions, support_threshold)
rules = pyfpgrowth.generate_association_rules(patterns, confidence_threshold)
rules_df = rulesDF(rules)
rules_df

Unnamed: 0,Antecedents,Consequents,Confidence
0,"(65-74 years, Female, hospice facility)","(natural,)",0.931585
1,"(55-64 years, Female, hospice facility)","(natural,)",0.929952
2,"(65-74 years, Male, hospice facility)","(natural,)",0.927947
3,"(45-54 years, Female, hospice facility)","(natural,)",0.925526
4,"(55-64 years, Male, hospice facility)","(natural,)",0.924717
5,"(75-84 years, Male, hospice facility)","(natural,)",0.91685
6,"(75-84 years, Female, hospice facility)","(natural,)",0.915247
7,"(Male, hospice facility)","(natural,)",0.912677
8,"(45-54 years, Male, hospice facility)","(natural,)",0.912429
9,"(Female, hospice facility)","(natural,)",0.909834
