In [1]:
import pandas as pd
import numpy as np

In [2]:
from re import split as splitStr

# Column labels, in file order; they are handed to read_csv through `names=`.
originalCols = """Class, Cap-shape, Cap-surface,
            Cap-color, Bruises, Odor, Gill-attachment, Gill-spacing,
            Gill-size, Gill-color, Stalk-shape, Stalk-root, Stalk-surface-above-ring,
            Stalk-surface-below-ring, Stalk-color-above-ring, Stalk-color-below-ring,
            Veil-type, Veil-color, Ring-number, Ring-type, Spore-print-color,
            Population, Habitat"""
# Split on a comma plus any following whitespace (this also swallows the
# newlines and indentation inside the triple-quoted string).
# Raw string: "\s" in a normal string literal is an invalid escape sequence
# and triggers a warning on current Python versions.
originalCols = splitStr(r',\s*', originalCols)

dataSourcePath = "data/original/agaricus-lepiota.data"

# Load the data set, labelling the columns with the names built above.
originalDF = pd.read_csv(dataSourcePath, names=originalCols)

# Data peeking

In [3]:
# Peek at 5 randomly chosen rows (no random_state, so the rows differ between runs)
originalDF.sample(n=5)

Unnamed: 0,Class,Cap-shape,Cap-surface,Cap-color,Bruises,Odor,Gill-attachment,Gill-spacing,Gill-size,Gill-color,...,Stalk-surface-below-ring,Stalk-color-above-ring,Stalk-color-below-ring,Veil-type,Veil-color,Ring-number,Ring-type,Spore-print-color,Population,Habitat
5247,p,x,y,y,f,f,f,c,b,p,...,k,p,p,p,w,o,l,h,v,p
3639,e,f,y,n,t,n,f,c,b,w,...,s,w,p,p,w,o,p,n,y,d
624,e,f,f,g,f,n,f,c,n,p,...,s,w,w,p,w,o,p,k,y,u
4452,p,x,y,g,f,f,f,c,b,g,...,k,n,n,p,w,o,l,h,y,g
5467,p,f,s,w,t,f,f,c,b,w,...,s,w,w,p,w,o,p,h,v,u


In [4]:
# Number of distinct values found in each column
originalDF.nunique(axis=0)

Class                        2
Cap-shape                    6
Cap-surface                  4
Cap-color                   10
Bruises                      2
Odor                         9
Gill-attachment              2
Gill-spacing                 2
Gill-size                    2
Gill-color                  12
Stalk-shape                  2
Stalk-root                   5
Stalk-surface-above-ring     4
Stalk-surface-below-ring     4
Stalk-color-above-ring       9
Stalk-color-below-ring       9
Veil-type                    1
Veil-color                   4
Ring-number                  3
Ring-type                    5
Spore-print-color            9
Population                   6
Habitat                      7
dtype: int64

# Data Processing

In [5]:
# Veil-type only ever takes the value "p", so it carries no information: remove it
noVeilDF = originalDF.drop(columns=["Veil-type"])

In [6]:
# Creating oneHotDF: one boolean column per (attribute, value) pair
from sklearn.preprocessing import OneHotEncoder

workingDF = noVeilDF

# Ask for a dense array. scikit-learn renamed the `sparse` argument to
# `sparse_output` and later removed the old name, so try the new spelling
# first and fall back to the old one on older installations.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)
enc.fit(workingDF)

# Likewise `get_feature_names` was superseded by `get_feature_names_out`;
# use whichever one this scikit-learn version provides.
if hasattr(enc, "get_feature_names_out"):
    oneHotCols = enc.get_feature_names_out(workingDF.columns)
else:
    oneHotCols = enc.get_feature_names(workingDF.columns)

oneHotDF = pd.DataFrame(
    enc.transform(workingDF).astype(bool),
    columns=oneHotCols,
    # Keep the row labels of the source frame so both frames stay aligned.
    index=workingDF.index,
)

# Frequent itemsets mining

In [7]:
# Applying apriori frequent pattern mining algorithm
from mlxtend.frequent_patterns import apriori

# Minimum support: half of the proportion of poisonous ("p") entries
poisonousShare = (originalDF["Class"] == "p").sum() / originalDF.shape[0]
minSupport = poisonousShare / 2
# At least 0.2: refuse to mine with a lower threshold
if minSupport < 0.2:
    raise RuntimeError(
        f"minSupport is {minSupport:.4f}, below the required minimum of 0.2"
    )

# use_colnames is left at its default, so itemsets hold the column positions
# of oneHotDF rather than the column names; later cells rely on those integers.
freqDF = apriori(oneHotDF, min_support=minSupport, low_memory=True)

In [8]:
# Derive association rules from the frequent itemsets
from mlxtend.frequent_patterns import association_rules

# metric / min_threshold are spelled out with mlxtend's documented default values
rules = association_rules(
    freqDF,
    metric="confidence",
    min_threshold=0.8,
    support_only=False,
)

In [9]:
# How many rules were generated? (A lot!)
rules.shape[0]

56630

# Analysis

In [10]:
# Filter interesting rules: exactly 3 antecedents, "poisonous" as the only
# consequent, and a confidence different from 100%.

# Itemsets store column positions of oneHotDF, so look up the position of the
# poisonous-class column by name instead of hard-coding it.
poisonousItem = oneHotDF.columns.get_loc("Class_p")

is3Antecentends = rules["antecedents"].apply(len) == 3
isPoisConsequent = rules["consequents"].apply(lambda el: el == {poisonousItem})
isNot100Confidence = rules["confidence"] != 1
filteredRules = rules[is3Antecentends & isPoisConsequent & isNot100Confidence]

In [11]:
# Show the 10 filtered rules with the highest lift
filteredRules.sort_values("lift", ascending=False).iloc[:10]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2161,"(34, 109, 22)",(1),0.322994,0.482029,0.308223,0.954268,1.979692,0.15253,11.326309
2243,"(86, 109, 22)",(1),0.322994,0.482029,0.308223,0.954268,1.979692,0.15253,11.326309
2181,"(86, 35, 22)",(1),0.412112,0.482029,0.392418,0.95221,1.975423,0.193768,10.838552
2120,"(34, 35, 22)",(1),0.409897,0.482029,0.390202,0.951952,1.974887,0.19262,10.780281
2208,"(86, 59, 22)",(1),0.291974,0.482029,0.274249,0.939292,1.948623,0.133509,8.532141
2361,"(34, 59, 86)",(1),0.289759,0.482029,0.272033,0.938828,1.94766,0.132361,8.467394
2133,"(34, 59, 22)",(1),0.289759,0.482029,0.272033,0.938828,1.94766,0.132361,8.467394
2140,"(34, 22, 63)",(1),0.283604,0.482029,0.265879,0.9375,1.944906,0.129174,8.287543
2222,"(63, 86, 22)",(1),0.283604,0.482029,0.265879,0.9375,1.944906,0.129174,8.287543
2375,"(34, 86, 63)",(1),0.283604,0.482029,0.265879,0.9375,1.944906,0.129174,8.287543


In [12]:
# Analysis: based on lift, leverage, confidence, etc., decide which attributes to use as the pattern
choiceAttributes = {86, 109, 22}
# Keep the rule(s) whose antecedent itemset is exactly the chosen attribute set
choiceRule = filteredRules[
    filteredRules["antecedents"]
    .map(lambda itemset: itemset == choiceAttributes)
    .astype(bool)
]

In [13]:
# Translate the chosen column positions back into one-hot column names
choiceAttNames = oneHotDF.columns[list(choiceAttributes)].tolist()

In [14]:
# Display the one-hot column names of the chosen antecedent attributes
choiceAttNames

['Population_v', 'Veil-color_w', 'Bruises_f']