# ASSOCIATION RULES - APRIORI - Retail

In [2]:
pip install mlxtend

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import Libraries
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [5]:
# load the dataset
df = pd.read_csv('retail_dataset.csv', sep=',')
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


In [6]:
# shape
df.shape

(315, 7)

In [7]:
df.values

array([['Bread', 'Wine', 'Eggs', ..., 'Cheese', 'Pencil', 'Diaper'],
       ['Bread', 'Cheese', 'Meat', ..., 'Wine', 'Milk', 'Pencil'],
       ['Cheese', 'Meat', 'Eggs', ..., 'Wine', nan, nan],
       ...,
       ['Bread', 'Cheese', 'Eggs', ..., 'Pencil', 'Diaper', 'Wine'],
       ['Meat', 'Cheese', nan, ..., nan, nan, nan],
       ['Eggs', 'Wine', 'Bagel', ..., 'Meat', nan, nan]], dtype=object)

In [10]:
# how many different products are there
items = (df['0'].unique())
len(items)

9

In [11]:
# one-hot encoding - (For theory only)
encoded_vals = []
for index, row in df.iterrows(): 
    labels = {}
    uncommons = list(set(items) - set(row))
    commons = list(set(items).intersection(row))
    for uc in uncommons:
        labels[uc] = 0
    for com in commons:
        labels[com] = 1
    encoded_vals.append(labels)

In [12]:
# Create ohe_df
ohe_df = pd.DataFrame(encoded_vals) 

In [13]:
ohe_df.head()

Unnamed: 0,Bagel,Milk,Cheese,Wine,Diaper,Pencil,Meat,Bread,Eggs
0,0,0,1,1,1,1,1,1,1
1,0,1,1,1,1,1,1,1,0
2,0,1,1,1,0,0,1,0,1
3,0,1,1,1,0,0,1,0,1
4,0,0,0,1,0,1,1,0,0


In [19]:
# set a support value and select those above that value. (Support = 0.2)
freq_items = apriori(ohe_df, min_support = 0.2, use_colnames = True, verbose = 1)
freq_items

Processing 4 combinations | Sampling itemset size 4 3


Unnamed: 0,support,itemsets
0,0.425397,(Bagel)
1,0.501587,(Milk)
2,0.501587,(Cheese)
3,0.438095,(Wine)
4,0.406349,(Diaper)
5,0.361905,(Pencil)
6,0.47619,(Meat)
7,0.504762,(Bread)
8,0.438095,(Eggs)
9,0.225397,"(Bagel, Milk)"


In [21]:
# # Sorting by support values
freq_items.sort_values("support", ascending = False)

Unnamed: 0,support,itemsets
7,0.504762,(Bread)
2,0.501587,(Cheese)
1,0.501587,(Milk)
6,0.47619,(Meat)
3,0.438095,(Wine)
8,0.438095,(Eggs)
0,0.425397,(Bagel)
4,0.406349,(Diaper)
5,0.361905,(Pencil)
19,0.32381,"(Cheese, Meat)"


In [22]:
# The lengths of itemsets
freq_items['length'] = freq_items['itemsets'].apply(lambda x:len(x))
freq_items

Unnamed: 0,support,itemsets,length
0,0.425397,(Bagel),1
1,0.501587,(Milk),1
2,0.501587,(Cheese),1
3,0.438095,(Wine),1
4,0.406349,(Diaper),1
5,0.361905,(Pencil),1
6,0.47619,(Meat),1
7,0.504762,(Bread),1
8,0.438095,(Eggs),1
9,0.225397,"(Bagel, Milk)",2


In [23]:
# review of all metrics
association_rules(freq_items, metric="confidence", min_threshold = 0.6)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265
1,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
2,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
3,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754
4,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891
5,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754
6,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203
7,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624
8,"(Cheese, Meat)",(Milk),0.32381,0.501587,0.203175,0.627451,1.250931,0.040756,1.337845
9,"(Cheese, Milk)",(Meat),0.304762,0.47619,0.203175,0.666667,1.4,0.05805,1.571429


In [24]:
# Let's create a data set and keep this information there.
df_ar = association_rules(freq_items, metric = "confidence", min_threshold = 0.6)
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265
1,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
2,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
3,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754
4,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891
5,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754
6,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203
7,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624
8,"(Cheese, Meat)",(Milk),0.32381,0.501587,0.203175,0.627451,1.250931,0.040756,1.337845
9,"(Cheese, Milk)",(Meat),0.304762,0.47619,0.203175,0.666667,1.4,0.05805,1.571429


In [25]:
df_ar[(df_ar.support < 0.3) & (df_ar.confidence > 0.7)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
10,"(Meat, Milk)",(Cheese),0.244444,0.501587,0.203175,0.831169,1.657077,0.080564,2.952137
12,"(Cheese, Eggs)",(Meat),0.298413,0.47619,0.215873,0.723404,1.519149,0.073772,1.893773
13,"(Meat, Eggs)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,0.082116,2.616667
