## Market Basket Analysis on Grocery Dataset

In [1]:
#load packages
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Read the raw grocery purchase records from the CSV file and preview the first ten rows
df = pd.read_csv('Groceries_dataset.csv')
df.head(10)

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk
5,4941,14-02-2015,rolls/buns
6,4501,08-05-2015,other vegetables
7,3803,23-12-2015,pot plants
8,2762,20-03-2015,whole milk
9,4119,12-02-2015,tropical fruit


In [3]:
# Store both text columns with pandas' dedicated 'string' dtype instead of generic object
df['Date'] = df['Date'].astype('string')
# Item names are additionally trimmed of leading/trailing whitespace
df['itemDescription'] = df['itemDescription'].astype('string').str.strip()

In [4]:
# Tag every row with a quantity of 1 so that purchases can be summed per member
# and item when the basket matrix is built.
Quantity = 1
df['Quantity'] = Quantity

In [5]:
# Inspect the dtype of every column in df
df.dtypes

Member_number       int64
Date               string
itemDescription    string
Quantity            int64
dtype: object

In [6]:
# How many individual purchase records exist in the data? (shape is (rows, columns))
df.shape

(38765, 4)

In [7]:
# Build a member-by-item matrix: one row per Member_number, one column per
# itemDescription, each cell holding the summed Quantity (0 where there is no purchase).
# NOTE(review): a "basket" here is everything a member bought across all dates, not a
# single shopping trip (Date is not part of the grouping) — confirm this is intended.
baskets = (
    df.groupby(['Member_number', 'itemDescription'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('Member_number')
)
baskets.head()

itemDescription,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0
1002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0


In [21]:
# Converting all positive values to 1 and everything else to 0
def encode(x):
    """Binarise a basket cell: 1 if the quantity is positive, else 0.

    Parameters
    ----------
    x : number
        Quantity stored in one cell of the basket matrix.

    Returns
    -------
    int
        1 when ``x`` is strictly positive, otherwise 0.
    """
    # A single comparison covers every input: fractional quantities between 0 and 1
    # map to 1 and NaN maps to 0, instead of falling through and returning None.
    return 1 if x > 0 else 0

# Encode column by column with Series.map: DataFrame.applymap is deprecated since
# pandas 2.1, while this form gives the same result on old and new pandas versions.
basket_sets = baskets.apply(lambda col: col.map(encode))

In [22]:
# Preview the first rows of the encoded basket matrix
basket_sets.head()

itemDescription,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1001,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,1,0,1,0,0
1002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [23]:
# Mine itemsets that appear in at least 7% of the rows (members) of basket_sets, then
# derive association rules whose lift is at least 1, i.e. antecedent and consequent
# co-occur no less often than expected under independence.
# mlxtend expects a boolean one-hot frame; 0/1 integers still work but trigger a
# deprecation warning about worse performance, so cast explicitly.
frequent_itemsets = apriori(basket_sets.astype(bool), min_support=0.07, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.head(10)



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(bottled beer),(whole milk),0.158799,0.458184,0.085428,0.537964,1.174124,0.012669,1.172672
1,(whole milk),(bottled beer),0.458184,0.158799,0.085428,0.18645,1.174124,0.012669,1.033988
2,(other vegetables),(bottled water),0.376603,0.213699,0.093894,0.249319,1.16668,0.013414,1.04745
3,(bottled water),(other vegetables),0.213699,0.376603,0.093894,0.439376,1.16668,0.013414,1.111969
4,(rolls/buns),(bottled water),0.349666,0.213699,0.079271,0.226706,1.060863,0.004548,1.01682
5,(bottled water),(rolls/buns),0.213699,0.349666,0.079271,0.370948,1.060863,0.004548,1.033832
6,(soda),(bottled water),0.313494,0.213699,0.076193,0.243044,1.137318,0.009199,1.038767
7,(bottled water),(soda),0.213699,0.313494,0.076193,0.356543,1.137318,0.009199,1.066902
8,(whole milk),(bottled water),0.458184,0.213699,0.112365,0.245241,1.147597,0.014452,1.04179
9,(bottled water),(whole milk),0.213699,0.458184,0.112365,0.52581,1.147597,0.014452,1.142615


In [24]:
# Keep only the rules whose confidence is at least 0.3
rules.loc[rules['confidence'].ge(0.3)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(bottled beer),(whole milk),0.158799,0.458184,0.085428,0.537964,1.174124,0.012669,1.172672
3,(bottled water),(other vegetables),0.213699,0.376603,0.093894,0.439376,1.16668,0.013414,1.111969
5,(bottled water),(rolls/buns),0.213699,0.349666,0.079271,0.370948,1.060863,0.004548,1.033832
7,(bottled water),(soda),0.213699,0.313494,0.076193,0.356543,1.137318,0.009199,1.066902
9,(bottled water),(whole milk),0.213699,0.458184,0.112365,0.52581,1.147597,0.014452,1.142615
10,(canned beer),(whole milk),0.165213,0.458184,0.087224,0.52795,1.152268,0.011526,1.147795
13,(citrus fruit),(other vegetables),0.18548,0.376603,0.077476,0.417704,1.109135,0.007623,1.070584
15,(citrus fruit),(rolls/buns),0.18548,0.349666,0.071832,0.387275,1.107556,0.006976,1.06138
16,(citrus fruit),(whole milk),0.18548,0.458184,0.092355,0.497925,1.086737,0.007371,1.079155
19,(domestic eggs),(whole milk),0.133145,0.458184,0.070292,0.527938,1.152242,0.009287,1.147766
