In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
from apyori import apriori

In [2]:
df = pd.read_csv('Groceries_dataset.csv', sep=',')
df.head()


Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [3]:
df.dtypes

Member_number       int64
Date               object
itemDescription    object
dtype: object

In [4]:
val_count = df['itemDescription'].value_counts()
print(val_count)
all_products = df['itemDescription'].unique()
print('\nTotal Products Sold: {} product'.format(len(all_products)))

whole milk               2502
other vegetables         1898
rolls/buns               1716
soda                     1514
yogurt                   1334
                         ... 
frozen chicken              5
bags                        4
baby cosmetics              3
kitchen utensil             1
preservation products       1
Name: itemDescription, Length: 167, dtype: int64

Total Products Sold: 167 product


In [5]:
def distribution_plot(x, y, name=None, xaxis=None, yaxis=None):
    fig = go.Figure([go.Bar(x=x, y=y)])
    fig.update_layout(title_text=name, xaxis_title=xaxis, yaxis_title=yaxis)
    fig.show()

x = df['itemDescription'].value_counts()
x = x.sort_values(ascending=False)
x = x[:10]
distribution_plot(x.index, x.values, name='10 Products Most Sold Out', xaxis='Products', yaxis='Count')

In [6]:
one_hot = pd.get_dummies(df['itemDescription'])
df.drop('itemDescription', inplace=True, axis=1)
df = df.join(one_hot)
df.head()

Unnamed: 0,Member_number,Date,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,1808,21-07-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2552,05-01-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2300,19-09-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1187,12-12-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3037,01-02-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
record = df.groupby(['Member_number', 'Date'])[all_products[:]].apply(sum)
record = record.reset_index()[all_products]

def get_Pnames(x):
    for product in all_products:
        if x[product] > 0:
            x[product] = product
    return x

record = record.apply(get_Pnames, axis=1)
print('Total Transacttions: {} Transactions'.format(len(record)))
record.head(10)

Total Transacttions: 14963 Transactions


Unnamed: 0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
0,0,whole milk,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,whole milk,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,frankfurter,0,...,0,0,0,0,0,0,0,0,0,0
6,0,whole milk,0,0,rolls/buns,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,whole milk,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,beef,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,frankfurter,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
x = record.values
x = [sub[~(sub == 0)].tolist() for sub in x if sub[sub != 0].tolist()]
transactions = x
transactions[0:10]

[['whole milk', 'yogurt', 'sausage', 'semi-finished bread'],
 ['whole milk', 'pastry', 'salty snack'],
 ['canned beer', 'misc. beverages'],
 ['sausage', 'hygiene articles'],
 ['soda', 'pickled vegetables'],
 ['frankfurter', 'curd'],
 ['whole milk', 'rolls/buns', 'sausage'],
 ['whole milk', 'soda'],
 ['beef', 'white bread'],
 ['frankfurter', 'soda', 'whipped/sour cream']]

In [9]:
rules = apriori(transactions, min_support=0.00030, min_confidance=0.08, min_lift=6, min_length=2, target='rules')
association_results = list(rules)
association_results

[RelationRecord(items=frozenset({'seasonal products', 'soups'}), support=0.0003341575887188398, ordered_statistics=[OrderedStatistic(items_base=frozenset({'seasonal products'}), items_add=frozenset({'soups'}), confidence=0.04716981132075471, lift=14.704205974842766), OrderedStatistic(items_base=frozenset({'soups'}), items_add=frozenset({'seasonal products'}), confidence=0.10416666666666667, lift=14.704205974842768)]),
 RelationRecord(items=frozenset({'canned beer', 'frozen vegetables', 'brown bread'}), support=0.0003341575887188398, ordered_statistics=[OrderedStatistic(items_base=frozenset({'brown bread'}), items_add=frozenset({'canned beer', 'frozen vegetables'}), confidence=0.008880994671403198, lift=6.644316163410303), OrderedStatistic(items_base=frozenset({'canned beer'}), items_add=frozenset({'frozen vegetables', 'brown bread'}), confidence=0.007122507122507123, lift=7.612433862433862), OrderedStatistic(items_base=frozenset({'frozen vegetables', 'brown bread'}), items_add=frozense

In [10]:
for item in association_results:
    pair = item[0]
    items = [x for x in pair]
    print('Rule: ' + items[0] + ' -> ' + items[1])
    print('Support: ' + str(item[1]))
    print('Confidence: ' + str(item[2][0][2]))
    print('Lift: ' + str(item[2][0][3]))
    print('=========================================')

Rule: seasonal products -> soups
Support: 0.0003341575887188398
Confidence: 0.04716981132075471
Lift: 14.704205974842766
Rule: canned beer -> frozen vegetables
Support: 0.0003341575887188398
Confidence: 0.008880994671403198
Lift: 6.644316163410303
Rule: curd -> sausage
Support: 0.0003341575887188398
Confidence: 0.005537098560354374
Lift: 8.28516057585825
Rule: soda -> whole milk
Support: 0.0003341575887188398
Confidence: 0.09090909090909091
Lift: 7.817659352142111
