In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data

## Import

In [10]:
# The file has no header row (header=None), so pandas labels the columns
# with integers instead of consuming the first basket as column names.
dataset = pd.read_csv(
    'Market_Basket_Optimisation.csv',
    header=None,
)
dataset.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [11]:
# Sanity-check the dimensions (rows, columns) of the frame right after loading.
dataset.shape

(7501, 20)

In [12]:
# Label the columns Product1..ProductN by position rather than by doing
# arithmetic on the current labels: `label + 1` only works while the labels
# are still the default integers, so re-running the cell after the first
# rename raised a TypeError. Deriving the names from the column count makes
# the cell idempotent and gives the same result on the first run.
dataset.columns = [f'Product{position}' for position in range(1, dataset.shape[1] + 1)]

In [13]:
# Give the row index a name so it is displayed as a labelled header.
dataset.index.names = ['CustomerID']

In [14]:
# Seed the sampler so the preview shows the same five rows on every
# Restart & Run All instead of a different random draw each time.
dataset.sample(5, random_state=42)

Unnamed: 0_level_0,Product1,Product2,Product3,Product4,Product5,Product6,Product7,Product8,Product9,Product10,Product11,Product12,Product13,Product14,Product15,Product16,Product17,Product18,Product19,Product20
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
4923,chocolate,salmon,vegetables mix,green grapes,,,,,,,,,,,,,,,,
3235,eggs,whole wheat rice,,,,,,,,,,,,,,,,,,
5840,french fries,escalope,,,,,,,,,,,,,,,,,,
525,burgers,mineral water,eggs,green tea,pancakes,,,,,,,,,,,,,,,
5596,mint,,,,,,,,,,,,,,,,,,,


## Preprocessing

In [15]:
# Build one list of product names per basket, skipping the NaN padding that
# fills rows shorter than the widest basket. A blanket `astype(str)` would
# turn every NaN into the literal string 'nan', which the miner would then
# treat as a real product present in nearly every transaction.
transactions = [
    [str(product) for product in basket if pd.notna(product)]
    for basket in dataset.values.tolist()
]
len(transactions)

7501

# Training

In [16]:
from apyori import apriori

- We define the minimum support in terms of products that are bought at least 3 times per day. Since the dataset covers one week of transactions, such a product must appear in at least 3 × 7 = 21 transactions; dividing that count by the total number of transactions gives the support threshold.

$$s_{\min} = \frac{3 \times 7}{7{,}501} \approx 0.003$$

- Minimum confidence is chosen by starting at 0.8 as a rule of thumb and halving it until a reasonable result is achieved (0.8 → 0.4 → 0.2, the value used below).
- Minimum lift is set to 3 as a rule of thumb.
- `min_length` and `max_length` are the minimum and maximum number of products in each returned rule; both are set to 2 here, so only product pairs are returned.

In [29]:
# Materialise the mined records straight away. apyori's `apriori` produces
# its records lazily, so the bare return value can be consumed only once;
# storing a list keeps every downstream cell that reads `rules` re-runnable
# without having to re-run this cell first.
# Thresholds: support ~ 3 purchases/day over one week, confidence 0.2,
# lift 3, and item sets limited to pairs.
rules = list(
    apriori(
        transactions=transactions,
        min_support=0.003,
        min_confidence=0.2,
        min_lift=3,
        min_length=2,
        max_length=2,
    )
)

# Visualisation

## Directly from output

In [30]:
# Collect every mined relation record into a list and report how many there are.
# NOTE(review): if `rules` is a one-shot generator, re-running this cell on
# its own yields an empty list -- re-run the apriori cell first.
results = [record for record in rules]
len(results)

9

In [31]:
# Show the raw relation records as returned by the miner, before tabulating them.
results

[RelationRecord(items=frozenset({'light cream', 'chicken'}), support=0.004532728969470737, ordered_statistics=[OrderedStatistic(items_base=frozenset({'light cream'}), items_add=frozenset({'chicken'}), confidence=0.29059829059829057, lift=4.84395061728395)]),
 RelationRecord(items=frozenset({'escalope', 'mushroom cream sauce'}), support=0.005732568990801226, ordered_statistics=[OrderedStatistic(items_base=frozenset({'mushroom cream sauce'}), items_add=frozenset({'escalope'}), confidence=0.3006993006993007, lift=3.790832696715049)]),
 RelationRecord(items=frozenset({'escalope', 'pasta'}), support=0.005865884548726837, ordered_statistics=[OrderedStatistic(items_base=frozenset({'pasta'}), items_add=frozenset({'escalope'}), confidence=0.3728813559322034, lift=4.700811850163794)]),
 RelationRecord(items=frozenset({'honey', 'fromage blanc'}), support=0.003332888948140248, ordered_statistics=[OrderedStatistic(items_base=frozenset({'fromage blanc'}), items_add=frozenset({'honey'}), confidence=0

## Pandas DF

In [8]:
def inspect(results):
    """Flatten relation records into ``(base, add, support)`` rows.

    Each record is indexed positionally: ``record[1]`` is the support and
    ``record[2]`` is the list of ordered statistics, whose entries hold
    ``(items_base, items_add, confidence, lift)``. Only the first ordered
    statistic of every record is used, and only the first item of its base
    and add sets is kept. Confidence and lift (positions 2 and 3 of the
    ordered statistic) are deliberately left out because this view ranks
    item sets by support alone.

    Parameters
    ----------
    results : iterable
        Relation records. The iterable is traversed exactly once, so a
        one-shot iterator works as well as a list.

    Returns
    -------
    list of tuple
        One ``(base, add, support)`` tuple per record, in input order.
        ``base`` or ``add`` is ``None`` when the corresponding item set is
        empty.
    """
    def first_item(items):
        # Same element that ``tuple(items)[0]`` selects, but an empty set
        # gives None instead of raising IndexError.
        return next(iter(items), None)

    rows = []
    for record in results:
        leading_statistic = record[2][0]
        rows.append(
            (
                first_item(leading_statistic[0]),
                first_item(leading_statistic[1]),
                record[1],
            )
        )
    return rows

In [32]:
# Tabulate the (base, add, support) rows produced by inspect().
results_df = pd.DataFrame(
    data=inspect(results),
    columns=['Base', 'Add', 'Support'],
)
results_df

Unnamed: 0,Base,Add,Support
0,light cream,chicken,0.004533
1,mushroom cream sauce,escalope,0.005733
2,pasta,escalope,0.005866
3,fromage blanc,honey,0.003333
4,herb & pepper,ground beef,0.015998
5,tomato sauce,ground beef,0.005333
6,light cream,olive oil,0.0032
7,whole wheat pasta,olive oil,0.007999
8,pasta,shrimp,0.005066


In [22]:
# Rank the product pairs from highest to lowest support.
results_df.sort_values('Support', ascending=False)

Unnamed: 0,Base,Add,Support
4,herb & pepper,ground beef,0.015998
7,whole wheat pasta,olive oil,0.007999
2,pasta,escalope,0.005866
1,mushroom cream sauce,escalope,0.005733
5,tomato sauce,ground beef,0.005333
8,pasta,shrimp,0.005066
0,light cream,chicken,0.004533
3,fromage blanc,honey,0.003333
6,light cream,olive oil,0.0032
