# Association Rules Mining / Basket Analysis

Basket Analysis (BA) is a powerful technique for mining the associations between items within a basket. The result can be communicated as **Frequently Bought Together** items, which are used to:

- Create the bundle of items
- Recommend other items to add to the basket (as "Frequently Bought Together")

BA is a recommendation algorithm, but not a personalized one: it is item-based and does not tailor its recommendations to individual users.

**What makes this hands-on interesting**:

- Instead of using an available public dataset, we will generate synthetic data, so that:
    - We deeply understand the data structure used by this model
    - We can validate whether the output makes sense or not
- Instead of using an existing Python package (such as `mlxtend.frequent_patterns`), we will code the logic of the Apriori algorithm on our own — _it is the best way to understand the model at a deeper level_



In [None]:
import os
import string
import sys
from itertools import combinations
from itertools import groupby
from typing import Dict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytest
import seaborn as sns
from graphviz import Digraph
from numpy.random import choice, randint

# Create basket synthetic data

Structure of data input for basket analysis:

- basket_1: apple, egg, cheese
- basket_2: egg, bread
- basket_3: bread, cheese
- basket_4: apple, orange
- basket_5: egg, cheese

## def create_basket

In [619]:
def create_basket(list_item: list, init_freq: list, n_basket: int) -> Dict:
    """Seed every basket with a single randomly drawn item.

    Args:
        list_item (list): candidate items, e.g. ['beer', 'egg', 'cheese', 'bread']
        init_freq (list): probability of drawing each item, aligned by position
            with ``list_item``, e.g. [0.2, 0.3, 0.1, 0.4]
        n_basket (int): number of baskets to create
    Returns:
        Dict keyed by basket_id (0 .. n_basket - 1) whose value is the first
        item of that basket.
        Example: {0: 'apple', 1: 'egg', 2: 'orange'}
    """
    # One independent numpy.random.choice draw per basket.
    return {
        basket_id: choice(list_item, p=init_freq)
        for basket_id in range(n_basket)
    }

In [None]:
# Smoke test for create_basket ('cheese' has probability 0 but is still a legal item)
baskets_dict_test = create_basket(
    list_item=['beer', 'egg', 'cheese'],
    init_freq=[0.5, 0.5, 0.],
    n_basket=5,
)
# Test 1: every drawn item belongs to the item list
assert all(drawn in ['beer', 'egg', 'cheese'] for drawn in baskets_dict_test.values())
# Test 2: exactly one entry per requested basket
assert len(baskets_dict_test) == 5
print('PASS !!!!!')

## def generate_next_items_by_associated_rule

In [None]:
# associated_rules = {'bread': {'egg': 0.3, 'cheese': 0.2},
#                     'egg': {'apple': 0.2},
#                     'cheese': {'egg': 0.3},
#                     'beer': {'cheese': 0.2, 'wine': 0.5},
#                     'apple': {'orange': 0.2}
#                    }

# synthetic_item_list = ['beer', 'egg', 'cheese', 'bread', 'wine', 'apple', 'orange']
def generate_next_items_by_associated_rule(item, associated_rules, synthetic_item_list):
    """Draw the next basket item given the current (seeding) item.

    When ``item`` has an entry in ``associated_rules``, each associated item is
    drawn with the probability stated in the rule, and the remaining
    probability mass is shared equally among all the other items (``item``
    itself excluded). When ``item`` has no rule, or the rule is unusable
    (negative probabilities, or probabilities summing to more than 1), the next
    item is drawn uniformly among every item other than ``item``.

    Args:
        item: seeding item
        associated_rules: pre-defined rules ``{item: {next_item: probability}}``,
            planted so that we can check whether the model detects them
        synthetic_item_list: full list of items
    Returns:
        next item, as returned by ``numpy.random.choice``
    """
    # Uniform fallback pool: anything but the seeding item.
    fallback_items = [i for i in synthetic_item_list if i != item]

    rule = associated_rules.get(item)
    if not rule:
        return choice(fallback_items)

    # (1) Associated items and (2) their probabilities, taken from the rule of `item`
    associated_items = list(rule.keys())
    associated_items_freqs = [float(freq) for freq in rule.values()]

    # (3) Every other item, excluding the associated items and the item itself
    excluded = {item, *associated_items}
    other_items = [i for i in synthetic_item_list if i not in excluded]

    # Probability mass not claimed by the rule; absorb float rounding noise so
    # a rule summing to exactly 1 does not yield a tiny negative leftover.
    leftover = 1.0 - sum(associated_items_freqs)
    if abs(leftover) < 1e-9:
        leftover = 0.0
    if leftover < 0 or min(associated_items_freqs) < 0:
        # Unusable rule: keep the original best-effort behaviour (uniform draw).
        return choice(fallback_items)

    weights = list(associated_items_freqs)
    if other_items:
        weights += [leftover / len(other_items)] * len(other_items)
    weights = np.asarray(weights, dtype=float)
    total = weights.sum()
    if total <= 0:
        return choice(fallback_items)

    # Normalising makes the vector sum to 1 exactly (and rescales the rule when
    # there are no other items to absorb the leftover mass).
    return choice([*associated_items, *other_items], p=weights / total)


# Backward-compatible alias: other cells call the function under this name.
generate_associated_items_by_rule = generate_next_items_by_associated_rule

In [625]:
# NOTE: relies on `synthetic_item_list` and `associated_rules`, which are
# defined in the "Parameters" cells further down; run those cells first.
print('EXAMPLE')
for item in synthetic_item_list:
    # Call the function under the name it is defined with in this notebook.
    next_item = generate_next_items_by_associated_rule(item, associated_rules, synthetic_item_list)
    print(f'{item} ==> {next_item}')

EXAMPLE
beer ==> wine
egg ==> apple
cheese ==> apple
bread ==> cheese
wine ==> orange
apple ==> beer
orange ==> beer


## Parameters

In [620]:
# Parameters
# Catalogue of items that can appear in a basket.
synthetic_item_list = ['beer', 'egg', 'cheese', 'bread', 'wine', 'apple', 'orange']
# Probability of each item being drawn as the first item of a basket, aligned
# by position with synthetic_item_list (the values sum to 1).
synthetic_init_freq = [0.2, 0.05, 0.1, 0.3, 0.05, 0.2, 0.1]

In [622]:
# Planted association rules: {seeding item: {next item: probability of drawing it next}}.
# 'wine' and 'orange' have no entry, i.e. no planted association.
associated_rules = {'bread': {'egg': 0.3, 'cheese': 0.2},
                    'egg': {'apple': 0.2},
                    'cheese': {'egg': 0.3},
                    'beer': {'cheese': 0.2, 'wine': 0.5},
                    'apple': {'orange': 0.2}
                   }

## Initialize basket

In [621]:
# Draw the first item of 1000 baskets according to the initial item frequencies.
baskets_dict = create_basket(list_item=synthetic_item_list,
             init_freq=synthetic_init_freq,
             n_basket=1000)

In [None]:
# Create synthetic
basket_df = pd.DataFrame({
    'basket_id': list(baskets_dict.keys()),
    'items_list': [[item] for item in baskets_dict.values()],
    # Target size of each basket: 2 or 3 (randint's `high` bound is exclusive).
    # Sized from baskets_dict so it always matches the number of baskets.
    'size_basket': randint(low=2, high=4, size=len(baskets_dict), dtype=int)
})
# Grow every basket by chaining draws: each new item is generated from the
# item appended last. Enough rounds are run to cover the largest target size.
for _ in range(basket_df.size_basket.max()):
    basket_df['items_list'] = basket_df.items_list.apply(
        lambda x: [*x, generate_next_items_by_associated_rule(x[-1], associated_rules, synthetic_item_list)]
    )

# dedup while keeping first-seen order
basket_df['items_list'] = basket_df.items_list.apply(lambda x: list(dict.fromkeys(x)))
# Trim each basket to its target size
basket_df['items_list'] = basket_df.apply(lambda r: r.items_list[:r.size_basket], axis=1)
# Ordered pairs: (A, B) means A was added to the basket before B
basket_df['item_pairs'] = basket_df.items_list.apply(lambda x: list(combinations(x, 2)))

In [None]:
# Display the synthetic baskets (basket_id, items_list, size_basket, item_pairs).
basket_df

In [None]:
# One row per (basket, item): explode each basket's item list.
df_item_line = (basket_df[['basket_id', 'items_list', 'size_basket']]
                .explode('items_list')
                .rename(columns={'items_list': 'item_name'})
               )
# occurence = number of distinct baskets that contain the item.
df_item_support = df_item_line.groupby('item_name').basket_id.nunique().rename('occurence').reset_index()
# support{A} = (# baskets containing A) / (# baskets).
# Dividing by the sum of occurrences instead would shrink every support by the
# average basket size and distort confidence and lift downstream.
n_baskets = basket_df.basket_id.nunique()
df_item_support['item_support'] = df_item_support.occurence / n_baskets
df_item_support

In [None]:
# Count, for every ordered item pair, the number of distinct baskets containing it.
df_item_pairs = (
    basket_df[['basket_id', 'item_pairs']]
    .explode('item_pairs')
    .groupby('item_pairs')
    .basket_id.nunique()
    .rename('occurence_AB')
    .reset_index()
)

In [None]:
# df_item_pairs[df_item_pairs.itemA == df_item_pairs.itemB]

In [None]:
# Keep only the item pairs seen in at least `min_occurence_AB` baskets
min_occurence_AB = 30
print(f'Before: {df_item_pairs.shape}')
df_item_pairs = df_item_pairs.loc[lambda pairs: pairs.occurence_AB >= min_occurence_AB]
print(f'After: {df_item_pairs.shape}')

In [None]:
# Split each (itemA, itemB) tuple into two columns. `assign` builds a new
# frame, so nothing is written into the filtered slice produced by the
# min-occurrence filter (which would trigger SettingWithCopyWarning).
df_item_pairs = df_item_pairs.assign(
    itemA=df_item_pairs.item_pairs.apply(lambda x: x[0]),
    itemB=df_item_pairs.item_pairs.apply(lambda x: x[1]),
)

# Attach the single-item support of A and of B; the join keys are explicit so
# the merges never pick up another shared column by accident.
item_support_lookup = df_item_support[['item_name', 'item_support']]
df_item_pairs = pd.merge(
    df_item_pairs,
    item_support_lookup.rename(columns={'item_name': 'itemA', 'item_support': 'supportA'}),
    on='itemA',
)
df_item_pairs = pd.merge(
    df_item_pairs,
    item_support_lookup.rename(columns={'item_name': 'itemB', 'item_support': 'supportB'}),
    on='itemB',
)

df_item_pairs.head()

In [None]:
# support{A,B} = (# baskets containing the pair A,B) / (# baskets),
# e.g. a pair found in 3 of 5 baskets has support 3/5 = 60%.
# The denominator is the number of baskets, not the sum of the pair counts
# (which, after the min-occurrence filter, would also depend on the filter).
df_item_pairs['support_AB'] = df_item_pairs.occurence_AB / basket_df.basket_id.nunique()
df_item_pairs.sort_values(by='support_AB', ascending=False).head()

In [None]:
# confidence{A->B} = support{A,B} / support{A}
# confidence{B->A} = support{A,B} / support{B}
df_item_pairs['confidence_AB'] = df_item_pairs.support_AB.div(df_item_pairs.supportA)
df_item_pairs['confidence_BA'] = df_item_pairs.support_AB.div(df_item_pairs.supportB)
df_item_pairs.sort_values(by='support_AB', ascending=False).head()

In [None]:
# lift{A,B} = lift{B,A} = support{A,B} / (support{A} * support{B})
df_item_pairs['lift'] = df_item_pairs.support_AB.div(
    df_item_pairs.supportA.mul(df_item_pairs.supportB)
)
df_item_pairs.sort_values(by='support_AB', ascending=False).head()

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(13, 5))

xs = ['lift', 'support_AB', 'confidence_AB']
colors = ['skyblue', 'olive', 'gold']

# One histogram (with KDE overlay) per metric, left to right
for ax, metric, color in zip(axs, xs, colors):
    sns.histplot(data=df_item_pairs, x=metric, kde=True, color=color, ax=ax)

In [None]:
# Choice of support/confidence thresholds
# Source: https://www.kaggle.com/code/xvivancos/market-basket-analysis/report
# Lowering the support cut-off keeps more rules at a given confidence level
# NOTE(review): `rules` is not read by any other visible cell — confirm it is still needed
rules = df_item_pairs#[df_item_pairs.support_AB >= 0.01]
def plot_numrules_confidence_lvl(df_item_pairs, min_support, ax):
    """Plot how many rules survive each confidence cut-off.

    Rules whose ``support_AB`` is below ``min_support`` are discarded first.
    Then, for every threshold in 0.0, 0.1, ..., 0.9, the rules whose
    ``confidence_AB`` is strictly above the threshold are counted.

    Args:
        df_item_pairs: DataFrame with ``support_AB`` and ``confidence_AB`` columns
        min_support: minimum ``support_AB`` for a rule to be kept
        ax: axes to draw on (``plot``, ``set_title`` and ``set_xlabel`` are called)
    Returns:
        The same ``ax``, after drawing
    """
    supported = df_item_pairs[df_item_pairs.support_AB >= min_support]
    confidence_levels = np.arange(0, 1, 0.1)
    # Number of supported rules strictly above each confidence threshold
    num_rules = [
        int((supported.confidence_AB > level).sum())
        for level in confidence_levels
    ]

    ax.plot(confidence_levels, num_rules, 'o-', color='black')
    ax.set_title(f'Cut-off: support_AB >= {min_support}')
    ax.set_xlabel('Confidence level')
    return ax

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(13, 5))
min_support = [0.01, 0.03, 0.05]
# One panel per support cut-off
for ax, cutoff in zip(axs, min_support):
    plot_numrules_confidence_lvl(df_item_pairs, min_support=cutoff, ax=ax)
# Only the left-most panel carries the y-axis label
axs[0].set_ylabel('Number of rules')

In [None]:
# Successive filters: minimum support first, then minimum confidence A->B
df_item_pairs_flr_support = df_item_pairs.loc[lambda pairs: pairs.support_AB >= 0.01]
df_item_pairs_flr_support_confi = df_item_pairs_flr_support.loc[
    lambda pairs: pairs.confidence_AB >= 0.2
]
# Shapes before and after each filter, one per line
print(
    df_item_pairs.shape,
    df_item_pairs_flr_support.shape,
    df_item_pairs_flr_support_confi.shape,
    sep='\n',
)

In [None]:
# lift = 1 implies no relationship between A and B.
# (ie: A and B occur together only by chance)
# Show the first pairs whose lift lies within [0.9, 1.1].
df_item_pairs[df_item_pairs.lift.between(0.9, 1.1)].head()

In [None]:
# Shape of the support- and confidence-filtered rules whose lift lies within [0.9, 1.1].
df_item_pairs_flr_support_confi[df_item_pairs_flr_support_confi.lift.between(0.9, 1.1)].shape

In [None]:
# lift > 1 implies that there is a positive relationship between A and B.
# (ie: A and B occur together more often than random)
# Show the filtered rules with the highest lift.
df_item_pairs_flr_support_confi.sort_values(by='lift', ascending=False).head()

In [None]:
# Show the filtered rules with the lowest lift (tail of the descending sort).
df_item_pairs_flr_support_confi.sort_values(by='lift', ascending=False).tail()

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(13, 5))

xs = ['lift', 'support_AB', 'confidence_AB']
colors = ['skyblue', 'olive', 'gold']

tit1 = 'without filter'
tit2 = 'min-support>=0.01'
tit3 = tit2 + ' ' + 'confi-lvl>=0.2'

# Lift distribution at each filtering stage: (data, colour, title), left to right
lift_panels = [
    (df_item_pairs, 'gold', tit1),
    (df_item_pairs_flr_support, 'olive', tit2),
    (df_item_pairs_flr_support_confi, 'skyblue', tit3),
]
for ax, (frame, panel_color, panel_title) in zip(axs, lift_panels):
    sns.histplot(data=frame, x='lift', kde=True, color=panel_color, ax=ax).set(title=panel_title)

In [None]:
# Final rule table: support- and confidence-filtered pairs, highest lift first.
df_apriori = df_item_pairs_flr_support_confi.sort_values(by='lift', ascending=False)
df_apriori

In [None]:
# itemA x itemB matrix of mean lift (rows: itemA, columns: itemB)
mean_lift_by_pair = df_apriori.groupby(['itemA', 'itemB']).lift.mean()
df_heatmap = mean_lift_by_pair.unstack('itemB')

plt.figure(figsize=(7, 5))
sns.heatmap(
    df_heatmap,
    annot=True,
    fmt='.1f',
    cmap=sns.cubehelix_palette(as_cmap=True),
)
plt.title('Apriori output: Lift by item pair');

In [None]:
# Keep only the rules with lift of at least 1.3.
df_apriori = df_apriori[df_apriori.lift >= 1.3]

In [None]:
# Same heatmap, rebuilt from the current content of df_apriori
mean_lift_by_pair = df_apriori.groupby(['itemA', 'itemB']).lift.mean()
df_heatmap = mean_lift_by_pair.unstack('itemB')

plt.figure(figsize=(7, 5))
sns.heatmap(
    df_heatmap,
    annot=True,
    fmt='.1f',
    cmap=sns.cubehelix_palette(as_cmap=True),
)
plt.title('Apriori output: Lift by item pair');

In [None]:
# Create nodes_df
associated_items_ls = pd.concat([df_apriori.itemA, df_apriori.itemB]).unique()
nodes_df = df_item_support[df_item_support.item_name.isin(associated_items_ls)]

alphanum_nodeid = list(string.ascii_lowercase) + [str(i) for i in range(9)]
nodes_df['node_id'] = [alphanum_nodeid[i] for i in range(len(nodes_df))]
nodes_df

In [None]:
# Add node_id to df_apriori
nodes_id_dict = {r.item_name: r.node_id for _, r in nodes_df.iterrows()}
df_apriori['from_node_id'] = df_apriori.itemA.apply(lambda x: nodes_id_dict[x])
df_apriori['to_node_id'] = df_apriori.itemB.apply(lambda x: nodes_id_dict[x])
df_apriori['edge'] = df_apriori.from_node_id + df_apriori.to_node_id
df_apriori

In [None]:
# Multiplier turning an item's support into its node width
scaled_size = 4
gra = Digraph(
    format='pdf',
    graph_attr={'beautify': 'true', 'margin': '0.5'},
    node_attr={'shape': 'circle', 'fixedsize': 'true', 'style': 'filled', 'fontsize': '10'},
    edge_attr={'color': 'lightgray'},
)

# One node per item, labelled with the lower-cased item name
for node in nodes_df.itertuples(index=False):
    gra.node(node.node_id, node.item_name.lower(), width=str(node.item_support * scaled_size))

# Each edge is a two-character string: tail node id + head node id
gra.edges(df_apriori.edge.tolist())
gra