In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

!pip install mlxtend==0.23.1

Collecting mlxtend==0.23.1
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mlxtend
  Attempting uninstall: mlxtend
    Found existing installation: mlxtend 0.23.3
    Uninstalling mlxtend-0.23.3:
      Successfully uninstalled mlxtend-0.23.3
Successfully installed mlxtend-0.23.1


# Association Rule for Store Dataset

In this case study, we will explore how association rules can be used to analyze the items that are usually purchased together.

You can refer to these articles to learn more about the Apriori algorithm and association rules:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use a dataset of transactions from a certain store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [2]:
# Load the retail transactions dataset and show the first five transactions.
# Each row is one transaction; shorter transactions leave their trailing cells empty (NaN).
url = "https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv"
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of products that have been purchased


In [3]:
# Flatten the transaction table into a single Series and keep each distinct
# product name once. Requires `df` from the loading cell.
flattened_items = df.stack()
unique_products = {item for item in flattened_items}
print(unique_products)

{'Bread', 'Pencil', 'Cheese', 'Meat', 'Milk', 'Bagel', 'Diaper', 'Wine', 'Eggs'}


## Preprocess Data

In this step, we will transform our dataset into a one-hot encoding of the purchased products.

In [9]:
# Build a one-hot encoding for a single example transaction.
# Requires `df` and `unique_products` from the cells above.

# Start with every known product flagged as "not purchased" (0).
one_hot_encoded = dict.fromkeys(unique_products, 0)

# Example transaction: the first row of the DataFrame, with empty (NaN) cells removed.
transaction = df.iloc[0].dropna().tolist()

# Flip the flag to 1 for each product that appears in the transaction.
one_hot_encoded.update(
    {item: 1 for item in transaction if item in one_hot_encoded}
)

# Display the encoding vertically, one "product: flag" pair per line.
for product, encoded_value in one_hot_encoded.items():
    print(f"{product}: {encoded_value}")

Bread: 1
Pencil: 1
Cheese: 1
Meat: 1
Milk: 0
Bagel: 0
Diaper: 1
Wine: 1
Eggs: 1


In [21]:
import pandas as pd

# One-hot encode every transaction in `df` (loaded in the data-loading cell).

# Column order of the encoded table: every distinct product in `df`, sorted
# so the layout is the same on every run.
unique_products = sorted(set(df.stack()))

# One dict per transaction, mapping every product to 1 (purchased) or 0 (not purchased).
all_rows = []
for row_label, basket in df.iterrows():
    # Empty (NaN) cells only pad shorter transactions; remove them before testing membership.
    purchased = set(basket.dropna().tolist())
    all_rows.append(
        {product: int(product in purchased) for product in unique_products}
    )

# Assemble the encoded table: one row per transaction, one column per product.
one_hot_df = pd.DataFrame(all_rows, columns=unique_products)

# Leave the frame as the cell's last expression so the notebook renders it.
one_hot_df


Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine
0,0,1,1,1,1,1,0,1,1
1,0,1,1,1,0,1,1,1,1
2,0,0,1,0,1,1,1,0,1
3,0,0,1,0,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...
310,0,1,1,0,1,0,0,0,0
311,0,0,0,0,0,1,1,1,0
312,0,1,1,1,1,1,0,1,1
313,0,0,1,0,0,1,0,0,0


In [19]:
# If the encoded DataFrame contains an empty (all-NaN) column, drop it before
# previewing; alternatively the wanted columns could be selected by index.
import pandas as pd

# Requires `df`, the raw transaction table loaded earlier in the notebook.

# Sorted list of distinct products; it fixes the column order of the encoded table.
unique_products = sorted(set(df.stack()))

# Encode each transaction as a {product: 0/1} mapping.
all_rows = []
for row_label, row in df.iterrows():
    # Every product starts at 0 for this transaction.
    flags = dict.fromkeys(unique_products, 0)
    # Mark the products that appear in the row; NaN cells are removed first.
    flags.update((item, 1) for item in row.dropna().tolist() if item in flags)
    all_rows.append(flags)

# Build the encoded table from the per-transaction mappings.
one_hot_df = pd.DataFrame(all_rows, columns=unique_products)

# Drop any column whose values are all NaN, then preview the first five rows.
# `one_hot_df` itself is not modified; only the displayed copy is filtered.
one_hot_df.dropna(axis=1, how='all').head()


Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine
0,0,1,1,1,1,1,0,1,1
1,0,1,1,1,0,1,1,1,1
2,0,0,1,0,1,1,1,0,1
3,0,0,1,0,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1


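If the encoded DataFrame contains an empty (all-NaN) column, we can drop it with `dropna` or select all columns other than the first one. With the encoding above, missing values are removed from each transaction before encoding, so no such column appears in the output.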

## Apriori Algorithm

We will use the Apriori algorithm to determine the frequently purchased products.
For this case study, we will use min_support=0.2.

In [24]:
# Find the frequent itemsets using a minimum support threshold.
# Requires `one_hot_df` (one row per transaction, one 0/1 column per product).
from mlxtend.frequent_patterns import apriori, association_rules

# Apply the Apriori algorithm with min_support=0.2.
# mlxtend expects a boolean DataFrame: passing the integer 0/1 frame triggers a
# DeprecationWarning about slower computation, so cast to bool first. The
# resulting supports are the same because 0 -> False and 1 -> True.
frequent_itemsets = apriori(one_hot_df.astype(bool), min_support=0.2, use_colnames=True)

# Sort the results by support in descending order (optional, for presentation)
# and renumber the rows so the index matches the displayed ranking.
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False).reset_index(drop=True)

# Display the frequent itemsets DataFrame.
frequent_itemsets


  and should_run_async(code)


Unnamed: 0,support,itemsets
0,0.504762,(Bread)
1,0.501587,(Cheese)
2,0.501587,(Milk)
3,0.47619,(Meat)
4,0.438095,(Eggs)
5,0.438095,(Wine)
6,0.425397,(Bagel)
7,0.406349,(Diaper)
8,0.361905,(Pencil)
9,0.32381,"(Meat, Cheese)"


Then we will generate association rules from the frequent itemsets, using confidence as the metric with a threshold of 0.6.

In [25]:
from mlxtend.frequent_patterns import apriori, association_rules

# Requires `one_hot_df`, the one-hot encoded transaction DataFrame (0/1 per product).

# Apply the Apriori algorithm with min_support=0.2.
# mlxtend expects a boolean DataFrame: passing the integer 0/1 frame triggers a
# DeprecationWarning about slower computation, so cast to bool first. The
# resulting supports are the same because 0 -> False and 1 -> True.
frequent_itemsets = apriori(one_hot_df.astype(bool), min_support=0.2, use_colnames=True)

# Generate association rules, keeping only rules whose confidence is at least 0.6.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

# Sort the rules from highest to lowest confidence and renumber the rows for clarity.
rules = rules.sort_values(by="confidence", ascending=False).reset_index(drop=True)

# Display the generated association rules.
rules

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,"(Meat, Milk)",(Cheese),0.244444,0.501587,0.203175,0.831169,1.657077,0.080564,2.952137,0.524816
1,"(Meat, Eggs)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,0.082116,2.616667,0.518717
2,"(Cheese, Eggs)",(Meat),0.298413,0.47619,0.215873,0.723404,1.519149,0.073772,1.893773,0.487091
3,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203,0.469167
4,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754,0.500891
5,"(Meat, Cheese)",(Eggs),0.32381,0.438095,0.215873,0.666667,1.521739,0.074014,1.685714,0.507042
6,"(Cheese, Milk)",(Meat),0.304762,0.47619,0.203175,0.666667,1.4,0.05805,1.571429,0.410959
7,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265,0.402687
8,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891,0.526414
9,"(Meat, Cheese)",(Milk),0.32381,0.501587,0.203175,0.627451,1.250931,0.040756,1.337845,0.296655


Provide an explanation of __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__, __conviction__, and __zhangs_metric__, together with an interpretation of the results from the case above (please use a text section).