In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

!pip install mlxtend==0.23.1



# Association Rule for Store Dataset

In this case study, we will explore how association rules can be used to analyze the items that are usually purchased together.

You can refer to these articles to learn about the Apriori algorithm and association rules:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use a dataset of the transactions in a certain store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [132]:
# Load the data set and show the first five transactions.
# The file is comma-separated, so pd.read_csv (default separator ',') is used
# instead of pd.read_table with an explicit delimiter.
DATA_URL = 'https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv'

df = pd.read_csv(DATA_URL)
df.head(5)


Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of products that have been purchased


In [133]:
# Collect the distinct products across every item column, not only column '0',
# so a product that never appears in the first slot of a basket is still listed.
# Empty slots (NaN) are dropped before taking the unique values.
products = pd.Series(df.to_numpy().ravel()).dropna().unique()
print(products)

['Bread' 'Cheese' 'Meat' 'Eggs' 'Wine' 'Bagel' 'Pencil' 'Diaper' 'Milk']


## Preprocess Data

In this step, we will transform our dataset into a one-hot encoding based on the purchased products.

In [144]:
# Lookup table used for the one-hot encoding: every product maps to 1,
# while an empty basket slot (NaN) maps to 0.
encode_dict = {
    'Milk': 1, 'Bagel': 1,
    np.nan: 0,
    'Wine': 1, 'Cheese': 1, 'Diaper': 1,
    'Meat': 1, 'Eggs': 1, 'Bread': 1, 'Pencil': 1,
}

# NaN cannot be used as a column name, so keep only the real product keys.
filtered_dict = {}
for product, flag in encode_dict.items():
    if not pd.isna(product):
        filtered_dict[product] = flag

# Single-row frame with one column per product.
df_encoded = pd.DataFrame(data=[filtered_dict])

# Show the resulting one-row frame as plain text.
print(df_encoded)



   Milk  Bagel  Wine  Cheese  Diaper  Meat  Eggs  Bread  Pencil
0     1      1     1       1       1     1     1      1       1


In [145]:
# Long format: stack the basket columns, discard the column level of the
# resulting index, and label the two remaining columns 'Row' and 'Item'.
melted_df = (
    df.stack()
    .droplevel(1)
    .reset_index()
    .set_axis(['Row', 'Item'], axis=1)
)

# Look up each item's flag in encode_dict; anything unmapped becomes 0.
encoded_flags = melted_df['Item'].map(encode_dict).fillna(0).astype(int)
melted_df = melted_df.assign(Encoded=encoded_flags)

# Wide format again: one row per transaction, one column per item.
# 'max' collapses repeated (Row, Item) pairs; absent pairs are filled with 0.
# The 'Row' index is then replaced by a fresh 0..n-1 index.
df_encoded = pd.pivot_table(
    melted_df,
    index='Row',
    columns='Item',
    values='Encoded',
    aggfunc='max',
    fill_value=0,
).reset_index(drop=True)

# Preview the one-hot encoded transactions.
df_encoded.head()


Item,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine
0,0,1,1,1,1,1,0,1,1
1,0,1,1,1,0,1,1,1,1
2,0,0,1,0,1,1,1,0,1
3,0,0,1,0,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1


In [154]:
# Run a most-frequent-value imputation over the one-hot frame and wrap the
# result in a DataFrame that reuses df_encoded's column labels.
# NOTE(review): df_encoded was pivoted with fill_value=0, so there appear to be
# no missing cells left for the imputer to fill; confirm this step is needed.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')
imputed_values = imputer.fit_transform(df_encoded)
df_imputed = pd.DataFrame(imputed_values, columns=df_encoded.columns)

# Preview the first ten rows.
df_imputed.head(10)

Item,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine
0,0,1,1,1,1,1,0,1,1
1,0,1,1,1,0,1,1,1,1
2,0,0,1,0,1,1,1,0,1
3,0,0,1,0,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1
5,1,1,0,1,1,0,1,1,1
6,0,0,1,0,1,0,0,1,1
7,1,1,0,1,0,0,1,1,0
8,0,1,1,1,1,0,1,0,1
9,1,0,1,1,1,1,0,1,1


Since the encoded dataframe contains an empty (NaN) column, we will drop that column, or alternatively select all columns other than the first one.

## Apriori Algorithm

We will use the Apriori algorithm to determine the frequently purchased products.
For this case study, we will use min_support=0.2.

In [155]:
# Mine the frequent itemsets with the Apriori algorithm.
from mlxtend.frequent_patterns import apriori, association_rules

# Support threshold: the minimum fraction of transactions an itemset must
# appear in to be kept.
MIN_SUPPORT = 0.2

# Pass a boolean one-hot frame: mlxtend warns about non-bool DataFrames, and
# the 0/1 integers convert losslessly. df_encoded itself is left unchanged.
frequent_itemsets = apriori(df_encoded.astype(bool), min_support=MIN_SUPPORT, use_colnames=True)

# Number of items in each frequent itemset.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# Show the first 33 frequent itemsets.
frequent_itemsets.head(33)



Unnamed: 0,support,itemsets,length
0,0.425397,(Bagel),1
1,0.504762,(Bread),1
2,0.501587,(Cheese),1
3,0.406349,(Diaper),1
4,0.438095,(Eggs),1
5,0.47619,(Meat),1
6,0.501587,(Milk),1
7,0.361905,(Pencil),1
8,0.438095,(Wine),1
9,0.279365,"(Bread, Bagel)",2


Then we will generate association rules from the frequent itemsets based on the confidence level, with threshold=0.6.

In [156]:
# Generate association rules from the frequent itemsets, keeping only rules
# whose confidence is at least 0.6. The filter metric is "confidence" to match
# the stated threshold; the previous "lift" filter applied 0.6 to the wrong metric.
MIN_CONFIDENCE = 0.6
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=MIN_CONFIDENCE)

# Strongest rules first: highest confidence, ties broken by highest lift.
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
52,"(Milk, Meat)",(Cheese),0.244444,0.501587,0.203175,0.831169,1.657077,0.080564,2.952137,0.524816
46,"(Eggs, Meat)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,0.082116,2.616667,0.518717
44,"(Cheese, Eggs)",(Meat),0.298413,0.47619,0.215873,0.723404,1.519149,0.073772,1.893773,0.487091
19,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203,0.469167
21,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754,0.500891


Provide an explanation of __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__, and __conviction__, and an interpretation of the case above (please use a text section).

### Antecedents
Antecedents are the items or products on the left-hand side of an association rule. They represent the "if" condition in the rule. For example, in the rule {Bread} → {Butter}, the antecedent is {Bread}.

### Consequents
Consequents are the items or products on the right-hand side of an association rule. They represent the "then" outcome. Using the same example, in {Bread} → {Butter}, the consequent is {Butter}.

### Support
Support is the proportion of transactions containing both the antecedent and the consequent. It reflects how often the rule occurs in the dataset. For example, if {Bread, Butter} appears in 20 out of 100 transactions, the support is $20/100 = 0.2$.

###### Antecedent Support
Antecedent support measures how often the antecedent appears in the dataset. It is calculated as the proportion of transactions containing the antecedent. For instance, if {Bread} appears in 30 out of 100 transactions, the antecedent support is $30/100 = 0.3$.

###### Consequent Support
Consequent support measures how often the consequent appears in the dataset. Similar to antecedent support, it is the proportion of transactions containing the consequent. For example, if {Butter} appears in 40 out of 100 transactions, the consequent support is $40/100 = 0.4$.

### Confidence
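The confidence of a rule A->C is the probability of seeing the consequent in a transaction given that it also contains the antecedent. For example, the rule (Milk, Meat) -> (Cheese) above has a confidence of about 0.83, meaning roughly 83% of the transactions that contain both Milk and Meat also contain Cheese.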

### Lift
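The lift metric is commonly used to measure how much more often the antecedent and consequent of a rule A->C occur together than we would expect if they were statistically independent. For example, the rule (Milk, Meat) -> (Cheese) above has a lift of about 1.66, so these items are bought together roughly 1.66 times more often than independence would predict.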

### Leverage
Leverage is a measure that indicates how much more likely two items (antecedent and consequent) appear together than if they were independent. A positive leverage value suggests that the items occur together more often than expected, while a negative value means they occur less frequently than expected.

### Conviction
Conviction measures the degree to which an association rule holds true compared to random chance. It considers how likely the rule is to be correct, based on the absence of the consequent when the antecedent is present. Higher conviction means the rule is more reliable.

Source
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/
https://medium.com/analytics-vidhya/apriori-algorithm-in-association-rule-learning-9287fe17e944
