In [15]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [16]:
# Mount Google Drive at /content/drive (Colab-only) so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
# The CSV has no header row, so every line is read as data and the single column is named "Products".
df = pd.read_csv(
    "/content/drive/MyDrive/0.Latest_DS_Course/USL/AssociationRules/Basic/GroceryStoreDataSet.csv",
    header=None,
    names=["Products"],
)

In [18]:
# Preview the first five transactions.
df.head(n=5)

Unnamed: 0,Products
0,"MILK,BREAD,BISCUIT"
1,"BREAD,MILK,BISCUIT,CORNFLAKES"
2,"BREAD,TEA,BOURNVITA"
3,"JAM,MAGGI,BREAD,MILK"
4,"MAGGI,TEA,BISCUIT"


In [19]:
# (rows, columns) of the loaded frame.
df.shape


(20, 1)

In [20]:
# Each row stores one basket as a comma-separated string; break it into a list of item names.
product_list = [basket.split(',') for basket in df["Products"]]
product_list

[['MILK', 'BREAD', 'BISCUIT'],
 ['BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['JAM', 'MAGGI', 'BREAD', 'MILK'],
 ['MAGGI', 'TEA', 'BISCUIT'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['MAGGI', 'TEA', 'CORNFLAKES'],
 ['MAGGI', 'BREAD', 'TEA', 'BISCUIT'],
 ['JAM', 'MAGGI', 'BREAD', 'TEA'],
 ['BREAD', 'MILK'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'COCK'],
 ['BREAD', 'SUGER', 'BISCUIT'],
 ['COFFEE', 'SUGER', 'CORNFLAKES'],
 ['BREAD', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['TEA', 'MILK', 'COFFEE', 'CORNFLAKES']]

In [21]:
# Confirm the baskets are held in a plain Python list.
type(product_list)

list

### In association analysis, the input values must be either 1s and 0s or Booleans (True / False).

In [22]:
from mlxtend.preprocessing import TransactionEncoder

In [23]:
# One-hot encode the baskets: one boolean column per distinct product, one row per transaction.
temp = TransactionEncoder()
temp.fit(product_list)                      # learn the set of distinct product names
temp_df = temp.transform(product_list)      # boolean array: transaction x product
new_df = pd.DataFrame(data=temp_df, columns=temp.columns_)
new_df

Unnamed: 0,BISCUIT,BOURNVITA,BREAD,COCK,COFFEE,CORNFLAKES,JAM,MAGGI,MILK,SUGER,TEA
0,True,False,True,False,False,False,False,False,True,False,False
1,True,False,True,False,False,True,False,False,True,False,False
2,False,True,True,False,False,False,False,False,False,False,True
3,False,False,True,False,False,False,True,True,True,False,False
4,True,False,False,False,False,False,False,True,False,False,True
5,False,True,True,False,False,False,False,False,False,False,True
6,False,False,False,False,False,True,False,True,False,False,True
7,True,False,True,False,False,False,False,True,False,False,True
8,False,False,True,False,False,False,True,True,False,False,True
9,False,False,True,False,False,False,False,False,True,False,False


The DataFrame is now suitable for association analysis. The next step is to calculate and interpret the support and confidence values.

In [30]:
# Keep only itemsets that appear in at least 20% of the transactions.
support_values = apriori(
    new_df,
    min_support=0.20,
    use_colnames=True,
    verbose=1,
)
support_values

Processing 72 combinations | Sampling itemset size 2Processing 42 combinations | Sampling itemset size 3


Unnamed: 0,support,itemsets
0,0.35,(BISCUIT)
1,0.2,(BOURNVITA)
2,0.65,(BREAD)
3,0.4,(COFFEE)
4,0.3,(CORNFLAKES)
5,0.25,(MAGGI)
6,0.25,(MILK)
7,0.3,(SUGER)
8,0.35,(TEA)
9,0.2,"(BISCUIT, BREAD)"


### **To better understand how support is calculated, we add a `support_calc` column that shows the calculation (count / total transactions)**

In [25]:
# Total number of transactions (one row per basket in the one-hot encoded frame)
total_txns = len(new_df)

# Add a calculation column that spells out each support as "count / total = support".
# Support is stored as a float (count / total), so multiplying back by the total can land
# just below the true integer count (e.g. 0.29 * 100 == 28.999999999999996). Truncating
# with int() alone would then show a count that is one too small, so round to the nearest
# integer first.
support_values['support_calc'] = support_values['support'].apply(
    lambda s: f"{int(round(s * total_txns))} / {total_txns} = {round(s, 2)}"
)

support_values

Unnamed: 0,support,itemsets,support_calc
0,0.35,(BISCUIT),7 / 20 = 0.35
1,0.2,(BOURNVITA),4 / 20 = 0.2
2,0.65,(BREAD),13 / 20 = 0.65
3,0.4,(COFFEE),8 / 20 = 0.4
4,0.3,(CORNFLAKES),6 / 20 = 0.3
5,0.25,(MAGGI),5 / 20 = 0.25
6,0.25,(MILK),5 / 20 = 0.25
7,0.3,(SUGER),6 / 20 = 0.3
8,0.35,(TEA),7 / 20 = 0.35
9,0.2,"(BISCUIT, BREAD)",4 / 20 = 0.2


## We rank the itemsets by support in decreasing order


In [26]:

# Most frequent itemsets first (display only; the result is not assigned back).
support_values.sort_values("support", ascending=False)


Unnamed: 0,support,itemsets,support_calc
2,0.65,(BREAD),13 / 20 = 0.65
3,0.4,(COFFEE),8 / 20 = 0.4
0,0.35,(BISCUIT),7 / 20 = 0.35
8,0.35,(TEA),7 / 20 = 0.35
7,0.3,(SUGER),6 / 20 = 0.3
4,0.3,(CORNFLAKES),6 / 20 = 0.3
6,0.25,(MILK),5 / 20 = 0.25
5,0.25,(MAGGI),5 / 20 = 0.25
1,0.2,(BOURNVITA),4 / 20 = 0.2
9,0.2,"(BISCUIT, BREAD)",4 / 20 = 0.2


We obtain the **confidence** and **lift** values using the `association_rules` function.


###  2. **Confidence**
**When A is bought, how often is B also bought?**  
It shows the **strength of implication**.

>  *"Out of those who bought Bread, how many also bought Butter?"*

 **Formula:**  
$$
\text{Confidence}(A \rightarrow B) = \frac{\text{Support}(A \cup B)}{\text{Support}(A)}
$$

---

In [27]:
# Build rules with confidence of at least 0.5, then list the most confident first
# (rules with equal confidence are ordered by lift, highest first).
rules = (
    association_rules(support_values, metric="confidence", min_threshold=0.5)
    .sort_values(by=["confidence", "lift"], ascending=False)
)

rules


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
8,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,1.0,0.1125,3.25,0.75,0.5,0.692308,0.685714
1,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,1.0,0.0375,1.75,0.25,0.285714,0.428571,0.553846
5,(CORNFLAKES),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,1.0,0.08,1.8,0.571429,0.4,0.444444,0.583333
7,(SUGER),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,1.0,0.08,1.8,0.571429,0.4,0.444444,0.583333
2,(SUGER),(BREAD),0.3,0.65,0.2,0.666667,1.025641,1.0,0.005,1.05,0.035714,0.266667,0.047619,0.487179
9,(TEA),(MAGGI),0.35,0.25,0.2,0.571429,2.285714,1.0,0.1125,1.75,0.865385,0.5,0.428571,0.685714
0,(BISCUIT),(BREAD),0.35,0.65,0.2,0.571429,0.879121,1.0,-0.0275,0.816667,-0.174603,0.25,-0.22449,0.43956
3,(TEA),(BREAD),0.35,0.65,0.2,0.571429,0.879121,1.0,-0.0275,0.816667,-0.174603,0.25,-0.22449,0.43956
4,(COFFEE),(CORNFLAKES),0.4,0.3,0.2,0.5,1.666667,1.0,0.08,1.4,0.666667,0.4,0.285714,0.583333
6,(COFFEE),(SUGER),0.4,0.3,0.2,0.5,1.666667,1.0,0.08,1.4,0.666667,0.4,0.285714,0.583333


## **Interpreting Results**
---

### **1. Support (0.2)**

- **Interpretation**:  
  "The rate of seeing MAGGI and TEA together in all purchases is 0.2."  
  This means that 20% of the total transactions contain both **MAGGI** and **TEA**. This is the **joint probability** that both items appear together in the same transaction.

---

### **2. Confidence (0.8 or 80%)**

- **Interpretation**:  
  "80% of customers who buy MAGGI also buy TEA."  
  This is the **conditional probability** of TEA being purchased given that MAGGI has been purchased. Specifically, out of the transactions that contain MAGGI, 80% of them also contain TEA. This shows the **reliability** of the rule "{MAGGI} → {TEA}".

---

### **3. Lift (2.29)**

- **Interpretation**:  
  "TEA is 2.29 times as likely to be purchased in transactions that contain MAGGI."  
  This means that **TEA** is **2.29 times as likely** to be purchased when MAGGI is bought as it would be if the two were independent (lift = confidence / support(TEA) = 0.8 / 0.35 ≈ 2.29).  
  - **Lift > 1** indicates a **positive association**.  
  - Lift here shows that the presence of **MAGGI** makes **TEA** purchases more likely — specifically, it **boosts the likelihood** of buying TEA by a factor of **2.29** compared to what you would expect if the two were unrelated.

---

### In Summary:
For the rule {MAGGI} → {TEA}, the three metrics read as follows:

- **Support**: Measures the overall frequency of the items together.
- **Confidence**: Measures how likely it is that a customer will buy TEA if they’ve already bought MAGGI.
- **Lift**: Measures how much more likely it is for customers to buy TEA when they buy MAGGI, compared to if the two were independent.

---


# **In real-world scenarios, lift is usually a more informative metric than confidence because it accounts for how common the consequent already is, so prefer filtering results by lift rather than by confidence alone**

In [28]:
# Filter on lift with a threshold of 1.0: a lift of 1 means the antecedent and consequent
# co-occur exactly as often as expected if they were independent, so this keeps only rules
# that are not negatively associated. A threshold of 0.5 (as used for confidence) would also
# admit pairs that appear together less often than chance.
rules = association_rules(support_values, metric="lift", min_threshold=1.0)

# Strongest associations first; confidence orders rules that share the same lift.
rules = rules.sort_values(['lift', 'confidence'], ascending=[False, False])

rules


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
12,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,1.0,0.1125,3.25,0.75,0.5,0.692308,0.685714
13,(TEA),(MAGGI),0.35,0.25,0.2,0.571429,2.285714,1.0,0.1125,1.75,0.865385,0.5,0.428571,0.685714
9,(CORNFLAKES),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,1.0,0.08,1.8,0.571429,0.4,0.444444,0.583333
11,(SUGER),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,1.0,0.08,1.8,0.571429,0.4,0.444444,0.583333
8,(COFFEE),(CORNFLAKES),0.4,0.3,0.2,0.5,1.666667,1.0,0.08,1.4,0.666667,0.4,0.285714,0.583333
10,(COFFEE),(SUGER),0.4,0.3,0.2,0.5,1.666667,1.0,0.08,1.4,0.666667,0.4,0.285714,0.583333
2,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,1.0,0.0375,1.75,0.25,0.285714,0.428571,0.553846
3,(BREAD),(MILK),0.65,0.25,0.2,0.307692,1.230769,1.0,0.0375,1.083333,0.535714,0.285714,0.076923,0.553846
4,(SUGER),(BREAD),0.3,0.65,0.2,0.666667,1.025641,1.0,0.005,1.05,0.035714,0.266667,0.047619,0.487179
5,(BREAD),(SUGER),0.65,0.3,0.2,0.307692,1.025641,1.0,0.005,1.011111,0.071429,0.266667,0.010989,0.487179
