In [1]:
# Import necessary libraries
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Data Preprocessing 
# Load the dataset
df_raw = pd.read_excel("Online retail.xlsx", header=None)
df_raw

Unnamed: 0,0
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."
...,...
7496,"butter,light mayo,fresh bread"
7497,"burgers,frozen vegetables,eggs,french fries,ma..."
7498,chicken
7499,"escalope,green tea"


In [3]:
df_raw.info

<bound method DataFrame.info of                                                       0
0     shrimp,almonds,avocado,vegetables mix,green gr...
1                                burgers,meatballs,eggs
2                                               chutney
3                                        turkey,avocado
4     mineral water,milk,energy bar,whole wheat rice...
...                                                 ...
7496                      butter,light mayo,fresh bread
7497  burgers,frozen vegetables,eggs,french fries,ma...
7498                                            chicken
7499                                 escalope,green tea
7500    eggs,frozen smoothie,yogurt cake,low fat yogurt

[7501 rows x 1 columns]>

In [4]:
df_raw.head()

Unnamed: 0,0
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."


In [5]:
df_raw.columns = ['Basket']
df_raw.columns

Index(['Basket'], dtype='object')

In [6]:
# Handle missing values: remove empty baskets
df_clean = df_raw.dropna(subset=['Basket'])
df_clean

Unnamed: 0,Basket
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."
...,...
7496,"butter,light mayo,fresh bread"
7497,"burgers,frozen vegetables,eggs,french fries,ma..."
7498,chicken
7499,"escalope,green tea"


In [7]:
# Remove duplicate baskets
df_clean = df_clean.drop_duplicates()
df_clean 

Unnamed: 0,Basket
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."
...,...
7493,"burgers,salmon,pancakes,french fries,frozen sm..."
7494,"turkey,burgers,dessert wine,shrimp,pasta,tomat..."
7496,"butter,light mayo,fresh bread"
7497,"burgers,frozen vegetables,eggs,french fries,ma..."


In [8]:
# Convert each basket string into a list of items
baskets = df_clean['Basket'].apply(lambda x: [item.strip() for item in str(x).split(',') if item.strip()])
baskets 

0       [shrimp, almonds, avocado, vegetables mix, gre...
1                              [burgers, meatballs, eggs]
2                                               [chutney]
3                                       [turkey, avocado]
4       [mineral water, milk, energy bar, whole wheat ...
                              ...                        
7493    [burgers, salmon, pancakes, french fries, froz...
7494    [turkey, burgers, dessert wine, shrimp, pasta,...
7496                    [butter, light mayo, fresh bread]
7497    [burgers, frozen vegetables, eggs, french frie...
7500    [eggs, frozen smoothie, yogurt cake, low fat y...
Name: Basket, Length: 5176, dtype: object

In [9]:
# Convert list of baskets into one-hot encoded DataFrame
te = TransactionEncoder()
te

In [10]:
te_ary = te.fit(baskets.tolist()).transform(baskets.tolist())
te_ary

array([[ True,  True, False, ...,  True, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False,  True, False]])

In [11]:
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)
df_encoded

Unnamed: 0,almonds,antioxydant juice,asparagus,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,body spray,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,True,True,False,True,False,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5171,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5172,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
5173,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5174,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
#Association Rule Mining
# Set support threshold 0.5
min_support = 0.05
min_support

0.05

In [13]:
frequent_itemsets = apriori(df_encoded, min_support=min_support, use_colnames=True)
frequent_itemsets 

Unnamed: 0,support,itemsets
0,0.113794,(burgers)
1,0.103555,(cake)
2,0.054869,(champagne)
3,0.083849,(chicken)
4,0.205178,(chocolate)
5,0.060665,(cookies)
6,0.07187,(cooking oil)
7,0.208076,(eggs)
8,0.083849,(escalope)
9,0.19262,(french fries)


In [14]:
# Print the top frequent itemsets
print("Most frequent itemsets:")
print(frequent_itemsets.sort_values('support', ascending=False).head())

Most frequent itemsets:
     support         itemsets
20  0.299845  (mineral water)
26  0.229521      (spaghetti)
7   0.208076           (eggs)
4   0.205178      (chocolate)
9   0.192620   (french fries)


In [15]:
# Set thresholds for rules
min_confidence = 0.3 
min_lift = 1.2  

In [16]:
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=min_confidence)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(chocolate),(mineral water),0.205178,0.299845,0.073223,0.356874,1.190193,1.0,0.011701,1.088674,0.201051,0.169575,0.081451,0.300537
1,(eggs),(mineral water),0.208076,0.299845,0.070131,0.337047,1.12407,1.0,0.007741,1.056115,0.139377,0.160194,0.053134,0.28547
2,(frozen vegetables),(mineral water),0.12983,0.299845,0.050425,0.388393,1.29531,1.0,0.011496,1.144778,0.262,0.13296,0.126468,0.278281
3,(ground beef),(mineral water),0.135819,0.299845,0.058733,0.432432,1.442184,1.0,0.018008,1.233606,0.354795,0.155818,0.189368,0.314154
4,(ground beef),(spaghetti),0.135819,0.229521,0.055835,0.411095,1.791102,1.0,0.024661,1.308326,0.511102,0.1804,0.235664,0.327181
5,(milk),(mineral water),0.170015,0.299845,0.067813,0.398864,1.330231,1.0,0.016835,1.164718,0.299103,0.168669,0.141423,0.312512
6,(spaghetti),(mineral water),0.229521,0.299845,0.085008,0.37037,1.235204,1.0,0.016187,1.11201,0.247141,0.191304,0.100728,0.326938


In [17]:
# Filter rules to include only those with lift > threshold
rules = rules[rules['lift'] > min_lift]
rules 

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
2,(frozen vegetables),(mineral water),0.12983,0.299845,0.050425,0.388393,1.29531,1.0,0.011496,1.144778,0.262,0.13296,0.126468,0.278281
3,(ground beef),(mineral water),0.135819,0.299845,0.058733,0.432432,1.442184,1.0,0.018008,1.233606,0.354795,0.155818,0.189368,0.314154
4,(ground beef),(spaghetti),0.135819,0.229521,0.055835,0.411095,1.791102,1.0,0.024661,1.308326,0.511102,0.1804,0.235664,0.327181
5,(milk),(mineral water),0.170015,0.299845,0.067813,0.398864,1.330231,1.0,0.016835,1.164718,0.299103,0.168669,0.141423,0.312512
6,(spaghetti),(mineral water),0.229521,0.299845,0.085008,0.37037,1.235204,1.0,0.016187,1.11201,0.247141,0.191304,0.100728,0.326938


In [18]:
#Analysis & Interpretation
# Display top 10 rules sorted by lift
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
      .sort_values('lift', ascending=False)
      .head(10))

           antecedents      consequents   support  confidence      lift
4        (ground beef)      (spaghetti)  0.055835    0.411095  1.791102
3        (ground beef)  (mineral water)  0.058733    0.432432  1.442184
5               (milk)  (mineral water)  0.067813    0.398864  1.330231
2  (frozen vegetables)  (mineral water)  0.050425    0.388393  1.295310
6          (spaghetti)  (mineral water)  0.085008    0.370370  1.235204


In [19]:
# Additional insights: For example, rules involving 'eggs'
eggs_rules = rules[(rules['antecedents'].apply(lambda x: 'eggs' in x)) |
                   (rules['consequents'].apply(lambda x: 'eggs' in x))]
eggs_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski


In [20]:
print(eggs_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


In [21]:
#Save to Csv File
rules.to_csv('association_rules_output.csv', index=False)

In [22]:
###Interview Questions:
#1.
###What is lift and why is it important in Association rules?
#Lift measures how much more likely items A and B are bought together compared to if they were independent.
#(Lift(A → B) = Confidence(A → B)/Support(B)
#L>1 → Positive association → A and B are bought together more often than by chance.
#L= → No association → A and B are independent.
#L<1 → Negative association → Buying A reduces the chance of buying B.
#2.
###What is support and Confidence. How do you calculate them?
#Support:frequency
#Support(A)=Transactions containing A/Total Transactions
#Confidence:reliability of rule.
#Confidence(A→B)=Support(A∪B)/Support(A)
#3.
###What are some limitations or challenges of Association rules mining?
#Association rule mining generates too many rules, is sensitive to thresholds, ignores purchase order/time,
#misses rare items, and can be computationally expensive.