# Lab 6: Association Rule Mining with Apriori and FP-Growth
**Name:** Your Name Here  
**Course:** MSCS_634  
**Assignment:** Lab 6 - Association Rule Mining


In [None]:
import os
import io
import time
import zipfile
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder

# Notebook-wide settings: suppress every warning category and give all
# matplotlib figures a 10x6 inch default canvas.
warnings.simplefilter('ignore')
plt.rcParams.update({'figure.figsize': (10, 6)})

def show_top(series, n=20, title=None):
    """Print the ``n`` largest values of ``series`` and draw them as bars.

    Args:
        series: pandas Series; its index supplies the bar labels and its
            values the bar lengths.
        n: Number of leading entries (after sorting descending) to keep.
        title: Plot title; ``'Top items'`` is used when this is falsy.
    """
    ranked = series.sort_values(ascending=False)
    leaders = ranked.iloc[:n]
    print(leaders)

    # Values on x and labels on y give a horizontal bar chart.
    sns.barplot(x=leaders.values, y=leaders.index)
    heading = title if title else 'Top items'
    plt.title(heading)
    plt.xlabel('Count')
    plt.tight_layout()
    plt.show()

In [None]:
from urllib.request import urlretrieve

DATA_URL = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"
)
DATA_FILE = "Online_Retail.xlsx"

# Cache the workbook on first use. The raw bytes are saved instead of
# re-serialising the parsed DataFrame with to_excel(): that is much slower
# and the cached file would no longer be the original workbook.
if not os.path.exists(DATA_FILE):
    print('Downloading dataset...')
    # Download under a temporary name and rename only once complete, so an
    # interrupted transfer is never mistaken for a valid cache on the next run.
    partial_file = DATA_FILE + '.part'
    urlretrieve(DATA_URL, partial_file)
    os.replace(partial_file, DATA_FILE)

# Always parse the local copy so first and later runs see identical data.
df = pd.read_excel(DATA_FILE)

print('Shape:', df.shape)
print(df.head())

In [None]:
# Clean the raw rows before basket analysis.
# Work on an explicit copy so the column assignments below modify this frame
# rather than a view of the previous one (avoids chained-assignment pitfalls).
df = df.dropna(subset=['InvoiceNo', 'Description']).copy()
df['InvoiceNo'] = df['InvoiceNo'].astype(str)

# Cast to str *before* stripping: the .str accessor returns NaN for any
# non-string cell (e.g. a purely numeric description), which would put
# missing values back into the column after the dropna above.
df['Description'] = df['Description'].astype(str).str.strip()

keep = (
    ~df['InvoiceNo'].str.startswith('C')  # presumably cancelled invoices - confirm against dataset docs
    & (df['Quantity'] > 0)                # positive quantities only
    & (df['Description'] != '')           # whitespace-only descriptions are not real items
)
df = df[keep].copy()

print('After cleaning shape:', df.shape)
print(df[['InvoiceNo','Description','Quantity']].head())

In [None]:
# Exploratory analysis: best sellers, basket sizes, and co-occurrence of the
# best-selling items.

# Total units sold per item description, largest first.
item_counts = df.groupby('Description')['Quantity'].sum().sort_values(ascending=False)
show_top(item_counts, n=20, title='Top 20 Most Frequently Sold Items (by Quantity)')

# Number of distinct item descriptions on each invoice.
items_per_invoice = df.groupby('InvoiceNo')['Description'].nunique()
sns.histplot(items_per_invoice, bins=50)
plt.title('Distribution of distinct items per invoice')
plt.show()

TOP_K = 20
top_items = item_counts.head(TOP_K).index.tolist()

# Invoice x item 0/1 indicator matrix restricted to the TOP_K items.
subset = df[df['Description'].isin(top_items)]
trans = subset.groupby(['InvoiceNo','Description']).size().unstack(fill_value=0)
trans = (trans > 0).astype(int)

# Item x item matrix: entry (a, b) counts invoices containing both a and b.
coocc = trans.T.dot(trans)
# groupby/unstack orders the item labels by sorted value, not by sales rank.
# Reorder rows and columns to top_items so the tick labels passed to the
# heatmap line up with the cells they describe.
coocc = coocc.loc[top_items, top_items]
sns.heatmap(coocc, xticklabels=top_items, yticklabels=top_items)
plt.title(f'Co-occurrence among top {TOP_K} items')
plt.show()

In [None]:
# Collect the item descriptions of each invoice into one list per invoice.
baskets = df.groupby('InvoiceNo')['Description'].apply(list)
transactions = baskets.tolist()
print('Transactions:', len(transactions))

# One-hot encode the baskets: one row per transaction, one column per item.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
trans_df = pd.DataFrame(te_ary, columns=te.columns_)

# Support threshold shared by the Apriori and FP-Growth runs.
min_support = 0.01
print('Using min_support:', min_support)

In [None]:
# Mine frequent itemsets with Apriori and record how long the call takes.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
frequent_ap = apriori(trans_df, min_support=min_support, use_colnames=True)
ap_time = time.perf_counter() - start

# Number of items in each itemset, used as a secondary sort key.
frequent_ap['length'] = frequent_ap['itemsets'].apply(len)
# Highest support first; ties broken in favour of longer itemsets.
frequent_ap = frequent_ap.sort_values(['support','length'], ascending=[False, False])
print(f'Apriori found {len(frequent_ap)} itemsets in {ap_time:.2f}s')
print(frequent_ap.head())

In [None]:
# Mine frequent itemsets with FP-Growth on the same encoded transactions and
# the same support threshold, recording how long the call takes.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
frequent_fp = fpgrowth(trans_df, min_support=min_support, use_colnames=True)
fp_time = time.perf_counter() - start

# Number of items in each itemset, used as a secondary sort key.
frequent_fp['length'] = frequent_fp['itemsets'].apply(len)
# Highest support first; ties broken in favour of longer itemsets.
frequent_fp = frequent_fp.sort_values(['support','length'], ascending=[False, False])
print(f'FP-Growth found {len(frequent_fp)} itemsets in {fp_time:.2f}s')
print(frequent_fp.head())

In [None]:
# Derive association rules from the Apriori itemsets, keeping only rules whose
# confidence reaches the threshold.
min_confidence = 0.3
rules_ap = association_rules(
    frequent_ap,
    metric='confidence',
    min_threshold=min_confidence,
)
print('Rules (Apriori):', len(rules_ap))

# One point per rule: confidence on x, lift on y, marker size by support.
ax = sns.scatterplot(data=rules_ap, x='confidence', y='lift', size='support')
ax.set_title('Confidence vs Lift (Apriori)')
plt.show()

In [None]:
# Final recap: elapsed seconds per algorithm, itemset counts (Apriori first,
# then FP-Growth), and the number of rules derived from the Apriori itemsets.
for label, seconds in (('Apriori time:', ap_time), ('FP-Growth time:', fp_time)):
    print(label, seconds)

n_ap, n_fp = len(frequent_ap), len(frequent_fp)
print('Itemsets:', n_ap, n_fp)
print('Rules:', len(rules_ap))