# Notebook 02 — Apriori Association Rules

**Purpose:** Find which products are frequently bought together using the Apriori
algorithm. Produces `co_purchase_rules.json` — a lightweight lookup used by
the recommendation engine at runtime.

**Key metrics:**
- **Support:** the fraction of all orders that contain both A and B
- **Confidence:** P(B | A) — given A is bought, how likely is B?
- **Lift:** P(B | A) / P(B) — how much more likely B is when A is present than B's baseline purchase rate (values above 1 indicate a positive association)

**Input:** `order_baskets.pkl`, `item_catalog.json`

**Output:** `co_purchase_rules.json`

**Runtime:** ~5–10 min on Kaggle


In [13]:
# Intentional no-op: placeholder left behind where the warnings cell used to be.
pass  # warnings cell removed


In [14]:
import sys
# Display the path of the Python interpreter backing this kernel; the install
# cell below runs pip through this same interpreter.
# NOTE(review): the saved output of this cell exposes an absolute local path —
# consider clearing it before sharing the notebook.
sys.executable

'/Users/ranaraunitrazsingh/Desktop/Placements Assignments/Unthinkable solutions/voice-shopping-assistant/instacart-env/bin/python'

In [15]:
import sys, subprocess

# Run pip through the interpreter backing this kernel (sys.executable) so the
# packages land in the environment the notebook actually imports from.
# NOTE(review): only numpy is pinned; the other packages resolve to whatever
# pip selects at run time, and --force-reinstall repeats the work on every run.
pip_command = [sys.executable, "-m", "pip", "install", "--force-reinstall"]
pip_command += ["numpy==1.26.4", "scikit-learn", "pyarrow", "mlxtend"]

# check=True raises CalledProcessError if pip exits with a non-zero status.
subprocess.run(pip_command, check=True)

Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.8.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting pyarrow
  Using cached pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (3.1 kB)
Collecting mlxtend
  Using cached mlxtend-0.24.0-py3-none-any.whl.metadata (7.3 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Using cached scipy-1.17.1-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
INFO: pip is looking at multiple versions of mlxtend to determine which version is compatible with other requirements. This could take a while.
Collecting mlxtend
  Using cached mlxtend-0.23.4-py3-none-any.whl.metadata (7.3 kB)
Collecting pandas>=0.

CompletedProcess(args=['/Users/ranaraunitrazsingh/Desktop/Placements Assignments/Unthinkable solutions/voice-shopping-assistant/instacart-env/bin/python', '-m', 'pip', 'install', '--force-reinstall', 'numpy==1.26.4', 'scikit-learn', 'pyarrow', 'mlxtend'], returncode=0)

In [16]:
import os, json, pickle, time
import pandas as pd
import numpy as np
from collections import defaultdict

# The presence of /kaggle/input is used as the "running on Kaggle" probe.
IS_KAGGLE = os.path.exists("/kaggle/input")

if IS_KAGGLE:
    # On Kaggle both data outputs and model artefacts share one directory.
    OUTPUT_DIR = "/kaggle/working"
    MODELS_DIR = "/kaggle/working"
else:
    OUTPUT_DIR = "../data/output"
    MODELS_DIR = "../data/models"

# Only the models directory is created here; OUTPUT_DIR is read from below.
os.makedirs(MODELS_DIR, exist_ok=True)


from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

print("mlxtend imported OK")
print(f"OUTPUT_DIR = {OUTPUT_DIR}")


mlxtend imported OK
OUTPUT_DIR = ../data/output


In [17]:
print("=" * 60)
print("STEP 1: Loading data...")
print("=" * 60)

# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only load order_baskets.pkl when it comes from a trusted source.
with open(f"{OUTPUT_DIR}/order_baskets.pkl", "rb") as f:
    baskets = pickle.load(f)

# Read the catalog explicitly as UTF-8 instead of the platform default
# encoding, so product names decode the same way on every OS/locale.
with open(f"{OUTPUT_DIR}/item_catalog.json", "r", encoding="utf-8") as f:
    item_catalog = json.load(f)

# Lookup from lower-cased product name to its category, used when the
# co-purchase suggestions are built later in the notebook.
name_to_category = {item["name_lower"]: item["category"] for item in item_catalog}
print(f"Loaded {len(baskets):,} baskets")
print(f"Catalog: {len(item_catalog)} items")


STEP 1: Loading data...
Loaded 2,849,883 baskets
Catalog: 3000 items


In [18]:
print("\n" + "=" * 60)
print("STEP 2: Subsampling for memory efficiency...")
print("=" * 60)

# 200K baskets × 3000 products ≈ 600MB boolean matrix — safe on Kaggle 16GB
# Reduce to 100_000 if you hit MemoryError
SAMPLE_SIZE = 200_000

n_baskets = len(baskets)
if n_baskets <= SAMPLE_SIZE:
    # Small enough already: use every basket as-is.
    baskets_sample = baskets
    print(f"Using all {len(baskets_sample):,} baskets")
else:
    # Fixed seed so the same baskets are drawn on every run.
    np.random.seed(42)
    idx = np.random.choice(n_baskets, SAMPLE_SIZE, replace=False)
    baskets_sample = [baskets[i] for i in idx]
    print(f"Sampled {SAMPLE_SIZE:,} of {n_baskets:,} baskets")



STEP 2: Subsampling for memory efficiency...
Sampled 200,000 of 2,849,883 baskets


In [19]:
print("\n" + "=" * 60)
print("STEP 3: Encoding baskets (TransactionEncoder)...")
print("=" * 60)

t0 = time.time()

# Fit the encoder on the sampled baskets, then transform those same baskets
# into the array that backs the basket DataFrame (one column per product).
te = TransactionEncoder()
te.fit(baskets_sample)
te_array = te.transform(baskets_sample)
basket_df = pd.DataFrame(te_array, columns=te.columns_)

elapsed_s = time.time() - t0
print(f"Encoded in {elapsed_s:.1f}s")
print(f"Matrix shape: {basket_df.shape}  (baskets × unique products)")

total_bytes = basket_df.memory_usage(deep=True).sum()
print(f"Memory: {total_bytes / 1024**2:.0f} MB")



STEP 3: Encoding baskets (TransactionEncoder)...
Encoded in 4.5s
Matrix shape: (200000, 3000)  (baskets × unique products)
Memory: 572 MB


In [20]:
print("\n" + "=" * 60)
print("STEP 4: Running Apriori (2–5 min)...")
print("=" * 60)

# TUNABLE — reduce min_support if you get too few rules; increase if too slow
MIN_SUPPORT    = 0.001   # itemset must appear in ≥0.1% of baskets (e.g. 200 of 200,000)
MAX_LEN        = 2       # pairwise only (triples are slow and rarely needed)

t0 = time.time()
frequent_itemsets = apriori(
    basket_df,
    min_support=MIN_SUPPORT,
    use_colnames=True,
    max_len=MAX_LEN,
    low_memory=True,
)
print(f"Done in {time.time()-t0:.1f}s")

# Measure each itemset's size once and reuse it for both breakdown counts.
itemset_sizes = frequent_itemsets["itemsets"].apply(len)
print(f"Frequent itemsets: {len(frequent_itemsets):,}")
print(f"  Singletons: {int((itemset_sizes == 1).sum()):,}")
print(f"  Pairs:      {int((itemset_sizes == 2).sum()):,}")



STEP 4: Running Apriori (2–5 min)...
Done in 20.0s
Frequent itemsets: 4,342
  Singletons: 1,989
  Pairs:      2,353


In [21]:
print("\n" + "=" * 60)
print("STEP 5: Extracting association rules...")
print("=" * 60)

MIN_CONFIDENCE = 0.2
MIN_LIFT       = 1.0

# mlxtend documents `num_itemsets` as the number of transactions in the
# original input data, i.e. the rows of the encoded basket matrix — not the
# number of frequent itemsets found by Apriori.
# NOTE(review): with the default null_values=False this is not expected to
# change the computed metrics; confirm against the installed mlxtend version.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=MIN_CONFIDENCE,
    num_itemsets=len(basket_df),
)
# Keep only rules whose lift meets the MIN_LIFT threshold.
rules = rules[rules["lift"] >= MIN_LIFT]
print(f"Association rules after filters: {len(rules):,}")

print("\nTop 15 rules by confidence:")
top = rules.sort_values("confidence", ascending=False).head(15)
for _, r in top.iterrows():
    # MAX_LEN = 2 in the Apriori step means each side holds exactly one item.
    a = next(iter(r["antecedents"]))
    c = next(iter(r["consequents"]))
    print(f"  {a:40s} → {c:40s}  conf={r['confidence']:.2f}  lift={r['lift']:.1f}")

# Persist the full filtered rule table.
rules.to_pickle(f"{MODELS_DIR}/apriori_rules.pkl")
print(f"\nSaved full rules: {MODELS_DIR}/apriori_rules.pkl")



STEP 5: Extracting association rules...
Association rules after filters: 470

Top 15 rules by confidence:
  Zero Calorie Cola                        → Soda                                      conf=0.50  lift=45.4
  Organic Yellow Squash                    → Organic Zucchini                          conf=0.49  lift=13.0
  Yotoddler Organic Pear Spinach Mango Yogurt → Organic Whole Milk Strawberry Beet Berry Yogurt Pouch  conf=0.48  lift=205.4
  Non Fat Acai & Mixed Berries Yogurt      → Icelandic Style Skyr Blueberry Non-fat Yogurt  conf=0.47  lift=67.5
  Total 2% Lowfat Greek Strained Yogurt With Blueberry → Total 2% with Strawberry Lowfat Greek Strained Yogurt  conf=0.47  lift=45.4
  Non Fat Raspberry Yogurt                 → Icelandic Style Skyr Blueberry Non-fat Yogurt  conf=0.44  lift=63.6
  Organic Whole Milk Strawberry Beet Berry Yogurt Pouch → Yotoddler Organic Pear Spinach Mango Yogurt  conf=0.44  lift=205.4
  Nonfat Icelandic Style Strawberry Yogurt → Icelandic Style Skyr Bl

In [22]:
print("\n" + "=" * 60)
print("STEP 6: Building co-purchase JSON lookup...")
print("=" * 60)

TOP_PER_ITEM = 10
co_purchase = defaultdict(list)

# Walk the rule table column-wise: one tuple of fields per rule.
rule_fields = zip(
    rules["antecedents"],
    rules["consequents"],
    rules["confidence"],
    rules["lift"],
    rules["support"],
)
for antecedents, consequents, confidence, lift, support in rule_fields:
    ant = list(antecedents)[0]
    con = list(consequents)[0]
    suggestion = {
        "item":       con,
        "confidence": round(float(confidence), 3),
        "lift":       round(float(lift), 2),
        "support":    round(float(support), 4),
        # Consequents missing from the catalog lookup fall back to "other".
        "category":   name_to_category.get(con.lower(), "other"),
    }
    # Keys are the lower-cased antecedent name.
    co_purchase[ant.lower()].append(suggestion)

# Rank each item's suggestions by confidence (highest first) and keep the top N.
for k, suggestions in co_purchase.items():
    suggestions.sort(key=lambda x: x["confidence"], reverse=True)
    co_purchase[k] = suggestions[:TOP_PER_ITEM]

co_purchase_dict = dict(co_purchase)

rules_path = f"{OUTPUT_DIR}/co_purchase_rules.json"
with open(rules_path, "w") as f:
    json.dump(co_purchase_dict, f, indent=2)

total_pairs = sum(map(len, co_purchase_dict.values()))
print(f"Saved: {rules_path}")
print(f"Products with rules:    {len(co_purchase_dict)}")
print(f"Total suggestion pairs: {total_pairs}")



STEP 6: Building co-purchase JSON lookup...
Saved: ../data/output/co_purchase_rules.json
Products with rules:    336
Total suggestion pairs: 470


In [25]:
print("\n" + "=" * 60)
print("STEP 7: Spot-check — are suggestions sensible?")
print("=" * 60)

test_items = ["banana", "whole milk", "pasta", "bread", "chicken breast"]
for item in test_items:
    if item in co_purchase_dict:
        key = item
    else:
        # No exact key: fall back to the first key that contains the query text.
        key = next((k for k in co_purchase_dict if item in k), None)
    if key is None:
        print(f"  {item}: not found")
        continue
    print(f"\n  {key}:")
    for s in co_purchase_dict[key][:5]:
        print(f"    → {s['item']:40s}  conf={s['confidence']:.2f}  lift={s['lift']:.1f}  [{s['category']}]")

# Coverage: share of catalog names (lower-cased) that have at least one rule.
catalog_lower = {entry["name_lower"] for entry in item_catalog}
rules_lower   = set(co_purchase_dict)
n_covered = len(catalog_lower.intersection(rules_lower))
coverage = n_covered / len(catalog_lower) * 100
avg_rules = np.mean(list(map(len, co_purchase_dict.values())))
print(f"\nCatalog coverage: {coverage:.1f}%")
print(f"Avg rules/item:   {avg_rules:.1f}")
print("\n✓ NOTEBOOK 02 COMPLETE — co_purchase_rules.json ready")



STEP 7: Spot-check — are suggestions sensible?
  banana: not found

  whole milk:
    → Banana                                    conf=0.24  lift=1.5  [produce]

  organic tomato basil pasta sauce:
    → Banana                                    conf=0.23  lift=1.4  [produce]
    → Bag of Organic Bananas                    conf=0.20  lift=1.6  [produce]

  100% whole wheat bread:
    → Banana                                    conf=0.25  lift=1.5  [produce]

  air chilled organic boneless skinless chicken breasts:
    → Bag of Organic Bananas                    conf=0.21  lift=1.6  [produce]

Catalog coverage: 11.2%
Avg rules/item:   1.4

✓ NOTEBOOK 02 COMPLETE — co_purchase_rules.json ready


In [24]:
import numpy, scipy, sklearn

# Report the versions of the numeric stack that this kernel actually imported.
for label, module in (("numpy", numpy), ("scipy", scipy), ("sklearn", sklearn)):
    print(f"{label}:", module.__version__)

numpy: 1.26.4
scipy: 1.17.1
sklearn: 1.8.0
