In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/associative-rules-data/retail_transactional.csv
/kaggle/input/associative-rules-data/groceries.csv


In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import csv
import time
import tracemalloc
from itertools import combinations

In [7]:
import os
from collections import Counter

# File paths
GROCERIES_PATH = '/kaggle/input/associative-rules-data/groceries.csv'
RETAIL_PATH = '/kaggle/input/associative-rules-data/retail_transactional.csv'

def inspect_and_analyze(file_path, name):
    print(f"\n{'='*50}")
    print(f" Analyzing Dataset: {name}")
    print(f"{'='*50}")
    
    if not os.path.exists(file_path):
        print(f" File not found: {file_path}")
        return

    # 1. Preview raw content (First 5 lines) to understand structure
    print(" Raw File Content (First 5 lines):")
    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            raw_lines = [f.readline().strip() for _ in range(5)]
            for i, line in enumerate(raw_lines):
                print(f"  Line {i+1}: {line}")
    except Exception as e:
        print(f"  Could not read raw lines: {e}")
        return

    # 2. Process file line by line to calculate statistics
    transactions = []
    unique_items = set()
    
    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                line = line.strip()
                if not line: continue
                
                # Split by comma (assuming standard CSV or basket format)
                # We remove quotes to clean up data like "milk" -> milk
                items = [i.strip().replace('"', '') for i in line.split(',')]
                
                # Filter out empty strings
                items = [i for i in items if i]
                
                if items:
                    transactions.append(items)
                    unique_items.update(items)

        # 3. Calculate Statistics
        num_transactions = len(transactions)
        num_unique_items = len(unique_items)
        
        if num_transactions == 0:
            print(" Dataset appears empty.")
            return

        avg_len = sum(len(t) for t in transactions) / num_transactions
        
        # Calculate Top 5 Frequent Items
        all_items = [item for t in transactions for item in t]
        item_counts = Counter(all_items)
        top_5 = item_counts.most_common(5)

        print(f"\n Statistics Summary:")
        print(f"  • Total Transactions (Rows): {num_transactions:,}")
        print(f"  • Unique Items Found: {num_unique_items:,}")
        print(f"  • Avg Items per Row: {avg_len:.2f}")
        
        print(f"\n Top 5 Frequent Items (Check for headers or strange IDs):")
        for item, count in top_5:
            support = count / num_transactions
            print(f"  - '{item}': {count} times (Freq: {support:.4f})")

    except Excep as e:
        print(f" Error calculating stats: {e}")

# Run analysis
inspect_and_analyze(GROCERIES_PATH, "Groceries")
inspect_and_analyze(RETAIL_PATH, "Retail Transactional")



 Analyzing Dataset: Groceries
 Raw File Content (First 5 lines):
  Line 1: citrus fruit,semi-finished bread,margarine,ready soups
  Line 2: tropical fruit,yogurt,coffee
  Line 3: whole milk
  Line 4: pip fruit,yogurt,cream cheese ,meat spreads
  Line 5: other vegetables,whole milk,condensed milk,long life bakery product

 Statistics Summary:
  • Total Transactions (Rows): 9,835
  • Unique Items Found: 169
  • Avg Items per Row: 4.41

 Top 5 Frequent Items (Check for headers or strange IDs):
  - 'whole milk': 2513 times (Freq: 0.2555)
  - 'other vegetables': 1903 times (Freq: 0.1935)
  - 'rolls/buns': 1809 times (Freq: 0.1839)
  - 'soda': 1715 times (Freq: 0.1744)
  - 'yogurt': 1372 times (Freq: 0.1395)

 Analyzing Dataset: Retail Transactional
 Raw File Content (First 5 lines):
  Line 1: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
  Line 2: 30,31,32
  Line 3: 33,34,35
  Line 4: 36,37,38,39,40,41,42,43,44,45,46
  Line 5: 38,39,47,48

 Statistics Summ

# Analysis of Groceries Dataset (The “Control” Group)
- Structure: It is a Sparse Dataset.
- Volume: ~10k transactions.
- Dimensionality: Only 169 unique items.
- Implication for Algorithms:
- Standard Apriori: Will run very fast. Generating pairs from 169 items is trivial for modern CPUs.
- DHP:  probably won’t see a huge performance gain here because the overhead of hashing might outweigh the benefit of pruning such a small candidate set.
- Sampling: With 10k rows, a 10% sample is only ~980 rows. This is small enough that statistical variance might cause some False Negatives at low support thresholds.

# Analysis of Retail Transactional Dataset (The “Stress Test”)
- Structure: It is a High-Dimensional / Dense Core Dataset.
- Volume: ~88k transactions (9x larger than Groceries).
- Dimensionality: 16,470 Unique Items!
- The “Explosion” Risk:
- In Standard Apriori, step k= 2  tries to generate pairs from frequent single items.
- Worst case scenario (if many items are frequent):135,630,465 pairs.
- Item Distribution: Item '39' appears in 57% of baskets. Item '48' in 47%. This means L1 (Frequent 1-itemsets) will be large, guaranteeing that C2 will be huge.


In [10]:
# ==========================================
# 1. DATA LOADING UTILS
# ==========================================
def load_dataset(file_path):
    
    dataset = []
    print(f" Loading: {file_path} ...")
    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                line = line.strip()
                if not line: continue
                # Clean and split items
                items = [i.strip().replace('"', '') for i in line.split(',')]
                items = [i for i in items if i] # Remove empty strings
                if items:
                    dataset.append(frozenset(items))
        print(f"✅ Loaded {len(dataset)} transactions.")
        return dataset
    except Exception as e:
        print(f" Error loading file: {e}")
        return []

In [11]:
# ==========================================
# 2. HELPER FUNCTIONS (The Engine)
# ==========================================

def create_c1(dataset):
    c1 = {}
    for transaction in dataset:
        for item in transaction:
            itemset = frozenset([item])
            c1[itemset] = c1.get(itemset, 0) + 1
    return c1

def filter_candidates(candidates_counts, min_support_count):
    ret_l = {}
    for key, value in candidates_counts.items():
        if value >= min_support_count:
            ret_l[key] = value
    return ret_l

def generate_Ck(Lk_minus_1, k):
   
    candidates = set()
    lk_list = list(Lk_minus_1.keys())
    len_lk = len(lk_list)
    
    # Join Step
    for i in range(len_lk):
        for j in range(i + 1, len_lk):
            l1 = list(lk_list[i])
            l2 = list(lk_list[j])
            l1.sort()
            l2.sort()
            
            # Check if first k-2 items are equal
            if l1[:k-2] == l2[:k-2]:
                # Union creates the new candidate
                candidate = lk_list[i] | lk_list[j]
                
                # Prune Step: Check if all subsets are frequent
                # (Optional for speed in Python, but standard for strict Apriori)
                if has_infrequent_subset(candidate, Lk_minus_1):
                    continue
                candidates.add(candidate)
    return candidates

def has_infrequent_subset(candidate, Lk_minus_1):
    # generate all subsets of size k-1
    for subset in combinations(candidate, len(candidate) - 1):
        if frozenset(subset) not in Lk_minus_1:
            return True
    return False

def count_candidates(dataset, candidates):
    counts = {cand: 0 for cand in candidates}
    
    # Optimization: Iterate through dataset once
    for transaction in dataset:
        # For each candidate, check if it is in transaction
        # Optimization for k=2: Double loop over transaction items is sometimes faster than subset check
        # But for general k, issubset is safest in Python
        for cand in candidates:
            if cand.issubset(transaction):
                counts[cand] += 1
    return counts


In [13]:
# ==========================================
# 3. STANDARD APRIORI ALGORITHM (Instrumented)
# ==========================================

def run_standard_apriori(dataset, min_support_fraction):
    
    tracemalloc.start()
    start_time = time.time()
    
    min_support_count = int(len(dataset) * min_support_fraction)
    stats = {
        "algorithm": "Standard Apriori",
        "min_sup": min_support_fraction,
        "c2_count": 0, # To store |C2|
        "total_freq_itemsets": 0,
        "execution_time": 0,
        "memory_peak_mb": 0
    }
    
    L_global = {}
    
    # --- Step 1: C1 & L1 ---
    print(f"   [Standard] Generating L1 (Min Count: {min_support_count})...")
    C1_counts = create_c1(dataset)
    L1 = filter_candidates(C1_counts, min_support_count)
    L_global.update(L1)
    
    L_current = L1
    k = 2
    
    while L_current:
        # print(f"   [Standard] Step k={k}...")
        
        # Generate Candidates
        Ck = generate_Ck(L_current, k)
        
        # *** CAPTURE |C2| METRIC ***
        if k == 2:
            stats["c2_count"] = len(Ck)
            print(f"   [Metric Captured] |C2_standard| = {len(Ck)}")
        
        if not Ck:
            break
            
        # Count Candidates
        Ck_counts = count_candidates(dataset, Ck)
        
        # Filter
        Lk = filter_candidates(Ck_counts, min_support_count)
        
        if not Lk:
            break
            
        L_global.update(Lk)
        L_current = Lk
        k += 1

    end_time = time.time()
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    
    stats["total_freq_itemsets"] = len(L_global)
    stats["execution_time"] = end_time - start_time
    stats["memory_peak_mb"] = peak / (1024 * 1024)
    
    return L_global, stats

In [17]:

GROCERIES_PATH = '/kaggle/input/associative-rules-data/groceries.csv'

groceries_data = load_dataset(GROCERIES_PATH)

min_sup_list = [0.05, 0.03, 0.02, 0.01]

results_summary = []

if groceries_data:
    print(f"\n{'='*60}")
    print(f" STARTING STANDARD APRIORI TEST LOOP (Groceries)")
    print(f"{'='*60}")

    for min_sup in min_sup_list:
        print(f"\nTesting Min Support: {min_sup}")
        
        _, metrics = run_standard_apriori(groceries_data, min_support_fraction=min_sup)
        
        results_summary.append(metrics)
        
        print(f"    Done.")
        print(f"   • |C2_standard|:    {metrics['c2_count']}")
        print(f"   • Frequent Sets:    {metrics['total_freq_itemsets']}")
        print(f"   • Execution Time:   {metrics['execution_time']:.4f} s")
        print(f"   • Peak Memory:      {metrics['memory_peak_mb']:.4f} MB")

   
    print(f"\n{'='*60}")
    print(" FINAL SUMMARY TABLE (Standard Apriori)")
    print(f"{'='*60}")
    
    df_results = pd.DataFrame(results_summary)
    cols = ['min_sup', 'c2_count', 'total_freq_itemsets', 'execution_time', 'memory_peak_mb']
    print(df_results[cols].to_string(index=False))


 Loading: /kaggle/input/associative-rules-data/groceries.csv ...
✅ Loaded 9835 transactions.

 STARTING STANDARD APRIORI TEST LOOP (Groceries)

Testing Min Support: 0.05
   [Standard] Generating L1 (Min Count: 491)...
   [Metric Captured] |C2_standard| = 378
    Done.
   • |C2_standard|:    378
   • Frequent Sets:    31
   • Execution Time:   0.4329 s
   • Peak Memory:      0.1830 MB

Testing Min Support: 0.03
   [Standard] Generating L1 (Min Count: 295)...
   [Metric Captured] |C2_standard| = 946
    Done.
   • |C2_standard|:    946
   • Frequent Sets:    64
   • Execution Time:   0.8367 s
   • Peak Memory:      0.3252 MB

Testing Min Support: 0.02
   [Standard] Generating L1 (Min Count: 196)...
   [Metric Captured] |C2_standard| = 1711
    Done.
   • |C2_standard|:    1711
   • Frequent Sets:    123
   • Execution Time:   1.5250 s
   • Peak Memory:      0.6292 MB

Testing Min Support: 0.01
   [Standard] Generating L1 (Min Count: 98)...
   [Metric Captured] |C2_standard| = 3828
    Do