Lab 12: Association Rule Mining
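This script demonstrates association rule mining (market basket analysis): it computes support, confidence, and lift, finds frequent itemsets with an Apriori-style search, and derives association rules from them.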

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations


In [None]:
def create_transaction_data():
    """Build the ten sample shopping baskets used throughout the lab.

    Returns:
        A new list of transactions on every call; each transaction is a
        list of item-name strings in the order they were "purchased".
    """
    # One basket per line, items separated by single spaces (no item name
    # contains whitespace, so a plain split() recovers the item list).
    baskets = (
        'Milk Bread Butter',
        'Beer Diapers Bread',
        'Milk Diapers Beer Butter',
        'Milk Bread Butter Beer',
        'Bread Milk Diapers',
        'Bread Butter',
        'Beer Diapers',
        'Milk Bread Butter Beer',
        'Milk Bread',
        'Bread Butter Diapers',
    )
    return [basket.split() for basket in baskets]


In [None]:
def calculate_support(transactions, itemset):
    """Return the fraction of transactions that contain every item of *itemset*.

    Args:
        transactions: Sized collection of transactions, each an iterable of
            hashable items.
        itemset: Iterable of items that must all be present in a transaction
            for it to count.

    Returns:
        A float in [0, 1]. An empty *itemset* is contained in every
        transaction, so its support is 1.0 for non-empty data. When there
        are no transactions at all the support is defined as 0.0 instead of
        raising ZeroDivisionError.
    """
    total = len(transactions)
    if total == 0:
        return 0.0

    # Build the lookup set once instead of once per transaction.
    required = set(itemset)
    matches = sum(1 for transaction in transactions
                  if required.issubset(transaction))
    return matches / total


In [None]:
def calculate_confidence(transactions, antecedent, consequent):
    """Return the confidence of the rule ``antecedent -> consequent``.

    Confidence is ``support(antecedent ∪ consequent) / support(antecedent)``.

    Args:
        transactions: Collection of transactions passed through to
            ``calculate_support``.
        antecedent: Iterable of items on the left-hand side of the rule.
        consequent: Iterable of items on the right-hand side of the rule.

    Returns:
        The confidence as a float, or 0.0 when the antecedent never occurs
        (the rule is then undefined and treated as having no confidence).
    """
    antecedent_support = calculate_support(transactions, antecedent)
    if antecedent_support == 0:
        return 0.0

    # A set union accepts any mix of lists, tuples, sets or frozensets;
    # concatenating with "+" only works when both sides are the same
    # sequence type.
    combined = set(antecedent) | set(consequent)
    rule_support = calculate_support(transactions, combined)
    return rule_support / antecedent_support


In [None]:
def calculate_lift(transactions, antecedent, consequent):
    """Return the lift of the rule ``antecedent -> consequent``.

    Lift is the rule's confidence divided by the support of the consequent.
    If the consequent never appears in the data, 0 is returned rather than
    dividing by zero.
    """
    confidence = calculate_confidence(transactions, antecedent, consequent)
    base_rate = calculate_support(transactions, consequent)

    return confidence / base_rate if base_rate != 0 else 0


In [None]:
def get_frequent_itemsets(transactions, min_support=0.3):
    """Find all frequent itemsets with a level-wise Apriori search.

    Progress is printed to stdout: the item universe, the thresholds, and
    every frequent itemset found at each level together with its support.

    Args:
        transactions: Sized collection of transactions, each an iterable of
            hashable, mutually sortable items.
        min_support: Minimum support (fraction of transactions) an itemset
            needs in order to be kept.

    Returns:
        A dict mapping each frequent itemset (as a ``frozenset``) to its
        support. Entries are inserted level by level and, within a level,
        in sorted item order, so the result is reproducible between runs.
    """
    print("=" * 50)
    print("Finding Frequent Itemsets (Apriori)")
    print("=" * 50)

    # Get all unique items
    all_items = sorted({item for transaction in transactions
                        for item in transaction})
    print(f"\nAll items: {all_items}")
    print(f"Number of transactions: {len(transactions)}")
    print(f"Minimum support threshold: {min_support}")

    frequent_itemsets = {}

    # Level 1: single items.
    print("\n--- Frequent 1-itemsets ---")
    current_level = []
    for item in all_items:
        support = calculate_support(transactions, [item])
        if support >= min_support:
            single = frozenset([item])
            frequent_itemsets[single] = support
            current_level.append(single)
            print(f"{{{item}}}: support = {support:.3f}")

    # Level k > 1: join only the frequent (k-1)-itemsets from the previous
    # level; itemsets from older levels cannot contribute new candidates.
    k = 2
    while current_level:
        candidates = set()
        for left, right in combinations(current_level, 2):
            union = left | right
            if len(union) != k:
                continue
            # Apriori pruning: every (k-1)-subset of a frequent k-itemset
            # must itself be frequent, so skip candidates that fail this
            # before paying for a support scan.
            if all(union - {item} in frequent_itemsets for item in union):
                candidates.add(union)

        if not candidates:
            break

        print(f"\n--- Frequent {k}-itemsets ---")
        current_level = []
        # Sort so output and insertion order do not depend on hash order.
        for candidate in sorted(candidates, key=sorted):
            support = calculate_support(transactions, list(candidate))
            if support >= min_support:
                frequent_itemsets[candidate] = support
                current_level.append(candidate)
                label = ", ".join(repr(item) for item in sorted(candidate))
                print(f"{{{label}}}: support = {support:.3f}")

        k += 1

    return frequent_itemsets


In [None]:
def generate_association_rules(transactions, frequent_itemsets, min_confidence=0.6):
    """Derive association rules from frequent itemsets and print the top ten.

    For every frequent itemset with at least two items, each non-empty
    proper subset is tried as the antecedent with the remaining items as
    the consequent. Rules whose confidence reaches *min_confidence* are
    kept.

    Args:
        transactions: Collection of transactions used to compute confidence
            and lift.
        frequent_itemsets: Mapping of ``frozenset`` itemsets to their
            support, as returned by ``get_frequent_itemsets``.
        min_confidence: Minimum confidence a rule needs in order to be kept.

    Returns:
        A list of dicts with keys ``'antecedent'``, ``'consequent'`` (both
        lists of items), ``'support'``, ``'confidence'`` and ``'lift'``,
        sorted by descending confidence.
    """
    print("\n" + "=" * 50)
    print("Generating Association Rules")
    print("=" * 50)

    print(f"\nMinimum confidence threshold: {min_confidence}")

    rules = []

    # Generate rules from itemsets with 2 or more items
    for itemset, support in frequent_itemsets.items():
        if len(itemset) < 2:
            continue

        # A frozenset iterates in hash order, which changes between runs
        # for strings. Sorting fixes the item order inside each rule and
        # the relative order of rules that tie on confidence.
        items = sorted(itemset)
        for size in range(1, len(items)):
            for antecedent_items in combinations(items, size):
                antecedent = list(antecedent_items)
                chosen = set(antecedent_items)
                consequent = [item for item in items if item not in chosen]

                confidence = calculate_confidence(transactions, antecedent, consequent)

                if confidence >= min_confidence:
                    lift = calculate_lift(transactions, antecedent, consequent)

                    rules.append({
                        'antecedent': antecedent,
                        'consequent': consequent,
                        'support': support,
                        'confidence': confidence,
                        'lift': lift
                    })

    # Stable sort: rules with equal confidence keep their generation order.
    rules.sort(key=lambda rule: rule['confidence'], reverse=True)

    print(f"\nTotal rules found: {len(rules)}")
    print("\nTop Association Rules:")
    print("-" * 80)
    print(f"{'Rule':<40} {'Support':<10} {'Confidence':<12} {'Lift':<10}")
    print("-" * 80)

    for rule in rules[:10]:
        antecedent_str = ', '.join(rule['antecedent'])
        consequent_str = ', '.join(rule['consequent'])
        rule_str = f"{{{antecedent_str}}} → {{{consequent_str}}}"
        print(f"{rule_str:<40} {rule['support']:.3f}{'':<5} "
              f"{rule['confidence']:.3f}{'':<7} {rule['lift']:.3f}")

    return rules


In [None]:
def visualize_rules(rules):
    """Plot the given rules and save the figure as a PNG.

    The figure has two panels: a support-vs-confidence scatter coloured by
    lift, and a horizontal bar chart of the (up to) eight rules with the
    highest lift. The image is written to 'lab12_association_rules.png' in
    the current working directory. Nothing is plotted when *rules* is empty.

    Args:
        rules: List of rule dicts with 'antecedent', 'consequent',
            'support', 'confidence' and 'lift' keys.
    """
    print("\n" + "=" * 50)
    print("Visualizing Association Rules")
    print("=" * 50)

    if not rules:
        print("\nNo rules to visualize")
        return

    # Pull each metric into its own column for plotting.
    x_support = [r['support'] for r in rules]
    y_confidence = [r['confidence'] for r in rules]
    colour_lift = [r['lift'] for r in rules]

    plt.figure(figsize=(12, 5))

    # Left panel: support vs confidence, coloured by lift.
    plt.subplot(1, 2, 1)
    points = plt.scatter(x_support, y_confidence, c=colour_lift,
                         cmap='viridis', s=100, alpha=0.6,
                         edgecolors='black')
    plt.colorbar(points, label='Lift')
    plt.xlabel('Support')
    plt.ylabel('Confidence')
    plt.title('Association Rules: Support vs Confidence')
    plt.grid(True, alpha=0.3)

    # Right panel: strongest rules by lift.
    plt.subplot(1, 2, 2)
    best_by_lift = sorted(rules, key=lambda r: r['lift'], reverse=True)[:8]

    bar_labels = []
    for r in best_by_lift:
        # Labels are shortened to at most two antecedent items and one
        # consequent item so they fit beside the bars.
        lhs = ', '.join(r['antecedent'][:2])
        rhs = ', '.join(r['consequent'][:1])
        bar_labels.append(f"{{{lhs}}} → {{{rhs}}}")
    bar_values = [r['lift'] for r in best_by_lift]
    positions = range(len(bar_labels))

    plt.barh(positions, bar_values, color='skyblue', edgecolor='black')
    plt.yticks(positions, bar_labels, fontsize=8)
    plt.xlabel('Lift')
    plt.title('Top 8 Rules by Lift')
    plt.grid(True, alpha=0.3, axis='x')

    plt.tight_layout()
    plt.savefig('lab12_association_rules.png', dpi=100)
    plt.close()
    print("\nAssociation rules plot saved as 'lab12_association_rules.png'")


In [None]:
def analyze_item_frequency(transactions=None):
    """Print and plot how often each item occurs in the transactions.

    Items are listed from most to least frequent (ties keep first-seen
    order) with their occurrence count and support, and a bar chart is
    saved to 'lab12_item_frequency.png' in the current working directory.

    Args:
        transactions: Optional collection of transactions (iterables of
            items) to analyze. When omitted, the sample data from
            ``create_transaction_data()`` is used, which is what the
            function always did before this parameter existed.
    """
    # Local import keeps this function self-contained; the module's import
    # block does not provide Counter.
    from collections import Counter

    print("\n" + "=" * 50)
    print("Item Frequency Analysis")
    print("=" * 50)

    if transactions is None:
        transactions = create_transaction_data()

    # Count every occurrence of every item across all transactions.
    item_counts = Counter(item for transaction in transactions
                          for item in transaction)

    # most_common() sorts by descending count and is stable for ties.
    sorted_items = item_counts.most_common()
    total = len(transactions)

    print("\nItem Frequencies:")
    for item, count in sorted_items:
        support = count / total
        print(f"{item}: {count} transactions (support = {support:.3f})")

    # Visualize
    items = [item for item, _ in sorted_items]
    counts = [count for _, count in sorted_items]

    plt.figure(figsize=(10, 6))
    plt.bar(items, counts, color='lightblue', edgecolor='black')
    plt.xlabel('Items')
    plt.ylabel('Frequency')
    plt.title('Item Frequency in Transactions')
    plt.grid(True, alpha=0.3, axis='y')

    # Write each count just above its bar.
    for position, count in enumerate(counts):
        plt.text(position, count + 0.1, str(count), ha='center')

    plt.tight_layout()
    plt.savefig('lab12_item_frequency.png')
    plt.close()
    print("\nItem frequency plot saved as 'lab12_item_frequency.png'")


In [None]:
def interpret_metrics():
    """Print a short reference guide to support, confidence, and lift."""
    banner = "=" * 50
    print("\n" + banner)
    print("Understanding Metrics")
    print(banner)

    # Each section is a heading followed by its explanatory bullet lines.
    guide = (
        ("1. SUPPORT:", (
            "   - Measures how frequently an itemset appears in the dataset",
            "   - Support(A) = (Transactions containing A) / (Total transactions)",
            "   - Higher support = more frequent itemset",
        )),
        ("2. CONFIDENCE:", (
            "   - Measures the likelihood that B is purchased when A is purchased",
            "   - Confidence(A → B) = Support(A ∪ B) / Support(A)",
            "   - Higher confidence = stronger rule",
        )),
        ("3. LIFT:", (
            "   - Measures how much more likely B is purchased when A is purchased",
            "   - Lift(A → B) = Confidence(A → B) / Support(B)",
            "   - Lift > 1: A and B are positively correlated",
            "   - Lift = 1: A and B are independent",
            "   - Lift < 1: A and B are negatively correlated",
        )),
    )

    for heading, bullet_lines in guide:
        print("\n" + heading)
        for line in bullet_lines:
            print(line)


In [None]:
def real_world_example():
    """Run a small supermarket market-basket analysis and print the findings.

    Uses fifteen hard-coded transactions, finds the frequent 1- and
    2-itemsets at a minimum support of 0.25, prints the frequent pairs from
    highest to lowest support, and reports two hand-picked rules when their
    confidence is at least 0.5.
    """
    print("\n" + "=" * 50)
    print("Real-World Example: Supermarket Analysis")
    print("=" * 50)

    # Create more realistic transaction data
    transactions = [
        ['Bread', 'Milk', 'Eggs', 'Butter'],
        ['Bread', 'Butter', 'Jam'],
        ['Milk', 'Eggs', 'Cheese'],
        ['Bread', 'Milk', 'Butter'],
        ['Coffee', 'Sugar', 'Milk'],
        ['Bread', 'Butter', 'Eggs'],
        ['Tea', 'Sugar', 'Milk'],
        ['Bread', 'Milk', 'Butter', 'Cheese'],
        ['Coffee', 'Sugar', 'Cream'],
        ['Bread', 'Butter'],
        ['Milk', 'Eggs', 'Butter'],
        ['Bread', 'Milk', 'Eggs'],
        ['Tea', 'Sugar'],
        ['Coffee', 'Milk', 'Sugar'],
        ['Bread', 'Butter', 'Milk']
    ]

    print(f"\nAnalyzing {len(transactions)} supermarket transactions")

    min_support = 0.25

    # Sorted so that pair generation (and therefore the printed output)
    # does not depend on set iteration order, which varies between runs.
    all_items = sorted({item for transaction in transactions
                        for item in transaction})

    # Find frequent 1-itemsets and 2-itemsets.
    frequent_itemsets = {}
    for size in (1, 2):
        for combo in combinations(all_items, size):
            support = calculate_support(transactions, list(combo))
            if support >= min_support:
                frequent_itemsets[frozenset(combo)] = support

    print(f"\nFrequent itemsets (min_support={min_support}):")
    frequent_pairs = [(sorted(itemset), support)
                      for itemset, support in frequent_itemsets.items()
                      if len(itemset) > 1]
    # Highest support first; ties broken alphabetically for stable output.
    frequent_pairs.sort(key=lambda entry: (-entry[1], entry[0]))
    for items, support in frequent_pairs:
        label = ", ".join(repr(item) for item in items)
        print(f"  {{{label}}}: {support:.3f}")

    # Generate rules
    print("\nKey Insights:")

    # Specific rule examples; each confidence is computed once and reused.
    conf = calculate_confidence(transactions, ['Bread'], ['Butter'])
    if conf >= 0.5:
        print(f"  • Customers who buy Bread often buy Butter (confidence: {conf:.3f})")

    conf = calculate_confidence(transactions, ['Coffee', 'Sugar'], ['Milk'])
    if conf >= 0.5:
        print(f"  • Coffee and Sugar buyers often add Milk (confidence: {conf:.3f})")


In [None]:
def main():
    """Run the complete lab walkthrough from sample data to final summary.

    Steps, in order: show a preview of the sample transactions, analyze
    item frequency, mine frequent itemsets (min support 0.3), generate
    rules (min confidence 0.6), plot them, print the metric guide, and run
    the supermarket example.
    """
    print("\n" + "=" * 50)
    print("Lab 12: Association Rule Mining")
    print("=" * 50)

    # Create transaction data
    transactions = create_transaction_data()

    preview_count = 5
    print("\nSample Transactions:")
    for i, transaction in enumerate(transactions[:preview_count], 1):
        print(f"  Transaction {i}: {transaction}")
    # Only mention leftovers when there are some; with fewer transactions
    # than the preview size the count would otherwise be zero or negative.
    remaining = len(transactions) - preview_count
    if remaining > 0:
        print(f"  ... and {remaining} more transactions")

    # Analyze item frequency
    analyze_item_frequency()

    # Find frequent itemsets
    frequent_itemsets = get_frequent_itemsets(transactions, min_support=0.3)

    # Generate association rules
    rules = generate_association_rules(transactions, frequent_itemsets,
                                       min_confidence=0.6)

    # Visualize rules
    visualize_rules(rules)

    # Interpret metrics
    interpret_metrics()

    # Real-world example
    real_world_example()

    print("\n" + "=" * 50)
    print("Lab 12 Complete!")
    print("=" * 50)


In [None]:
# Run the lab walkthrough only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
