In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Configuration
NUM_TRANSACTIONS = 1_000_000  # how many transactions the generator loops over
OUTPUT_FILE = "retail_transactions.csv"  # CSV path the generated rows are written to
# Catalogue that each simulated basket is sampled from.
PRODUCTS = [
    "Milk",
    "Bread",
    "Eggs",
    "Butter",
    "Cheese",
    "Beer",
    "Diapers",
    "Pasta",
    "Sauce",
    "Coke",
    "Chips",
]

def generate_simulated_data(num_transactions=None, output_file=None, products=None, seed=None):
    """Simulate retail baskets and write them to a CSV in long format.

    Each transaction gets a random timestamp within 2025 and a basket of
    1-5 distinct products (fewer if the catalogue is smaller). Two
    association patterns are injected: a basket containing "Diapers" gets
    "Beer" added with probability 0.7, and a basket containing "Pasta" gets
    "Sauce" added with probability 0.8. One CSV row is written per
    (transaction, item) pair with columns TransactionID, ItemName, Timestamp.

    Args:
        num_transactions: Number of transactions to generate. Defaults to
            the module-level ``NUM_TRANSACTIONS``.
        output_file: Destination CSV path. Defaults to the module-level
            ``OUTPUT_FILE``.
        products: Sequence of product names to sample from. Defaults to the
            module-level ``PRODUCTS``.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the same seed always produces the same file. When
            ``None`` the global ``random`` module is used, as before.

    Returns:
        None. The result is the CSV file written to ``output_file``.

    Raises:
        ValueError: If ``products`` is empty.
    """
    # Resolve defaults at call time so edits to the configuration cell are honoured.
    if num_transactions is None:
        num_transactions = NUM_TRANSACTIONS
    if output_file is None:
        output_file = OUTPUT_FILE
    if products is None:
        products = PRODUCTS
    if not products:
        raise ValueError("products must contain at least one item")

    # A private generator keeps seeded runs independent of global random state.
    rng = random if seed is None else random.Random(seed)

    start_date = datetime(2025, 1, 1)
    minutes_in_year = 365 * 24 * 60  # 525600
    # random.sample cannot draw more items than the catalogue holds.
    max_basket_size = min(5, len(products))

    print(f"Generating {num_transactions} transactions...")

    data = []
    for tx_id in range(num_transactions):
        # randrange excludes the upper bound, so the timestamp never reaches
        # 2026-01-01 00:00 (an inclusive bound of 525600 would allow it).
        timestamp = start_date + timedelta(minutes=rng.randrange(minutes_in_year))

        # Randomly decide basket size
        basket_size = rng.randint(1, max_basket_size)
        basket = rng.sample(products, basket_size)

        # Injecting Association Rules (Patterns for the AI to find)
        if "Diapers" in basket and rng.random() < 0.7:
            basket.append("Beer")  # High correlation: Diapers -> Beer
        if "Pasta" in basket and rng.random() < 0.8:
            basket.append("Sauce")  # High correlation: Pasta -> Sauce

        # dict.fromkeys de-duplicates while keeping insertion order; a set
        # would make row order depend on per-process string hashing.
        for item in dict.fromkeys(basket):
            data.append([tx_id, item, timestamp])

    df = pd.DataFrame(data, columns=["TransactionID", "ItemName", "Timestamp"])
    df.to_csv(output_file, index=False)
    print(f"Simulation complete. File saved as {output_file}")

# Run the simulation; this writes the CSV named by OUTPUT_FILE.
generate_simulated_data()

Generating 1000000 transactions...
Simulation complete. File saved as retail_transactions.csv


In [3]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules

# Step 1: read the simulated transaction log back from disk.
print('Loading data...')
df = pd.read_csv("retail_transactions.csv", parse_dates=["Timestamp"])

# Step 2: collapse the long table into one list of item names per transaction.
items_by_transaction = df.groupby("TransactionID")["ItemName"]
baskets = items_by_transaction.apply(list).tolist()

# Step 3: one-hot encode the baskets (one boolean column per product).
te = TransactionEncoder()
te_ary = te.fit_transform(baskets)
basket_df = pd.DataFrame(te_ary, columns=te.columns_)

# Step 4: mine frequent itemsets with FP-Growth (mlxtend).
# min_support=0.01 keeps itemsets present in at least 1% of all transactions.
freq_itemsets = (
    fpgrowth(basket_df, min_support=0.01, use_colnames=True)
    .sort_values("support", ascending=False)
)
print("### TOP FREQUENT ITEMSETS ###")
print(freq_itemsets.head(10))

# Step 5: derive association rules, keeping those with lift >= 1.0,
# strongest lift first.
rules = (
    association_rules(freq_itemsets, metric="lift", min_threshold=1.0)
    .sort_values("lift", ascending=False)
)
print("### GENERATED ASSOCIATION RULES ###")
print(rules.head(10))

# 6. Simple prediction: suggest consequents for baskets that match antecedents
from collections import Counter

def predict_next(basket, rules_df, top_k=3):
    """Suggest up to ``top_k`` items to add to ``basket`` from association rules.

    A rule applies when every item in its antecedent is already in the
    basket. Each applicable rule casts one vote for each of its consequent
    items, and the items with the most votes are returned.

    Items already in the basket are never suggested: recommending something
    the customer already has is not a useful "next" item.

    Args:
        basket: Iterable of item names currently in the basket.
        rules_df: DataFrame with ``antecedents`` and ``consequents`` columns
            holding collections of item names (as produced by
            ``association_rules``).
        top_k: Maximum number of suggestions to return.

    Returns:
        List of item names ordered by descending vote count. Ties keep the
        order in which the items were first encountered. Empty if no rule
        applies or ``rules_df`` has no rows.
    """
    if rules_df.empty:
        return []

    # Build the lookup set once instead of once per rule.
    basket_items = set(basket)
    votes = Counter()
    # Iterate the two needed columns directly; iterrows builds a Series per row.
    for antecedents, consequents in zip(rules_df['antecedents'], rules_df['consequents']):
        if basket_items.issuperset(antecedents):
            votes.update(item for item in consequents if item not in basket_items)
    return [item for item, _ in votes.most_common(top_k)]

# Show the recommender on the first few baskets (at most five).
print('\n### EXAMPLE PREDICTIONS ###')
for sample_basket in baskets[:5]:
    suggestions = predict_next(sample_basket, rules)
    print(sample_basket, '->', suggestions)

Loading data...
### TOP FREQUENT ITEMSETS ###
     support   itemsets
0   0.432838    (Sauce)
3   0.413041     (Beer)
1   0.273826    (Bread)
4   0.273115     (Coke)
8   0.273111    (Pasta)
10  0.273070  (Diapers)
7   0.272905     (Eggs)
2   0.272876   (Butter)
5   0.272722    (Chips)
9   0.272421     (Milk)
### GENERATED ASSOCIATION RULES ###
                  antecedents               consequents  antecedent support  \
80              (Sauce, Beer)          (Pasta, Diapers)            0.177106   
83           (Pasta, Diapers)             (Sauce, Beer)            0.073005   
81           (Sauce, Diapers)             (Beer, Pasta)            0.116692   
82              (Beer, Pasta)          (Sauce, Diapers)            0.111043   
655    (Diapers, Pasta, Coke)             (Sauce, Beer)            0.018319   
664             (Sauce, Beer)    (Diapers, Pasta, Coke)            0.177106   
755  (Diapers, Pasta, Butter)             (Sauce, Beer)            0.018199   
756             (Sauce