In [2]:
import os
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## Project Part 1: Create Data

In [13]:
# Read the retailer catalogue; the code below uses its "Retailer" and "Item" columns.
data_path = "/Users/amanda/Documents/transactional_datasets/retailers.csv"
df = pd.read_csv(data_path)

# Map every retailer name to the list of its items, in the order they appear
# in the file.
retailers = {
    name: list(group)
    for name, group in df.groupby("Retailer")["Item"]
}

# Directory that receives one generated transaction CSV per retailer.
output_dir = "/Users/amanda/Documents/transactional_datasets/transactions"
os.makedirs(output_dir, exist_ok=True)

# Function to create deterministic transactions
def generate_transactions(items, n=20):
    """Build ``n`` deterministic transactions by sliding a window over ``items``.

    Transaction ``i`` starts at position ``i % len(items)`` and ends (exclusive)
    at ``(start + i % 5 + 3) % len(items)``; when the end index does not lie
    after the start index the window wraps around to the front of the list.

    Args:
        items: sequence of items to draw from (sliced and concatenated).
        n: number of transactions to produce.

    Returns:
        A list of ``n`` item lists.
    """
    total = len(items)

    def window(position):
        # Slice [lo, hi) of the item list, wrapping past the end when needed.
        lo = position % total
        hi = (lo + position % 5 + 3) % total
        return items[lo:hi] if lo < hi else items[lo:] + items[:hi]

    return [window(position) for position in range(n)]

# Write one CSV per retailer: a 1-based transaction id (as text) next to the
# basket rendered as a ", "-separated string.
for retailer, items in retailers.items():
    transactions = generate_transactions(items)
    id_column = [str(number) for number in range(1, len(transactions) + 1)]
    basket_column = [", ".join(basket) for basket in transactions]
    df_out = pd.DataFrame({"TransactionID": id_column, "ItemsPurchased": basket_column})
    df_out.to_csv(f"{output_dir}/{retailer}.csv", index=False)

print("Transactional datasets created in 'Documents/transactional_datasets'")


Transactional datasets created in 'Documents/transactional_datasets'


In [19]:
import time
import itertools

## Project Part 2: Brute Force Algorithm - A

In [None]:
#  Helper functions

def get_support(itemset, transactions):
    """Return the fraction of transactions that contain every item of ``itemset``.

    Args:
        itemset: set of items; must provide ``issubset``.
        transactions: sized collection of transactions (each a container of items).

    Returns:
        A float in [0, 1]. An empty ``transactions`` collection yields 0.0
        instead of raising ``ZeroDivisionError``.
    """
    total = len(transactions)
    if total == 0:
        # Nothing to count against; treat support as zero rather than dividing by 0.
        return 0.0
    count = sum(1 for t in transactions if itemset.issubset(t))
    return count / total

# brute force frequent itemset generation
def get_frequent_itemsets(transactions, min_support=0.3):
    """Brute-force search for frequent itemsets, one itemset size at a time.

    For size 1, 2, 3, ... every combination of the distinct items (taken in
    sorted order) is scored with ``get_support``. The search stops at the first
    size for which no combination reaches ``min_support``.

    Args:
        transactions: iterable of transactions, each an iterable of items.
        min_support: lowest support an itemset may have to be kept.

    Returns:
        List of ``(itemset, support)`` tuples, ordered by itemset size and, within
        a size, by the order in which the combinations are generated.
    """
    universe = sorted(set(itertools.chain.from_iterable(transactions)))
    found = []

    for size in itertools.count(1):
        level = []
        for combo in itertools.combinations(universe, size):
            members = set(combo)
            frequency = get_support(members, transactions)
            if frequency >= min_support:
                level.append((members, frequency))

        if not level:
            # No frequent itemset of this size: stop the search here.
            return found

        found.extend(level)

# generate association rules from frequent itemsets
def generate_association_rules(frequent_itemsets, transactions, min_confidence=0.6):
    """Derive association rules (antecedent -> consequent) from frequent itemsets.

    Every non-empty proper subset of each itemset with two or more items is
    tried as an antecedent; the remaining items form the consequent. A rule is
    kept when ``support(itemset) / support(antecedent)`` reaches
    ``min_confidence``.

    Args:
        frequent_itemsets: iterable of ``(itemset, support)`` pairs where
            ``itemset`` is a set of item names.
        transactions: collection of transactions, forwarded to ``get_support``.
        min_confidence: lowest confidence a rule may have to be kept.

    Returns:
        List of dicts with keys "Antecedent", "Consequent", "Support" and
        "Confidence". Item names are sorted and joined with ", "; the numbers
        are rounded to three decimals.
    """
    rules = []
    for itemset, support in frequent_itemsets:
        if len(itemset) < 2:
            continue  # a rule needs at least one item on each side
        # Enumerate antecedents from a sorted copy: iterating the set directly
        # yields an order that depends on string hashing, so the rule order (and
        # the CSV written from it) could differ from one run to the next.
        ordered_items = sorted(itemset)
        for size in range(1, len(ordered_items)):
            for combo in itertools.combinations(ordered_items, size):
                antecedent = set(combo)
                # size < len(itemset), so the consequent can never be empty.
                consequent = itemset - antecedent
                support_antecedent = get_support(antecedent, transactions)
                confidence = support / support_antecedent if support_antecedent > 0 else 0
                if confidence >= min_confidence:
                    rules.append({
                        "Antecedent": ", ".join(sorted(antecedent)),
                        "Consequent": ", ".join(sorted(consequent)),
                        "Support": round(support, 3),
                        "Confidence": round(confidence, 3)
                    })
    return rules


#  Paths
input_dir = "/Users/amanda/Documents/transactional_datasets/transactions"
output_dir = "/Users/amanda/Documents/transactional_datasets/results"
os.makedirs(output_dir, exist_ok=True)

#  Parameter combinations to test
# NOTE(review): this list is not read anywhere in this cell. The loop below uses
# the globals min_support / min_confidence, which this cell never assigns (they
# are assigned in the Part 3 and Part 4 cells), so neither is available when
# the notebook is run top to bottom on a fresh kernel -- confirm which
# thresholds are intended here.
parameter_sets = [
    {"support": 0.3, "confidence": 0.6},
    {"support": 0.2, "confidence": 0.5},
    {"support": 0.1, "confidence": 0.4}
]

# One record per retailer: result counts plus the three timings.
summary = []

# process all retailer CSVs
# NOTE(review): transactions_to_df, apriori, fpgrowth and association_rules are
# defined/imported in cells that appear further down in the notebook.
for filename in os.listdir(input_dir):
    if not filename.endswith(".csv"):
        continue

    retailer = os.path.splitext(filename)[0]
    filepath = os.path.join(input_dir, filename)
    print(f"\n=== Processing {retailer} ===")

    # Load transactions: each row's ", "-separated item string becomes a set.
    df = pd.read_csv(filepath)
    transactions = [set(t.split(", ")) for t in df["ItemsPurchased"]]
    df_onehot = transactions_to_df(transactions)

    #  Brute Force Timing
    # perf_counter() is a monotonic, high-resolution clock intended for measuring
    # short durations; time.time() follows the wall clock and can jump if the
    # system time is adjusted.
    start_bf = time.perf_counter()
    frequent_itemsets = get_frequent_itemsets(transactions, min_support=min_support)
    rules = generate_association_rules(frequent_itemsets, transactions, min_confidence=min_confidence)
    end_bf = time.perf_counter()
    brute_force_time = round(end_bf - start_bf, 3)

    #  Apriori Timing
    start_ap = time.perf_counter()
    apriori_itemsets = apriori(df_onehot, min_support=min_support, use_colnames=True)
    apriori_rules = association_rules(apriori_itemsets, metric="confidence", min_threshold=min_confidence)
    end_ap = time.perf_counter()
    apriori_time = round(end_ap - start_ap, 3)

    #  FP-Growth Timing
    start_fp = time.perf_counter()
    fpg_itemsets = fpgrowth(df_onehot, min_support=min_support, use_colnames=True)
    fpg_rules = association_rules(fpg_itemsets, metric="confidence", min_threshold=min_confidence)
    end_fp = time.perf_counter()
    fpg_time = round(end_fp - start_fp, 3)

    #  Save results
    # The itemset / antecedent / consequent collections are flattened to sorted,
    # ", "-joined strings before the frames are written to CSV.
    # Apriori
    apriori_itemsets["Itemset"] = apriori_itemsets["itemsets"].apply(lambda x: ", ".join(sorted(x)))
    apriori_itemsets = apriori_itemsets[["Itemset", "support"]]
    apriori_itemsets.to_csv(f"{output_dir}/{retailer}_apriori_itemsets.csv", index=False)

    apriori_rules["antecedents"] = apriori_rules["antecedents"].apply(lambda x: ", ".join(sorted(x)))
    apriori_rules["consequents"] = apriori_rules["consequents"].apply(lambda x: ", ".join(sorted(x)))
    apriori_rules = apriori_rules[["antecedents", "consequents", "support", "confidence", "lift"]]
    apriori_rules.to_csv(f"{output_dir}/{retailer}_apriori_rules.csv", index=False)

    # FP-Growth
    fpg_itemsets["Itemset"] = fpg_itemsets["itemsets"].apply(lambda x: ", ".join(sorted(x)))
    fpg_itemsets = fpg_itemsets[["Itemset", "support"]]
    fpg_itemsets.to_csv(f"{output_dir}/{retailer}_fpgrowth_itemsets.csv", index=False)

    fpg_rules["antecedents"] = fpg_rules["antecedents"].apply(lambda x: ", ".join(sorted(x)))
    fpg_rules["consequents"] = fpg_rules["consequents"].apply(lambda x: ", ".join(sorted(x)))
    fpg_rules = fpg_rules[["antecedents", "consequents", "support", "confidence", "lift"]]
    fpg_rules.to_csv(f"{output_dir}/{retailer}_fpgrowth_rules.csv", index=False)

    #  Add to summary
    # The brute-force itemsets/rules are only timed; they are not saved or counted here.
    summary.append({
        "Retailer": retailer,
        "Apriori Itemsets": len(apriori_itemsets),
        "Apriori Rules": len(apriori_rules),
        "FP-Growth Itemsets": len(fpg_itemsets),
        "FP-Growth Rules": len(fpg_rules),
        "Brute Force Time (s)": brute_force_time,
        "Apriori Time (s)": apriori_time,
        "FP-Growth Time (s)": fpg_time
    })

#  Summary Table
summary_df = pd.DataFrame(summary)
print("\n=== Summary Across All Retailers ===")
print(summary_df.to_string(index=False))

# Save to CSV
summary_df.to_csv(f"{output_dir}/summary.csv", index=False)

# --- Average Execution Time Across All Retailers ---
# Mean of each per-retailer timing column, rounded to milliseconds.
avg_times = {
    "Algorithm": ["Brute Force", "Apriori", "FP-Growth"],
    "Execution Time (s)": [
        round(summary_df["Brute Force Time (s)"].mean(), 3),
        round(summary_df["Apriori Time (s)"].mean(), 3),
        round(summary_df["FP-Growth Time (s)"].mean(), 3)
    ]
}
avg_times_df = pd.DataFrame(avg_times)

print("\n=== Average Execution Time (All Retailers) ===")
print(avg_times_df.to_string(index=False))
avg_times_df.to_csv(f"{output_dir}/execution_times.csv", index=False)

print(f"\nAll results saved in: {output_dir}")



=== Processing Amazon ===





=== Processing Costco ===

=== Processing Walmart ===





=== Processing Nike ===





=== Processing BestBuy ===





=== Summary Across All Retailers ===
Retailer  Apriori Itemsets  Apriori Rules  FP-Growth Itemsets  FP-Growth Rules  Brute Force Time (s)  Apriori Time (s)  FP-Growth Time (s)
  Amazon               310           3092                 310             3092                 0.135             2.828               0.077
  Costco               310           3092                 310             3092                 0.027             0.037               0.043
 Walmart               310           3092                 310             3092                 0.021             0.142               0.110
    Nike               310           3092                 310             3092                 0.046             0.030               0.387
 BestBuy               310           3092                 310             3092                 0.025             0.152               0.055

=== Average Execution Time (All Retailers) ===
  Algorithm  Execution Time (s)
Brute Force               0.051
    Apriori     

## Project Part 3: Brute Force with User-Specified Thresholds

In [None]:
# User-specified thresholds
# Each prompt falls back to the suggested default (0.3 / 0.6) when the user just
# presses Enter: the empty string is falsy, so `or` supplies the number instead.
# Any other text is handed to float(), which raises ValueError for non-numeric
# input; the values are not range-checked here.
min_support = float(input("Enter minimum support (e.g., 0.3): ") or 0.3)
min_confidence = float(input("Enter minimum confidence (e.g., 0.6): ") or 0.6)

# === Helper Functions ===

def get_support(itemset, transactions):
    """Return the fraction of transactions that contain every item of ``itemset``.

    Args:
        itemset: set of items; must provide ``issubset``.
        transactions: sized collection of transactions (each a container of items).

    Returns:
        A float in [0, 1]. An empty ``transactions`` collection yields 0.0
        instead of raising ``ZeroDivisionError``.
    """
    total = len(transactions)
    if total == 0:
        # Nothing to count against; treat support as zero rather than dividing by 0.
        return 0.0
    count = sum(1 for t in transactions if itemset.issubset(t))
    return count / total

def get_frequent_itemsets(transactions, min_support):
    """Brute-force search for frequent itemsets, one itemset size at a time.

    For size 1, 2, 3, ... every combination of the distinct items (taken in
    sorted order) is scored with ``get_support``. The search stops at the first
    size for which no combination reaches ``min_support``.

    Args:
        transactions: iterable of transactions, each an iterable of items.
        min_support: lowest support an itemset may have to be kept.

    Returns:
        List of ``(itemset, support)`` tuples, ordered by itemset size and, within
        a size, by the order in which the combinations are generated.
    """
    universe = sorted(set(itertools.chain.from_iterable(transactions)))
    found = []

    for size in itertools.count(1):
        level = []
        for combo in itertools.combinations(universe, size):
            members = set(combo)
            frequency = get_support(members, transactions)
            if frequency >= min_support:
                level.append((members, frequency))

        if not level:
            # No frequent itemset of this size: stop the search here.
            return found

        found.extend(level)

def generate_association_rules(frequent_itemsets, transactions, min_confidence):
    """Derive association rules (antecedent -> consequent) from frequent itemsets.

    Every non-empty proper subset of each itemset with two or more items is
    tried as an antecedent; the remaining items form the consequent. A rule is
    kept when ``support(itemset) / support(antecedent)`` reaches
    ``min_confidence``.

    Args:
        frequent_itemsets: iterable of ``(itemset, support)`` pairs where
            ``itemset`` is a set of item names.
        transactions: collection of transactions, forwarded to ``get_support``.
        min_confidence: lowest confidence a rule may have to be kept.

    Returns:
        List of dicts with keys "Antecedent", "Consequent", "Support" and
        "Confidence". Item names are sorted and joined with ", "; the numbers
        are rounded to three decimals.
    """
    rules = []
    for itemset, support in frequent_itemsets:
        if len(itemset) < 2:
            continue  # rules require at least 2 items
        # Enumerate antecedents from a sorted copy: iterating the set directly
        # yields an order that depends on string hashing, so the rule order (and
        # the CSV written from it) could differ from one run to the next.
        ordered_items = sorted(itemset)
        for size in range(1, len(ordered_items)):
            for combo in itertools.combinations(ordered_items, size):
                antecedent = set(combo)
                # size < len(itemset), so the consequent can never be empty.
                consequent = itemset - antecedent
                support_antecedent = get_support(antecedent, transactions)
                confidence = support / support_antecedent if support_antecedent > 0 else 0
                if confidence >= min_confidence:
                    rules.append({
                        "Antecedent": ", ".join(sorted(antecedent)),
                        "Consequent": ", ".join(sorted(consequent)),
                        "Support": round(support, 3),
                        "Confidence": round(confidence, 3)
                    })
    return rules

# === Process All 5 Retailer Databases ===
# Runs the brute-force miner on every CSV found in input_dir and writes the
# frequent itemsets and association rules for each retailer into output_dir.
summary = []

for filename in os.listdir(input_dir):
    if not filename.endswith(".csv"):
        continue

    retailer = os.path.splitext(filename)[0]
    file_path = os.path.join(input_dir, filename)

    print(f"\n=== Processing {retailer} ===")

    # Each row's ", "-separated item string becomes one transaction set.
    df = pd.read_csv(file_path)
    transactions = [set(t.split(", ")) for t in df["ItemsPurchased"]]

    # Step 1: Find frequent itemsets
    frequent_itemsets = get_frequent_itemsets(transactions, min_support)

    # Save frequent itemsets
    freq_data = [
        {"Itemset": ", ".join(sorted(itemset)), "Support": round(support, 3)}
        for itemset, support in frequent_itemsets
    ]
    # Explicit columns keep the CSV header present even when nothing is frequent;
    # a frame built from an empty list would otherwise be written without one.
    freq_df = pd.DataFrame(freq_data, columns=["Itemset", "Support"])
    freq_df.to_csv(f"{output_dir}/{retailer}_frequent_itemsets.csv", index=False)

    # Step 2: Generate association rules
    rules = generate_association_rules(frequent_itemsets, transactions, min_confidence)

    # Save association rules (same header guarantee when no rule qualifies)
    rules_df = pd.DataFrame(
        rules, columns=["Antecedent", "Consequent", "Support", "Confidence"]
    )
    rules_df.to_csv(f"{output_dir}/{retailer}_association_rules.csv", index=False)

    summary.append({
        "Retailer": retailer,
        "Total Frequent Itemsets": len(frequent_itemsets),
        "Total Rules": len(rules)
    })

# === Summary ===
# NOTE(review): other cells in this notebook also write summary.csv into
# output_dir, so the file reflects whichever of those cells ran last.
print("\n=== Summary Across All Retailers ===")
summary_df = pd.DataFrame(
    summary, columns=["Retailer", "Total Frequent Itemsets", "Total Rules"]
)
print(summary_df.to_string(index=False))
summary_df.to_csv(f"{output_dir}/summary.csv", index=False)

print(f"\nAll results saved in: {output_dir}")



=== Processing Amazon ===

=== Processing Costco ===

=== Processing Walmart ===

=== Processing Nike ===

=== Processing BestBuy ===

=== Summary Across All Retailers ===
Retailer  Total Frequent Itemsets  Total Rules
  Amazon                       40          100
  Costco                       40          100
 Walmart                       40          100
    Nike                       40          100
 BestBuy                       40          100

All results saved in: /Users/amanda/Documents/transactional_datasets/results


In [14]:
!pip install mlxtend


Collecting mlxtend
  Downloading mlxtend-0.23.4-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.4-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m16.9 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: mlxtend
Successfully installed mlxtend-0.23.4


In [None]:
import time
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules

In [7]:
#  Helper function: Convert transactions to one-hot encoded DataFrame 
def transactions_to_df(transactions):
    """One-hot encode transactions into a boolean DataFrame.

    Args:
        transactions: sequence of transactions, each a collection of item names.

    Returns:
        DataFrame with one row per transaction (index 0..n-1) and one column per
        distinct item, in sorted order. A cell is True when the transaction
        contains the item. The frame is boolean because mlxtend's frequent
        pattern functions expect boolean input and warn about other dtypes.
    """
    all_items = sorted(set(item for t in transactions for item in t))
    # Build whole columns at once instead of assigning row by row with .loc.
    columns = {item: [item in t for t in transactions] for item in all_items}
    return pd.DataFrame(columns, index=range(len(transactions)), dtype=bool)

# One record per retailer with the itemset / rule counts of both algorithms.
summary = []

# process all retailer CSVs
# NOTE(review): min_support and min_confidence are not assigned in this cell;
# they must already be set by an earlier-run cell.
for filename in os.listdir(input_dir):
    if not filename.endswith(".csv"):
        continue

    retailer = os.path.splitext(filename)[0]
    filepath = os.path.join(input_dir, filename)
    print(f"\n=== Processing {retailer} ===")

    # Load transactions: each row's ", "-separated item string becomes a set.
    df = pd.read_csv(filepath)
    transactions = [set(t.split(", ")) for t in df["ItemsPurchased"]]
    df_onehot = transactions_to_df(transactions)

    #  Apriori
    # Mine once and derive the rules from that same result before the itemsets
    # are flattened to strings, instead of running apriori a second time.
    apriori_itemsets = apriori(df_onehot, min_support=min_support, use_colnames=True)
    apriori_rules = association_rules(
        apriori_itemsets,
        metric="confidence",
        min_threshold=min_confidence
    )

    apriori_itemsets["Itemset"] = apriori_itemsets["itemsets"].apply(lambda x: ", ".join(sorted(x)))
    apriori_itemsets = apriori_itemsets[["Itemset", "support"]]
    apriori_itemsets.to_csv(f"{output_dir}/{retailer}_apriori_itemsets.csv", index=False)

    apriori_rules["antecedents"] = apriori_rules["antecedents"].apply(lambda x: ", ".join(sorted(x)))
    apriori_rules["consequents"] = apriori_rules["consequents"].apply(lambda x: ", ".join(sorted(x)))
    apriori_rules = apriori_rules[["antecedents", "consequents", "support", "confidence", "lift"]]
    apriori_rules.to_csv(f"{output_dir}/{retailer}_apriori_rules.csv", index=False)

    #  FP-Growth
    # Same pattern: one fpgrowth run feeds both the itemset and the rule output.
    fpg_itemsets = fpgrowth(df_onehot, min_support=min_support, use_colnames=True)
    fpg_rules = association_rules(
        fpg_itemsets,
        metric="confidence",
        min_threshold=min_confidence
    )

    fpg_itemsets["Itemset"] = fpg_itemsets["itemsets"].apply(lambda x: ", ".join(sorted(x)))
    fpg_itemsets = fpg_itemsets[["Itemset", "support"]]
    fpg_itemsets.to_csv(f"{output_dir}/{retailer}_fpgrowth_itemsets.csv", index=False)

    fpg_rules["antecedents"] = fpg_rules["antecedents"].apply(lambda x: ", ".join(sorted(x)))
    fpg_rules["consequents"] = fpg_rules["consequents"].apply(lambda x: ", ".join(sorted(x)))
    fpg_rules = fpg_rules[["antecedents", "consequents", "support", "confidence", "lift"]]
    fpg_rules.to_csv(f"{output_dir}/{retailer}_fpgrowth_rules.csv", index=False)

    summary.append({
        "Retailer": retailer,
        "Apriori Itemsets": len(apriori_itemsets),
        "Apriori Rules": len(apriori_rules),
        "FP-Growth Itemsets": len(fpg_itemsets),
        "FP-Growth Rules": len(fpg_rules)
    })

# summary
summary_df = pd.DataFrame(summary)
print("\n=== Summary Across All Retailers ===")
print(summary_df.to_string(index=False))
summary_df.to_csv(f"{output_dir}/summary.csv", index=False)

print(f"\nAll results saved in: {output_dir}")


=== Processing Amazon ===

=== Processing Costco ===

=== Processing Walmart ===





=== Processing Nike ===

=== Processing BestBuy ===

=== Summary Across All Retailers ===
Retailer  Apriori Itemsets  Apriori Rules  FP-Growth Itemsets  FP-Growth Rules
  Amazon                40            100                  40              100
  Costco                40            100                  40              100
 Walmart                40            100                  40              100
    Nike                40            100                  40              100
 BestBuy                40            100                  40              100

All results saved in: /Users/amanda/Documents/transactional_datasets/results




## Project Part 4: Execution & Input Parameters

In [8]:
#  Paths
input_dir = "/Users/amanda/Documents/transactional_datasets/transactions"
output_dir = "/Users/amanda/Documents/transactional_datasets/results"
os.makedirs(output_dir, exist_ok=True)

# part 4: Execution & Input Parameters
import sys

# Available datasets dynamically from the transactions folder
available_datasets = [
    f for f in os.listdir(input_dir) if f.endswith(".csv")
]

if not available_datasets:
    print("No datasets found in the transactions directory. Please run the data creation part first.")
    sys.exit(1)

print("\n=== Available Retailer Datasets ===")
for i, dataset in enumerate(available_datasets, start=1):
    print(f"{i}. {dataset}")

#  dataset selection
# Re-prompt until the user enters an integer within the listed range.
while True:
    try:
        choice = int(input("\nSelect a dataset by number: "))
        if 1 <= choice <= len(available_datasets):
            selected_dataset = available_datasets[choice - 1]
            break
        else:
            print(f"Please enter a number between 1 and {len(available_datasets)}.")
    except ValueError:
        print("Invalid input. Please enter a valid number.")

#  minimum support
# Accepts any float in (0, 1]; anything else triggers another prompt.
while True:
    try:
        min_support = float(input("Enter minimum support (e.g., 0.3): "))
        if 0 < min_support <= 1:
            break
        else:
            print("Support must be between 0 and 1.")
    except ValueError:
        print("Invalid input. Please enter a numeric value between 0 and 1.")

#  minimum confidence
# Same validation as for the support threshold.
while True:
    try:
        min_confidence = float(input("Enter minimum confidence (e.g., 0.6): "))
        if 0 < min_confidence <= 1:
            break
        else:
            print("Confidence must be between 0 and 1.")
    except ValueError:
        print("Invalid input. Please enter a numeric value between 0 and 1.")

# confirm input
print("\n=== Execution Summary ===")
print(f"Selected Dataset: {selected_dataset}")
print(f"Minimum Support: {min_support}")
print(f"Minimum Confidence: {min_confidence}")

# Load and process the selected dataset
selected_path = os.path.join(input_dir, selected_dataset)
df = pd.read_csv(selected_path)
transactions = [set(t.split(", ")) for t in df["ItemsPurchased"]]

# Run Apriori and FP-Growth using user-specified parameters
df_onehot = transactions_to_df(transactions)

# apriori
apriori_itemsets = apriori(df_onehot, min_support=min_support, use_colnames=True)
apriori_rules = association_rules(apriori_itemsets, metric="confidence", min_threshold=min_confidence)

# FP-Growth
fpg_itemsets = fpgrowth(df_onehot, min_support=min_support, use_colnames=True)
fpg_rules = association_rules(fpg_itemsets, metric="confidence", min_threshold=min_confidence)

# Save
# Strip only the file extension. str.replace('.csv', '') would also remove a
# ".csv" occurring in the middle of a file name; splitext matches how the
# retailer name is derived elsewhere in this notebook.
dataset_stem = os.path.splitext(selected_dataset)[0]
apriori_itemsets.to_csv(f"{output_dir}/{dataset_stem}_user_apriori_itemsets.csv", index=False)
apriori_rules.to_csv(f"{output_dir}/{dataset_stem}_user_apriori_rules.csv", index=False)
fpg_itemsets.to_csv(f"{output_dir}/{dataset_stem}_user_fpgrowth_itemsets.csv", index=False)
fpg_rules.to_csv(f"{output_dir}/{dataset_stem}_user_fpgrowth_rules.csv", index=False)

print("\nExecution complete. User-specified results saved in:")
print(output_dir)



=== Available Retailer Datasets ===
1. Amazon.csv
2. Costco.csv
3. Walmart.csv
4. Nike.csv
5. BestBuy.csv



=== Execution Summary ===
Selected Dataset: Amazon.csv
Minimum Support: 0.2
Minimum Confidence: 0.6

Execution complete. User-specified results saved in:
/Users/amanda/Documents/transactional_datasets/results


