In [None]:
# Importing required libraries
import pandas as pd # Data manipulation library
import random 

# Catalogue of the 30 distinct products a shopper can put in a basket.
# The order matters: random.sample draws by position, so reordering this list
# would change the generated dataset.
item_pool = [
    'Milk', 'Bread', 'Eggs', 'Butter', 'Cheese', 'Apples',
    'Bananas', 'Chicken', 'Beef', 'Fish', 'Yogurt', 'Juice',
    'Cereal', 'Rice', 'Pasta', 'Tomatoes', 'Potatoes', 'Onions',
    'Lettuce', 'Carrots', 'Beans', 'Peas', 'Toilet Paper', 'Soap',
    'Shampoo', 'Toothpaste', 'Water', 'Soda', 'Coffee', 'Tea',
]

# Fix the RNG state so every run yields the same synthetic dataset
random.seed(42)

# Build 3000 baskets: each one draws a basket size between 2 and 7, samples
# that many distinct items, and is stored as a single comma-separated string
transactions = [
    ', '.join(random.sample(item_pool, random.randint(2, 7)))
    for _ in range(3000)
]

# Persist the baskets as a one-column CSV (no index column)
df_transactions = pd.DataFrame({'Transaction': transactions})
df_transactions.to_csv('supermarket_transactions.csv', index=False)

# Show the first 5 baskets as a quick sanity check
print(df_transactions.head())


                                         Transaction
0  Butter, Milk, Soap, Beef, Chicken, Toothpaste,...
1    Butter, Peas, Soap, Onions, Eggs, Lettuce, Rice
2                                         Milk, Eggs
3                         Chicken, Potatoes, Carrots
4                                    Onions, Bananas


In [4]:
import pandas as pd
from itertools import combinations
import csv

# Read back the single-column CSV of baskets
df_transactions = pd.read_csv('supermarket_transactions.csv')

# Break every comma-separated basket string into its individual items,
# producing a list of lists such as ['Milk', 'Bread', 'Eggs']
transactions = []
for basket in df_transactions['Transaction']:
    transactions.append(basket.split(', '))

# Total number of baskets in the dataset
total_transactions = len(transactions)

# Minimum support threshold: an itemset must appear in at least 5% of baskets
min_support = 0.05

# Helper function to get all unique items in the dataset
def get_unique_items(transactions):
    """Return every distinct item that occurs in any transaction, sorted.

    Args:
        transactions: iterable of transactions, each an iterable of items.

    Returns:
        A sorted list of the distinct items (empty if there are no items).
    """
    # A set comprehension de-duplicates; sorting makes the order deterministic
    distinct = {item for basket in transactions for item in basket}
    return sorted(distinct)

# Helper function to calculate support for a given itemset
def calculate_support(itemset, transactions):
    """Return the fraction of `transactions` that contain every item of `itemset`.

    The denominator is the number of transactions actually passed in, so the
    result is correct for any transaction collection, not only the one a
    module-level counter happens to describe.

    Args:
        itemset: iterable of items that must all be present in a transaction.
        transactions: iterable of transactions, each an iterable of items.

    Returns:
        Support as a float in [0, 1]; 0.0 when `transactions` is empty.
    """
    # Build the lookup set once instead of once per transaction
    required = set(itemset)
    total = 0
    hits = 0
    for trans in transactions:
        total += 1
        # issubset accepts any iterable, so the transaction need not be a set
        if required.issubset(trans):
            hits += 1
    # Guard against division by zero on an empty dataset
    if total == 0:
        return 0.0
    return hits / total

# Helper function to generate candidate itemsets of size k
def generate_candidates(prev_frequent, k):
    """Join frequent (k-1)-itemsets pairwise into candidate k-itemsets.

    Two itemsets are joined when k == 2 (any two single items) or when they
    agree on everything except their last element. The comparison is
    positional, so the prefix test is only meaningful if the tuples in
    `prev_frequent` are consistently ordered; the candidates produced here are
    sorted tuples.

    Args:
        prev_frequent: sequence of (k-1)-item tuples.
        k: size of the candidates to build.

    Returns:
        List of sorted k-item tuples, in pair-enumeration order.
    """
    joined = []
    for pos, left in enumerate(prev_frequent):
        # Pair each itemset only with the ones that come after it
        for right in prev_frequent[pos + 1:]:
            if k != 2 and left[:-1] != right[:-1]:
                continue  # prefixes differ, so the pair cannot be joined
            merged = tuple(sorted(set(left) | set(right)))
            # Keep the union only if it has exactly k distinct items
            if len(merged) == k:
                joined.append(merged)
    return joined

# Custom Apriori algorithm implementation
def apriori(transactions, min_support):
    """Mine all frequent itemsets level by level (Apriori).

    Starts from single items, keeps those whose support reaches
    `min_support`, then repeatedly joins the surviving (k-1)-itemsets into
    k-item candidates until a level yields no frequent itemset.

    Args:
        transactions: list of transactions, each a list of items.
        min_support: minimum support (fraction of transactions) to keep an
            itemset.

    Returns:
        List of (itemset_tuple, support) pairs, ordered by itemset size and,
        within a size, by candidate generation order.
    """
    def keep_frequent(candidates):
        # Compute each candidate's support exactly once; every call is a full
        # pass over the dataset, so evaluating it twice doubles the run time.
        kept = []
        for itemset in candidates:
            support = calculate_support(itemset, transactions)
            if support >= min_support:
                kept.append((itemset, support))
        return kept

    # Accumulates the (itemset, support) pairs of every level
    frequent_itemsets = []

    # Level 1: every distinct item as a one-element tuple
    current_level = keep_frequent(
        [(item,) for item in get_unique_items(transactions)]
    )

    # Levels 2, 3, ...: stop as soon as a level has no frequent itemset
    k = 2
    while current_level:
        frequent_itemsets.extend(current_level)
        prev_frequent = [itemset for itemset, _ in current_level]
        current_level = keep_frequent(generate_candidates(prev_frequent, k))
        k += 1

    return frequent_itemsets

# Mine the frequent itemsets from the loaded baskets
frequent_itemsets = apriori(transactions, min_support)

# Tabulate the (itemset, support) pairs and render each itemset tuple as a
# readable comma-separated string
frequent_itemsets_df = pd.DataFrame(frequent_itemsets, columns=['itemsets', 'support'])
frequent_itemsets_df['itemsets'] = frequent_itemsets_df['itemsets'].map(', '.join)

# Rank by support, highest first, and keep the first 10 rows
ranked_by_support = frequent_itemsets_df.sort_values(by='support', ascending=False)
top_10_itemsets = ranked_by_support.head(10)

# Show the ranking
print("Top 10 Frequent Itemsets:")
print(top_10_itemsets)

# Write the same 10 rows to disk (no index column)
top_10_itemsets.to_csv('top_10_frequent_itemsets.csv', index=False)
print("\nTop 10 frequent itemsets exported to 'top_10_frequent_itemsets.csv'")

Top 10 Frequent Itemsets:
        itemsets   support
18          Peas  0.170000
5         Butter  0.162333
26      Tomatoes  0.161000
2          Beans  0.159333
8         Cheese  0.159000
25  Toilet Paper  0.156667
20          Rice  0.154333
22          Soap  0.154000
17         Pasta  0.154000
7         Cereal  0.152667

Top 10 frequent itemsets exported to 'top_10_frequent_itemsets.csv'


In [6]:
# [Student: Ziza 
# 4. Identify Closed Frequent Itemsets]
import pandas as pd

# Dictionary view of the mining results (itemset tuple -> support)
support_lookup = dict(frequent_itemsets)

# A frequent itemset is closed when no frequent proper superset has exactly
# the same support; `<` on sets is the proper-subset test
closed_itemsets = [
    (itemset, support)
    for itemset, support in frequent_itemsets
    if not any(
        set(itemset) < set(other_itemset) and support == other_support
        for other_itemset, other_support in frequent_itemsets
    )
]

# Tabulate, rendering each itemset tuple as a comma-separated string
closed_itemsets_df = pd.DataFrame(closed_itemsets, columns=['itemsets', 'support'])
closed_itemsets_df['itemsets'] = closed_itemsets_df['itemsets'].map(', '.join)

# Write the table to disk (no index column)
closed_itemsets_df.to_csv('closed_itemsets.csv', index=False)

# Show the closed frequent itemsets
print("Closed Frequent Itemsets:")
print(closed_itemsets_df)

Closed Frequent Itemsets:
        itemsets   support
0         Apples  0.146000
1        Bananas  0.144333
2          Beans  0.159333
3           Beef  0.136667
4          Bread  0.147000
5         Butter  0.162333
6        Carrots  0.147667
7         Cereal  0.152667
8         Cheese  0.159000
9        Chicken  0.143667
10        Coffee  0.139333
11          Eggs  0.152000
12          Fish  0.152667
13         Juice  0.145000
14       Lettuce  0.146667
15          Milk  0.141333
16        Onions  0.152333
17         Pasta  0.154000
18          Peas  0.170000
19      Potatoes  0.141333
20          Rice  0.154333
21       Shampoo  0.146667
22          Soap  0.154000
23          Soda  0.142000
24           Tea  0.145000
25  Toilet Paper  0.156667
26      Tomatoes  0.161000
27    Toothpaste  0.144000
28         Water  0.145333
29        Yogurt  0.150333


In [None]:
# Identifying maximal frequent itemsets

def is_maximal(itemset, itemset_support, all_frequent_dict):
    """Tell whether `itemset` has no proper superset among the frequent itemsets.

    Args:
        itemset: iterable of items to test.
        itemset_support: support of `itemset`; accepted but not used by the
            check.
        all_frequent_dict: mapping (or any iterable) whose keys are the
            frequent itemsets.

    Returns:
        True if no key of `all_frequent_dict` strictly contains `itemset`,
        otherwise False.
    """
    candidate = frozenset(itemset)
    # `<` on sets is the proper-subset test: contained in, and strictly smaller
    has_frequent_superset = any(
        candidate < frozenset(other) for other in all_frequent_dict
    )
    return not has_frequent_superset

# Index the mining results by itemset (itemset tuple -> support)
all_frequent_dict = dict(frequent_itemsets)

# Keep only the itemsets that have no frequent proper superset
maximal_itemsets = [
    (itemset, support)
    for itemset, support in frequent_itemsets
    if is_maximal(itemset, support, all_frequent_dict)
]

# Tabulate, render each itemset tuple as a comma-separated string, and order
# the rows by support, highest first
maximal_df = (
    pd.DataFrame(maximal_itemsets, columns=["itemsets", "support"])
    .assign(itemsets=lambda frame: frame["itemsets"].apply(", ".join))
    .sort_values(by="support", ascending=False)
)

# Show the result
print("\nMaximal Frequent Itemsets:")
print(maximal_df)

# Write the table to disk (no index column)
maximal_df.to_csv("maximal_frequent_itemsets.csv", index=False)
print("\nMaximal frequent itemsets exported to 'maximal_frequent_itemsets.csv'")


Maximal Frequent Itemsets:
        itemsets   support
18          Peas  0.170000
5         Butter  0.162333
26      Tomatoes  0.161000
2          Beans  0.159333
8         Cheese  0.159000
25  Toilet Paper  0.156667
20          Rice  0.154333
22          Soap  0.154000
17         Pasta  0.154000
7         Cereal  0.152667
12          Fish  0.152667
16        Onions  0.152333
11          Eggs  0.152000
29        Yogurt  0.150333
6        Carrots  0.147667
4          Bread  0.147000
21       Shampoo  0.146667
14       Lettuce  0.146667
0         Apples  0.146000
28         Water  0.145333
13         Juice  0.145000
24           Tea  0.145000
1        Bananas  0.144333
27    Toothpaste  0.144000
9        Chicken  0.143667
23          Soda  0.142000
19      Potatoes  0.141333
15          Milk  0.141333
10        Coffee  0.139333
3           Beef  0.136667

Maximal frequent itemsets exported to 'maximal_frequent_itemsets.csv'
