DATA GENERATION FOR SUPERMARKET TRANSACTIONS
Contributor: Abdiqalaq

In [None]:
# Section 1: Generate Transaction Data
# ---------------------------------------------------------------
import random
import pandas as pd

# Create a pool of at least 30 supermarket items
item_pool = [
    "Bread", "Milk", "Eggs", "Butter", "Cheese", "Apples", "Bananas", "Chicken",
    "Rice", "Pasta", "Tomatoes", "Onions", "Cereal", "Yogurt", "Juice", "Beef",
    "Fish", "Salt", "Sugar", "Tea", "Coffee", "Chips", "Soda", "Water",
    "Flour", "Oil", "Spices", "Ice Cream", "Chocolate", "Lettuce"
]

# Generate at least 3000 transactions (each with 2-7 items)
num_transactions = 3000
transaction_list = []

for _ in range(num_transactions):
    transaction_size = random.randint(2, 7)
    transaction = random.sample(item_pool, transaction_size)
    transaction_list.append(transaction)

print(f" Generated {num_transactions} supermarket transactions successfully!")

Generated 3000 supermarket transactions successfully!


One hot encoder ~ Bradley Ochola 346

In [18]:
# Section 2: One-Hot Encoding (Preprocessing)
# ---------------------------------------------------------------
from mlxtend.preprocessing import TransactionEncoder

# Initialize and apply TransactionEncoder
te = TransactionEncoder()
to_data = te.fit_transform(transaction_list)  # Use the list, not DataFrame

# Convert to DataFrame
transactions = pd.DataFrame(to_data, columns=te.columns_)

print("\n One-hot encoding completed!")
print(f"Shape of encoded data: {transactions.shape}")
print("\nSample of the one-hot encoded transactions:")
print(transactions.head(10))


 One-hot encoding completed!
Shape of encoded data: (3000, 30)

Sample of the one-hot encoded transactions:
   Apples  Bananas   Beef  Bread  Butter  Cereal  Cheese  Chicken  Chips  \
0   False     True  False  False    True   False   False    False  False   
1   False    False  False   True   False   False   False    False  False   
2   False    False   True  False   False   False   False     True  False   
3   False    False  False  False    True    True   False    False  False   
4    True    False  False  False   False   False   False    False   True   
5   False    False  False  False   False   False   False    False  False   
6   False     True  False  False   False   False   False    False  False   
7   False    False  False  False   False   False   False    False  False   
8    True    False  False  False   False    True   False    False  False   
9   False    False  False   True   False   False    True    False  False   

   Chocolate  ...  Pasta   Rice   Salt   Soda  Spices 

In [19]:
from mlxtend.frequent_patterns import apriori


print("\n" + "="*60)
print("Generating Frequent Itemsets using Apriori Algorithm...")
print("="*60)

min_support = 0.05  # 5% minimum support threshold
frequent_itemsets = apriori(transactions, min_support=min_support, use_colnames=True)


# frequent_itemsets = frequent_itemsets.sort_values('support', ascending=False).reset_index(drop=True)


print(f"\nTotal number of frequent itemsets found: {len(frequent_itemsets)}")


print("\n" + "="*60)
print("TOP 10 FREQUENT ITEMSETS")
print("="*60)
print(frequent_itemsets.head(10).to_string(index=False))

top_10_itemsets = frequent_itemsets.head(10)
top_10_itemsets.to_csv("top_10_frequent_itemsets.csv", index=False)
print("\n Top 10 frequent itemsets exported to 'top_10_frequent_itemsets.csv'")


print("\n" + "="*60)
print("STATISTICS")
print("="*60)
print(f"Minimum Support Used: {min_support}")
print(f"Highest Support: {frequent_itemsets['support'].max():.4f}")
print(f"Average Support: {frequent_itemsets['support'].mean():.4f}")
print(f"Lowest Support: {frequent_itemsets['support'].min():.4f}")


print("\nItemset Size Distribution:")
itemset_sizes = frequent_itemsets['itemsets'].apply(lambda x: len(x))
print(itemset_sizes.value_counts().sort_index())


Generating Frequent Itemsets using Apriori Algorithm...

Total number of frequent itemsets found: 30

TOP 10 FREQUENT ITEMSETS
 support    itemsets
0.153333    (Apples)
0.142000   (Bananas)
0.154000      (Beef)
0.157667     (Bread)
0.140333    (Butter)
0.160000    (Cereal)
0.156000    (Cheese)
0.154667   (Chicken)
0.152667     (Chips)
0.150667 (Chocolate)

 Top 10 frequent itemsets exported to 'top_10_frequent_itemsets.csv'

STATISTICS
Minimum Support Used: 0.05
Highest Support: 0.1600
Average Support: 0.1507
Lowest Support: 0.1370

Itemset Size Distribution:
itemsets
1    30
Name: count, dtype: int64


In [17]:
# ---------------------------------------------------------------
# Section 1: Generate Transaction Data
# ---------------------------------------------------------------
import random
import pandas as pd

# Create a pool of at least 30 supermarket items
item_pool = [
    "Bread", "Milk", "Eggs", "Butter", "Cheese", "Apples", "Bananas", "Chicken",
    "Rice", "Pasta", "Tomatoes", "Onions", "Cereal", "Yogurt", "Juice", "Beef",
    "Fish", "Salt", "Sugar", "Tea", "Coffee", "Chips", "Soda", "Water",
    "Flour", "Oil", "Spices", "Ice Cream", "Chocolate", "Lettuce"
]

# Generate at least 3000 transactions (each with 2-7 items)
num_transactions = 3000
transaction_list = []

for _ in range(num_transactions):
    transaction_size = random.randint(2, 7)
    transaction = random.sample(item_pool, transaction_size)
    transaction_list.append(transaction)

print(f"✅ Generated {num_transactions} supermarket transactions successfully!")

# ---------------------------------------------------------------
# Section 2: One-Hot Encoding (Preprocessing)
# ---------------------------------------------------------------
from mlxtend.preprocessing import TransactionEncoder

# Initialize and apply TransactionEncoder
te = TransactionEncoder()
to_data = te.fit_transform(transaction_list)  # Use the list, not DataFrame

# Convert to DataFrame
transactions = pd.DataFrame(to_data, columns=te.columns_)

print("\n✅ One-hot encoding completed!")
print(f"Shape of encoded data: {transactions.shape}")
print("\nSample of the one-hot encoded transactions:")
print(transactions.head(10))

# ---------------------------------------------------------------
# Section 3: Generate Frequent Itemsets using Apriori Algorithm
# Author: [Your Name Here]
# ---------------------------------------------------------------
from mlxtend.frequent_patterns import apriori

print("\n" + "="*60)
print("Generating Frequent Itemsets using Apriori Algorithm...")
print("="*60)

# Apply Apriori with minimum support of 0.05
min_support = 0.05
frequent_itemsets = apriori(transactions, min_support=min_support, use_colnames=True)

# Sort by support in descending order
frequent_itemsets = frequent_itemsets.sort_values('support', ascending=False).reset_index(drop=True)

print(f"\nTotal number of frequent itemsets found: {len(frequent_itemsets)}")

# Display top 10
print("\n" + "="*60)
print("TOP 10 FREQUENT ITEMSETS")
print("="*60)
print(frequent_itemsets.head(10).to_string(index=False))

# Export to CSV
top_10_itemsets = frequent_itemsets.head(10)
top_10_itemsets.to_csv("top_10_frequent_itemsets.csv", index=False)
print("\n✅ Top 10 frequent itemsets exported to 'top_10_frequent_itemsets.csv'")

# Display statistics
print("\n" + "="*60)
print("STATISTICS")
print("="*60)
print(f"Minimum Support Used: {min_support}")
print(f"Highest Support: {frequent_itemsets['support'].max():.4f}")
print(f"Average Support: {frequent_itemsets['support'].mean():.4f}")
print(f"Lowest Support: {frequent_itemsets['support'].min():.4f}")

✅ Generated 3000 supermarket transactions successfully!

✅ One-hot encoding completed!
Shape of encoded data: (3000, 30)

Sample of the one-hot encoded transactions:
   Apples  Bananas   Beef  Bread  Butter  Cereal  Cheese  Chicken  Chips  \
0   False     True  False  False    True   False   False    False  False   
1   False    False  False   True   False   False   False    False  False   
2   False    False   True  False   False   False   False     True  False   
3   False    False  False  False    True    True   False    False  False   
4    True    False  False  False   False   False   False    False   True   
5   False    False  False  False   False   False   False    False  False   
6   False     True  False  False   False   False   False    False  False   
7   False    False  False  False   False   False   False    False  False   
8    True    False  False  False   False    True   False    False  False   
9   False    False  False   True   False   False    True    False  False  