In [100]:
import pandas as pd
from itertools import combinations
from collections import defaultdict


### Take user inputs


In [101]:

# Path of the CSV file; it is handed to pd.read_csv when the dataset is loaded.
file_name = input("Enter the CSV file name (with .csv extension): ")
# Minimum confidence a rule must reach to be reported as strong
# (sample value: 0.2 -- presumably the one used for the recorded run; confirm).
confidence_threshold = float(input("Enter the confidence threshold (e.g., 0.20): "))
# Minimum number of transactions an itemset must appear in to be kept as frequent
# (sample value: 10 -- presumably the one used for the recorded run; confirm).
sup_count = int(input("Enter the minimum support count: "))

# Share of the rows to keep, as a percentage. Values outside (0, 100] make the
# loading step fall back to the full dataset. Note that float()/int() above and
# here raise ValueError on non-numeric input; nothing catches it.
percentage = float(input("Enter the percentage of the dataset to process (e.g., 50 for 50%): "))


### Load the dataset

In [102]:
# Read the raw CSV chosen by the user
df = pd.read_csv(file_name)

# Keep only the requested share of rows; anything outside (0, 100] is rejected
# and the full dataset is used instead. The fixed seed keeps the sample repeatable.
if not (0 < percentage <= 100):
    print("Invalid percentage! Using the full dataset.")
else:
    df = df.sample(frac=percentage / 100, random_state=42)

# Drop rows containing NaN values, then drop exact duplicate rows
df = df.dropna().drop_duplicates()



### Build transaction IDs and the vertical data format (item → transactions)

In [103]:


# Parse the 'Date' column into pandas datetimes
df['Date'] = pd.to_datetime(df['Date'])

# Rows sharing a member number and a date share one Transaction_ID,
# formatted as "<Member_number>_<Date>"
member_part = df['Member_number'].astype(str)
date_part = df['Date'].astype(str)
df['Transaction_ID'] = member_part + "_" + date_part

# Report how many distinct transactions the data contains
num_transactions = df['Transaction_ID'].nunique()
print("Number of unique transactions:", num_transactions)

# Vertical data format: item -> set of Transaction_IDs that contain it.
# Items are inserted in order of first appearance in the frame.
vertical_data = {}
for item, txn in zip(df['itemDescription'], df['Transaction_ID']):
    vertical_data.setdefault(item, set()).add(txn)

# Shallow copy of the unfiltered map, kept for the confidence calculation
one_item_vertical_data = vertical_data.copy()


Number of unique transactions: 14963


### Generate frequent 1-itemsets (filter items by minimum support count)

In [104]:

# Keep only the items whose transaction set reaches the minimum support count
filtered_vertical_data = {
    item: tids
    for item, tids in vertical_data.items()
    if len(tids) >= sup_count
}


### Generate the frequent itemsets

In [105]:

# Level-wise search for frequent itemsets on the vertical data format.
#
# Each pass joins pairs of frequent (max_k - 1)-itemsets into candidates of size
# max_k and obtains a candidate's support by intersecting the transaction sets
# of its two parents. Only the last non-empty level is kept in
# `filtered_vertical_data_copy`; the smaller frequent itemsets are discarded.

# Size of the itemsets produced by the next pass
max_k = 2

# Current level of frequent itemsets (starts with the frequent single items)
filtered_vertical_data_copy = filtered_vertical_data.copy()

# Continue generating itemsets until no more frequent sets are found
while True:
    prev_frequent_itemsets = list(filtered_vertical_data_copy.keys())

    # At least two itemsets are needed to form a join
    if len(prev_frequent_itemsets) < 2:
        break

    new_filtered_vertical_data = {}

    # From size 3 upward the same candidate is produced by several parent pairs,
    # and every such pair yields the same intersection. Remember which
    # candidates were already evaluated so each one is intersected only once.
    evaluated_candidates = set()

    for a, b in combinations(prev_frequent_itemsets, 2):

        if isinstance(a, tuple) and isinstance(b, tuple):
            # Two itemsets: the candidate is their sorted union
            merged_set = tuple(sorted(set(a) | set(b)))
        else:
            # Two single items: the candidate is the pair, in encounter order
            merged_set = (a, b)

        # Only parents that differ by exactly one item give a candidate of size max_k
        if len(merged_set) != max_k:
            continue

        # Skip candidates whose support was already determined in this pass
        if merged_set in evaluated_candidates:
            continue
        evaluated_candidates.add(merged_set)

        # Transactions containing the candidate = transactions shared by both parents
        common_transactions = filtered_vertical_data_copy[a] & filtered_vertical_data_copy[b]

        # Store only frequent itemsets
        if len(common_transactions) >= sup_count:
            new_filtered_vertical_data[merged_set] = common_transactions

    # Stop if no new frequent itemsets are found (the previous level is kept)
    if not new_filtered_vertical_data:
        break

    # Update for the next iteration
    filtered_vertical_data_copy = new_filtered_vertical_data
    max_k += 1



### Print the frequent itemsets (largest size found)

In [106]:
# Show every itemset of the final level, one per line
for key in filtered_vertical_data_copy:
    print(key)


('rolls/buns', 'tropical fruit', 'whole milk')
('soda', 'tropical fruit', 'whole milk')
('other vegetables', 'tropical fruit', 'whole milk')
('tropical fruit', 'whole milk', 'yogurt')
('frankfurter', 'other vegetables', 'whole milk')
('pastry', 'rolls/buns', 'whole milk')
('pastry', 'soda', 'whole milk')
('pastry', 'sausage', 'whole milk')
('other vegetables', 'pastry', 'whole milk')
('canned beer', 'rolls/buns', 'whole milk')
('rolls/buns', 'soda', 'whole milk')
('rolls/buns', 'sausage', 'whole milk')
('pip fruit', 'rolls/buns', 'whole milk')
('other vegetables', 'rolls/buns', 'whole milk')
('rolls/buns', 'whole milk', 'yogurt')
('bottled beer', 'rolls/buns', 'whole milk')
('citrus fruit', 'rolls/buns', 'whole milk')
('rolls/buns', 'sausage', 'soda')
('other vegetables', 'rolls/buns', 'soda')
('rolls/buns', 'shopping bags', 'soda')
('other vegetables', 'rolls/buns', 'sausage')
('sausage', 'soda', 'whole milk')
('bottled water', 'soda', 'whole milk')
('other vegetables', 'soda', 'whole

### Generate and print the strong association rules

In [107]:



# Function to enumerate the non-empty proper subsets of an itemset
def get_subsets(itemset):
    """Return every non-empty proper subset of ``itemset`` as a tuple.

    Subsets are listed by increasing size (1 up to ``len(itemset) - 1``) and,
    within one size, in the order produced by ``itertools.combinations``.
    The full itemset and the empty set are not included, so an itemset with
    fewer than two elements yields an empty list.
    """
    return [
        combo
        for size in range(1, len(itemset))
        for combo in combinations(itemset, size)
    ]



# Function to calculate the support count of an itemset
def calculate_support(itemset, filtered_vertical_data):
    """Count the transactions that contain every item of ``itemset``.

    Parameters
    ----------
    itemset : iterable
        The items whose joint support is wanted; must not be empty.
    filtered_vertical_data : dict
        Vertical data format, mapping each item to the set of transactions
        that contain it.

    Returns
    -------
    int
        Size of the intersection of the items' transaction sets (0 when the
        items never occur together).

    Raises
    ------
    KeyError
        If an item has no entry in ``filtered_vertical_data``.
    TypeError
        If ``itemset`` is empty (there is nothing to intersect).
    """
    # Collect the transaction set of each individual item
    tid_sets = []
    for item in itemset:
        tid_sets.append(filtered_vertical_data[item])

    # Transactions shared by all items
    shared = set.intersection(*tid_sets)
    return len(shared)


# Generate strong association rules
def generate_association_rules(filtered_vertical_data, item_transactions=None, min_confidence=None):
    """Derive the strong association rules ``X => Y`` from frequent itemsets.

    For every itemset with at least two items, each non-empty proper subset
    ``X`` is tried as the antecedent and the remaining items ``Y`` as the
    consequent. The rule is kept when
    ``confidence = support(X u Y) / support(X)`` reaches the threshold.

    Parameters
    ----------
    filtered_vertical_data : dict
        Maps each frequent itemset (a tuple of items) to the set of
        transactions containing it. Keys that are plain strings are single
        items and are skipped, since no rule can be built from one item.
    item_transactions : dict, optional
        Maps each single item to its set of transactions; used to compute
        ``support(X)``. Defaults to the notebook-level
        ``one_item_vertical_data``.
    min_confidence : float, optional
        Minimum confidence of a strong rule. Defaults to the notebook-level
        ``confidence_threshold``.

    Returns
    -------
    list of dict
        One dict per strong rule with the keys ``'Rule'`` (text
        ``"{X items} => {Y items}"`` with items in sorted order),
        ``'SupportAll'`` (support count of the whole itemset), ``'Support'``
        (support count of ``X``) and ``'Confidence'``.
    """
    # Fall back to the notebook globals so the original one-argument call still works
    if item_transactions is None:
        item_transactions = one_item_vertical_data
    if min_confidence is None:
        min_confidence = confidence_threshold

    def render(items):
        # Same look as a set repr, but in sorted order: the iteration order of a
        # set of strings can differ between kernel runs, which made the rule
        # text non-reproducible.
        return "{" + ", ".join(repr(member) for member in sorted(items, key=str)) + "}"

    rules = []

    # Iterate over all itemsets (frequent itemsets)
    for itemset, transactions in filtered_vertical_data.items():
        # When no frequent pair exists the keys are single items (strings).
        # len() of a string counts characters, so without this guard the
        # "subsets" would be characters and the support lookup would fail.
        if isinstance(itemset, str) or len(itemset) < 2:
            continue

        # Support for the entire itemset (X u Y) is the size of its transaction set
        support_XY = len(transactions)

        for subset in get_subsets(itemset):
            # X is the subset and Y is the complement (rest of the items in the itemset)
            X = set(subset)
            Y = set(itemset) - X

            # Support of X comes from the single-item data, because the
            # frequent-itemset map only holds itemsets of one size
            support_X = calculate_support(X, item_transactions)

            # Guard against division by zero
            confidence = support_XY / support_X if support_X > 0 else 0

            # If confidence is greater than or equal to the threshold, we have a strong rule
            if confidence >= min_confidence:
                rules.append({
                    'Rule': f"{render(X)} => {render(Y)}",
                    'SupportAll': support_XY,
                    'Support' : support_X,
                    'Confidence': confidence
                })

    return rules

# Now generate the strong association rules from filtered_vertical_data
# Derive the strong rules from the final level of frequent itemsets
strong_rules = generate_association_rules(filtered_vertical_data_copy)

# Report each rule with its supports and confidence, one per line
print("\nStrong Association Rules:")
for rule in strong_rules:
    line = f"Rule: {rule['Rule']}, SupportAll: {rule['SupportAll']}, Support : {rule['Support']} ,Confidence: {rule['Confidence']}"
    print(line)



Strong Association Rules:
Rule: {'pastry', 'soda'} => {'whole milk'}, SupportAll: 14, Support : 61 ,Confidence: 0.22950819672131148
Rule: {'pastry', 'sausage'} => {'whole milk'}, SupportAll: 11, Support : 48 ,Confidence: 0.22916666666666666
Rule: {'sausage', 'rolls/buns'} => {'whole milk'}, SupportAll: 17, Support : 80 ,Confidence: 0.2125
Rule: {'sausage', 'yogurt'} => {'whole milk'}, SupportAll: 22, Support : 86 ,Confidence: 0.2558139534883721
