In [3]:
import pandas as pd
from itertools import combinations
from collections import defaultdict


### Take user inputs


In [4]:

# Path of the transactions file; the prompt asks for the .txt extension to be included
file_name = input("Enter the txt file name (with .txt extension): ")

# Minimum confidence a rule must reach to be reported (original note next to this prompt: 0.5)
raw_confidence_threshold = input("Enter the confidence threshold (e.g., 0.20): ")
confidence_threshold = float(raw_confidence_threshold)

# Minimum number of transactions an itemset must occur in (original note next to this prompt: 1000)
raw_sup_count = input("Enter the minimum support count: ")
sup_count = int(raw_sup_count)



### Load the dataset

In [5]:
# The input is not a real CSV: the file has no header and every line is one
# transaction whose categories are separated by ";".
# Parsing it with read_csv and its default "," delimiter splits any line whose
# category names contain a comma into several fields; with on_bad_lines='skip'
# such lines are then dropped silently (or, if the first line is affected, part
# of the line ends up in the index instead of the "Categories" column).
# Reading the file line by line keeps every non-blank line intact.
with open(file_name, encoding="utf-8") as transactions_file:
    # Blank (whitespace-only) lines are skipped, as skip_blank_lines=True did
    transaction_lines = [line.strip() for line in transactions_file if line.strip()]

# The frame has only one column with the name "Categories", one row per transaction
df = pd.DataFrame({"Categories": transaction_lines})

print(df)
# Get the number of rows
print(len(df))



                                              Categories
0      Breakfast & Brunch;American (Traditional);Rest...
1                                 Sandwiches;Restaurants
2           Local Services;IT Services & Computer Repair
3                                    Restaurants;Italian
4                                      Food;Coffee & Tea
...                                                  ...
75813  Doctors;Cosmetic Surgeons;Beauty & Spas;Medica...
75814                                   Fashion;Shopping
75815  Doctors;Health & Medical;Obstetricians & Gynec...
75816                                  Food;Coffee & Tea
75817         Food;Health Markets;Grocery;Specialty Food

[75818 rows x 1 columns]
75818


### Build the vertical data format (item → transaction IDs)

In [6]:
# Create a unique transaction ID for each row, using the row position as the ID.
# The guard keeps this cell safe to re-run: an unguarded reset_index() inserts
# another index column on every re-run and eventually fails once the generated
# column names collide.
if "Transaction_ID" not in df.columns:
    # Converts the existing index (including a MultiIndex) into regular columns
    df = df.reset_index()
    # Assign unique transaction ID
    df["Transaction_ID"] = df.index.astype(str)

# Create vertical data format (category → set of transactions)
vertical_data = defaultdict(set)

# Show the number of transactions (each line is a transaction)
num_transactions = len(df)
print("Number of unique transactions:", num_transactions)

# Populate vertical data with category → transaction mapping
for txn_id, categories in zip(df["Transaction_ID"], df["Categories"]):
    # A row that was parsed as missing (NaN) is not a string and cannot be split
    if not isinstance(categories, str):
        continue
    category_list = categories.split(";")  # Split categories by semicolon
    for category in category_list:
        # Strip spaces for consistency
        category = category.strip()
        # A trailing or doubled ";" produces an empty token, which is not a category
        if category:
            vertical_data[category].add(txn_id)

# Convert defaultdict to a regular dictionary
vertical_data = dict(vertical_data)
print("Number of unique categories:", len(vertical_data))

# Create a (shallow) copy of vertical data for confidence calculations
one_item_vertical_data = vertical_data.copy()


Number of unique transactions: 75818
Number of unique categories: 882


### Filter the 1-itemsets by minimum support (frequent 1-itemsets)

In [7]:

# Keep only the items whose transaction set reaches the minimum support count
filtered_vertical_data = {
    category: txn_ids
    for category, txn_ids in vertical_data.items()
    if len(txn_ids) >= sup_count
}


### Generate The frequent item sets 

In [8]:

# Size of the itemsets built in the upcoming pass (starts with pairs)
max_k = 2

# Itemsets of the current level mapped to their transaction sets;
# starts from the frequent single items
filtered_vertical_data_copy = filtered_vertical_data.copy()

# Level-wise search: each pass joins the itemsets of the previous level and
# replaces them with the frequent itemsets that are one item larger
while True:
    prev_frequent_itemsets = list(filtered_vertical_data_copy)

    # At least two itemsets are needed to form a pair to join
    if len(prev_frequent_itemsets) < 2:
        break

    new_filtered_vertical_data = {}

    for left, right in combinations(prev_frequent_itemsets, 2):
        if isinstance(left, tuple) and isinstance(right, tuple):
            # Two itemsets: the candidate is their sorted union
            candidate = tuple(sorted(set(left) | set(right)))
        else:
            # Two single items: pair them up in the order they were produced
            candidate = (left, right)

        # A candidate of any other size means the two itemsets differ by more than one item
        if len(candidate) != max_k:
            continue

        # Transactions containing the candidate are those shared by both parents
        shared_transactions = filtered_vertical_data_copy[left] & filtered_vertical_data_copy[right]

        # Store only frequent itemsets
        if len(shared_transactions) >= sup_count:
            new_filtered_vertical_data[candidate] = shared_transactions

    # Nothing frequent at this size: keep the previous level as the final result
    if not new_filtered_vertical_data:
        break

    # The new level becomes the input of the next pass
    filtered_vertical_data_copy = new_filtered_vertical_data
    max_k += 1



### Print the frequent itemsets of the largest size found

In [9]:
# One itemset per line, in the order they were stored
for frequent_itemset in filtered_vertical_data_copy:
    print(frequent_itemset)


('Bars', 'Nightlife', 'Restaurants')
('Event Planning & Services', 'Hotels', 'Hotels & Travel')
('Fashion', 'Shopping', "Women's Clothing")


### Generate and print The strong association rules 

In [10]:



# Function to generate the non-empty proper subsets of an itemset
def get_subsets(itemset):
    """Return every non-empty proper subset of ``itemset`` as a list of tuples.

    Subsets are listed by increasing size (1 up to ``len(itemset) - 1``), each
    size in the order produced by ``itertools.combinations``. The full itemset
    and the empty subset are not included, so an itemset with fewer than two
    elements yields an empty list.
    """
    return [
        subset
        for size in range(1, len(itemset))
        for subset in combinations(itemset, size)
    ]



# Function to calculate support for a given itemset
def calculate_support(itemset, filtered_vertical_data):
    """Return the number of transactions that contain every item of ``itemset``.

    ``filtered_vertical_data`` maps each item to the set of transaction IDs it
    occurs in; the support count is the size of the intersection of those sets.
    An item missing from the mapping raises ``KeyError``.
    """
    # Unpacking looks up every item before the intersection is taken
    transaction_sets = (filtered_vertical_data[item] for item in itemset)
    return len(set.intersection(*transaction_sets))


# Generate strong association rules
def generate_association_rules(filtered_vertical_data):
    """Build the strong association rules X => Y for the given frequent itemsets.

    Parameters
    ----------
    filtered_vertical_data : dict
        Maps each frequent itemset (a tuple of items) to the set of
        transactions that contain all of its items.

    Returns
    -------
    list of dict
        One entry per rule whose confidence reaches the module-level
        ``confidence_threshold``, with the keys ``'Rule'``, ``'SupportAll'``
        (support count of X ∪ Y), ``'Support'`` (support count of X) and
        ``'Confidence'`` (``SupportAll / Support``).

    Notes
    -----
    The support of X is computed from the module-level
    ``one_item_vertical_data`` mapping of single items to transactions.
    """
    rules = []

    # Iterate over all itemsets (frequent itemsets)
    for itemset, transactions in filtered_vertical_data.items():
        # When mining stops at the first level the keys are single items stored
        # as plain strings. Treating such a string as an itemset would split it
        # into characters and fail with KeyError in the support lookup, and a
        # single item cannot form a rule anyway.
        if isinstance(itemset, str) or len(itemset) < 2:
            continue

        # Support for the entire itemset (X ∪ Y) is the same for all of its rules
        support_XY = len(transactions)

        # Every non-empty proper subset is tried as the antecedent X
        for subset in get_subsets(itemset):
            # X is the subset and Y is the complement (rest of the items in the itemset)
            X = set(subset)
            Y = set(itemset) - X

            # For X, count how many transactions contain all items in X,
            # using the one-item vertical data
            support_X = calculate_support(X, one_item_vertical_data)

            # Confidence of X => Y; defined as 0 when X occurs in no transaction
            confidence = support_XY / support_X if support_X > 0 else 0

            # If confidence is greater than or equal to the threshold, we have a strong rule
            if confidence >= confidence_threshold:
                rules.append({
                    'Rule': f"{X} => {Y}",
                    'SupportAll': support_XY,
                    'Support' : support_X,
                    'Confidence': confidence
                })

    return rules

# Derive the strong rules from the itemsets kept by the last mining pass
strong_rules = generate_association_rules(filtered_vertical_data_copy)

# Report each rule together with its support counts and confidence
print("\nStrong Association Rules:")
for strong_rule in strong_rules:
    rule_text = strong_rule['Rule']
    support_all = strong_rule['SupportAll']
    support_antecedent = strong_rule['Support']
    rule_confidence = strong_rule['Confidence']
    print(f"Rule: {rule_text}, SupportAll: {support_all}, Support : {support_antecedent} ,Confidence: {rule_confidence}")



Strong Association Rules:
Rule: {'Bars'} => {'Restaurants', 'Nightlife'}, SupportAll: 2399, Support : 4269 ,Confidence: 0.5619583040524713
Rule: {'Bars', 'Nightlife'} => {'Restaurants'}, SupportAll: 2399, Support : 4269 ,Confidence: 0.5619583040524713
Rule: {'Bars', 'Restaurants'} => {'Nightlife'}, SupportAll: 2399, Support : 2399 ,Confidence: 1.0
Rule: {'Nightlife', 'Restaurants'} => {'Bars'}, SupportAll: 2399, Support : 2507 ,Confidence: 0.9569206222576785
Rule: {'Hotels'} => {'Hotels & Travel', 'Event Planning & Services'}, SupportAll: 1431, Support : 1431 ,Confidence: 1.0
Rule: {'Hotels & Travel'} => {'Event Planning & Services', 'Hotels'}, SupportAll: 1431, Support : 2492 ,Confidence: 0.5742375601926164
Rule: {'Event Planning & Services', 'Hotels'} => {'Hotels & Travel'}, SupportAll: 1431, Support : 1431 ,Confidence: 1.0
Rule: {'Hotels & Travel', 'Event Planning & Services'} => {'Hotels'}, SupportAll: 1431, Support : 1471 ,Confidence: 0.972807613868117
Rule: {'Hotels & Travel', '