In [2]:
import pandas as pd
from itertools import combinations

# Path to the groceries CSV, given relative to the notebook's working directory.
filePath = "Groceries_dataset.csv"
# Column names are supplied explicitly. Because `names` is passed without `header`,
# pandas reads the first line of the file as data, not as a header row.
# NOTE(review): if the CSV ships with its own header line, that line becomes a bogus
# one-item transaction in the grouping below — confirm, and pass header=0 if so.
data = pd.read_csv(filePath, delimiter=",", names=["member_number", "date", "item_description"])

# Group transactions by member_number and date, and collect items as lists
transactions = data.groupby(['member_number', 'date'])['item_description'].apply(list)
# Plain Python list with one list of items per (member_number, date) group.
transaction_list = transactions.tolist()

# Function to generate itemsets
def generateItemsets(transactions):
    """Build the set of all single-item itemsets found in the transactions.

    Args:
        transactions: Iterable of transactions, each an iterable of hashable items.

    Returns:
        A set holding one single-element frozenset per distinct item.
    """
    # frozenset is hashable, so the itemsets can live in a set and serve as dict keys.
    return {frozenset([item]) for basket in transactions for item in basket}

# Converts each transaction to a set for more efficient computation
def convertToSet(transactions):
    """Return a list with each transaction converted to a set, in the same order.

    Args:
        transactions: Iterable of transactions, each an iterable of hashable items.

    Returns:
        List of sets, one per transaction; duplicate items inside a transaction collapse.
    """
    return list(map(set, transactions))

# Calculate support for itemsets
def calculateSupport(itemsets, transactions):
    """Compute the support of each candidate itemset over the transactions.

    Args:
        itemsets: Iterable of set-like candidates (anything exposing ``issubset``).
        transactions: Sized collection of transactions, each an iterable of items.

    Returns:
        Dict mapping each itemset that occurs in at least one transaction to the
        fraction of transactions containing it. Itemsets that never occur are
        absent from the result.
    """
    baskets = convertToSet(transactions)
    hits = {}

    # Transactions form the outer loop, so keys are inserted in first-seen order.
    for basket in baskets:
        for candidate in itemsets:
            if not candidate.issubset(basket):
                continue
            if candidate in hits:
                hits[candidate] += 1
            else:
                hits[candidate] = 1

    n_transactions = len(transactions)
    support = {}
    for candidate, hit_count in hits.items():
        support[candidate] = hit_count / n_transactions

    return support

# Filter itemsets by minimum support
def filterItemsets(itemsets, support, min_support):
    """Keep the itemsets whose support reaches the minimum threshold.

    Args:
        itemsets: Not read; the result is derived from the keys of ``support``.
        support: Dict mapping itemset -> support value.
        min_support: Inclusive lower bound on support.

    Returns:
        Set of the keys of ``support`` whose value is >= ``min_support``.
    """
    kept = set()
    for candidate, value in support.items():
        if value >= min_support:
            kept.add(candidate)
    return kept

# Generate candidate itemsets by joining frequent itemsets
def joinItemsets(frequent_itemsets, k):
    """Build size-``k`` candidates by taking unions of pairs of frequent itemsets.

    Args:
        frequent_itemsets: Iterable of frozensets to combine pairwise.
        k: Required size of each candidate.

    Returns:
        Set of every pairwise union that has exactly ``k`` elements.
    """
    pool = list(frequent_itemsets)
    candidates = set()

    # Pair each itemset only with those after it, so every unordered pair is seen once.
    for position, left in enumerate(pool):
        for right in pool[position + 1:]:
            merged = left | right
            if len(merged) == k:
                candidates.add(merged)

    return candidates

# Apriori algorithm
def apriori(transactions, min_support):
    """Find all frequent itemsets level by level, starting from single items.

    Args:
        transactions: Sized collection of transactions, each an iterable of items.
        min_support: Inclusive minimum support an itemset needs to be kept.

    Returns:
        Dict mapping itemset size -> set of frequent frozensets of that size.
        Key 1 is always present (possibly with an empty set); larger sizes
        appear only when at least one itemset of that size is frequent.
    """
    singletons = generateItemsets(transactions)
    current_level = filterItemsets(
        singletons, calculateSupport(singletons, transactions), min_support
    )

    levels = {1: current_level}

    size = 2
    # Stop as soon as a level yields no frequent itemsets.
    while current_level:
        candidates = joinItemsets(current_level, size)
        candidate_support = calculateSupport(candidates, transactions)
        current_level = filterItemsets(candidates, candidate_support, min_support)

        if current_level:
            levels[size] = current_level
        size += 1

    return levels

# Calculate support for a specific itemset
def calculateItemsetSupport(itemset, transaction_list):
    """Return the fraction of transactions that contain every item of ``itemset``.

    Args:
        itemset: A set or frozenset of items.
        transaction_list: Sized collection of transactions, each an iterable of items.

    Returns:
        Support as a float in [0, 1]. An empty ``transaction_list`` gives 0.0
        instead of raising ZeroDivisionError, matching how the confidence and
        lift helpers treat a zero denominator.
    """
    total = len(transaction_list)
    if total == 0:
        return 0.0

    # issubset() accepts any iterable, so no per-transaction set copy is needed.
    count = sum(1 for transaction in transaction_list if itemset.issubset(transaction))
    return count / total

# Calculate confidence for a rule
def calculateConfidence(antecedent, consequent, support, transaction_list):
    """Confidence of the rule antecedent -> consequent from precomputed supports.

    Args:
        antecedent: frozenset on the left-hand side of the rule.
        consequent: frozenset on the right-hand side of the rule.
        support: Dict mapping itemset -> support; missing itemsets count as 0.
        transaction_list: Not read by this function.

    Returns:
        support(antecedent | consequent) / support(antecedent), or 0 when the
        antecedent's support is not positive.
    """
    antecedent_support = support.get(antecedent, 0)
    joint_support = support.get(antecedent | consequent, 0)

    if antecedent_support > 0:
        return joint_support / antecedent_support
    return 0

# Calculate lift for a rule
def calculateLift(antecedent, consequent, support, transaction_list):
    """Lift of the rule antecedent -> consequent from precomputed supports.

    Args:
        antecedent: frozenset on the left-hand side of the rule.
        consequent: frozenset on the right-hand side of the rule.
        support: Dict mapping itemset -> support; missing itemsets count as 0.
        transaction_list: Not read by this function.

    Returns:
        support(antecedent | consequent) / (support(antecedent) * support(consequent)),
        or 0 when either individual support is not positive.
    """
    joint = support.get(antecedent | consequent, 0)
    left = support.get(antecedent, 0)
    right = support.get(consequent, 0)

    if left > 0 and right > 0:
        return joint / (left * right)
    return 0

# Generate association rules
def generateRules(frequent_itemsets, transaction_list, min_confidence):
    """Derive association rules from frequent itemsets.

    Args:
        frequent_itemsets: Dict mapping itemset size -> collection of frozensets.
        transaction_list: Transactions used to compute each itemset's support.
        min_confidence: Inclusive minimum confidence a rule needs to be kept.

    Returns:
        List of (antecedent, consequent, confidence, lift) tuples, one for each
        split of a frequent itemset into a non-empty antecedent and a non-empty
        consequent whose confidence is >= ``min_confidence``.
    """
    # Phase 1: support for every frequent itemset, needed before any rule is scored.
    support = {
        itemset: calculateItemsetSupport(itemset, transaction_list)
        for level in frequent_itemsets.values()
        for itemset in level
    }

    # Phase 2: enumerate every proper, non-empty subset as a possible antecedent.
    rules = []
    for level in frequent_itemsets.values():
        for itemset in level:
            for antecedent_size in range(1, len(itemset)):
                for picked in combinations(itemset, antecedent_size):
                    antecedent = frozenset(picked)
                    consequent = itemset - antecedent

                    confidence = calculateConfidence(antecedent, consequent, support, transaction_list)

                    # Lift is only computed for rules that pass the confidence filter.
                    if confidence >= min_confidence:
                        lift = calculateLift(antecedent, consequent, support, transaction_list)
                        rules.append((antecedent, consequent, confidence, lift))

    return rules


In [3]:
# Minimum fraction of transactions an itemset must appear in to count as frequent.
min_support = 0.0005
# Dict mapping itemset size k -> set of frozensets that met min_support.
frequent_itemsets = apriori(transaction_list, min_support)


# Print the frequent itemsets, one line per itemset size.
for k, itemsets in frequent_itemsets.items():
    print(f"Frequent {k}-itemsets: {itemsets}")

Frequent 1-itemsets: {frozenset({'brown bread'}), frozenset({'dishes'}), frozenset({'photo/film'}), frozenset({'pot plants'}), frozenset({'long life bakery product'}), frozenset({'male cosmetics'}), frozenset({'flower soil/fertilizer'}), frozenset({'waffles'}), frozenset({'chicken'}), frozenset({'processed cheese'}), frozenset({'frozen potato products'}), frozenset({'pastry'}), frozenset({'liquor'}), frozenset({'pasta'}), frozenset({'canned vegetables'}), frozenset({'mayonnaise'}), frozenset({'flower (seeds)'}), frozenset({'tea'}), frozenset({'frozen fish'}), frozenset({'soap'}), frozenset({'root vegetables'}), frozenset({'candles'}), frozenset({'hard cheese'}), frozenset({'domestic eggs'}), frozenset({'margarine'}), frozenset({'soft cheese'}), frozenset({'hair spray'}), frozenset({'dental care'}), frozenset({'ready soups'}), frozenset({'skin care'}), frozenset({'salty snack'}), frozenset({'butter milk'}), frozenset({'baking powder'}), frozenset({'pudding powder'}), frozenset({'female 

In [4]:
# Minimum confidence a rule must reach to be kept (0.003 = 0.3%).
min_confidence = 0.003

# Each rule is a tuple (antecedent, consequent, confidence, lift).
association_rules = generateRules(frequent_itemsets, transaction_list, min_confidence)
print("Total number of rules: " + str(len(association_rules)))

# One line per rule, with confidence and lift shown to three decimals.
for rule in association_rules:
    antecedent, consequent, confidence, lift = rule
    print(f"{antecedent} -> {consequent}, confidence: {confidence:.3f}, lift: {lift:.3f}")

Total number of rules: 2866
frozenset({'chicken'}) -> frozenset({'onions'}), confidence: 0.029, lift: 1.421
frozenset({'onions'}) -> frozenset({'chicken'}), confidence: 0.040, lift: 1.421
frozenset({'pip fruit'}) -> frozenset({'coffee'}), confidence: 0.022, lift: 0.690
frozenset({'coffee'}) -> frozenset({'pip fruit'}), confidence: 0.034, lift: 0.690
frozenset({'tropical fruit'}) -> frozenset({'condensed milk'}), confidence: 0.008, lift: 1.205
frozenset({'condensed milk'}) -> frozenset({'tropical fruit'}), confidence: 0.082, lift: 1.205
frozenset({'sausage'}) -> frozenset({'chocolate'}), confidence: 0.023, lift: 0.986
frozenset({'chocolate'}) -> frozenset({'sausage'}), confidence: 0.059, lift: 0.986
frozenset({'bottled water'}) -> frozenset({'fruit/vegetable juice'}), confidence: 0.020, lift: 0.583
frozenset({'fruit/vegetable juice'}) -> frozenset({'bottled water'}), confidence: 0.035, lift: 0.583
frozenset({'sausage'}) -> frozenset({'misc. beverages'}), confidence: 0.018, lift: 1.123
f

Support - Measure of the prevalence of an item / itemset in the overall dataset: the fraction of all transactions that contain it. A high support means the item or set of items is purchased frequently. Practically, higher-support items should be placed more prominently in stores and should be easier to find, as they generate more profit for the store.

Confidence - Measure of how likely it is that item B is purchased when item A is purchased. Confidence(A->B) = Support(A and B together) / Support(A). A high confidence means it is likely that when a customer buys A, they also buy B. Hot dogs and hot dog buns would likely be two items that have a high confidence, as they are likely to be purchased together.

Lift - Measure of how much more often two items are bought together than would be expected if they were independent. Lift(A->B) = Confidence(A->B) / Support(B). Lift factors in how popular item B is, with a lift greater than 1.0 meaning the items are purchased together more often than chance alone would predict. A lift of 1.0 means the items are independent and have no significant association, and a lift less than 1.0 means the items are negatively associated, so purchasing A decreases the likelihood of purchasing B.

The Apriori algorithm generates frequent itemsets that have a support >= the minimum support threshold. Next it generates possible association rules from the frequent itemsets that have a confidence >= the minimum confidence threshold.

Finding these associations can help businesses generate more revenue and improve customer satisfaction. High lift items could be placed physically close to each other in stores or discounted together in promotions to further increase the likelihood that they are purchased together.