Apriori Algorithm

In [1]:
import pandas as pd
import numpy as np
from itertools import combinations

# Parameters
support_threshold = 0.005      # minimum support applied when filtering rules
confidence_threshold = 0.3     # minimum confidence applied when filtering rules
itemset_size = 3               # largest itemset size that gets mined

# Dataset load. The transaction id is attached in the same expression:
# member number and date, joined by an underscore.
df = pd.read_csv('groceriesdataset.csv').assign(
    transaction_id=lambda frame: (
        frame['Member_number'].astype(str) + '_' + frame['Date'].astype(str)
    )
)

In [2]:
# Basketing: gather the items of each transaction id, first as lists and
# then as sets so that subset tests can be used for support counting.
items_per_transaction = df.groupby('transaction_id')['itemDescription'].apply(list)
transactions = items_per_transaction.tolist()
transactions_sets = list(map(set, transactions))

# Support calculations
def get_support_count(itemset, transactions):
    """Count the transactions that contain every item of ``itemset``.

    Parameters
    ----------
    itemset : set or frozenset
        Items that must all be present; only ``issubset`` is called on it.
    transactions : iterable
        Baskets to scan; each one is passed to ``itemset.issubset``.

    Returns
    -------
    int
        Number of baskets ``t`` for which ``itemset.issubset(t)`` is true.
    """
    return sum(1 for t in transactions if itemset.issubset(t))


def get_support(itemset, transactions):
    """Return the fraction of transactions that contain ``itemset``.

    Parameters
    ----------
    itemset : set or frozenset
        Items that must all be present in a basket for it to count.
    transactions : sized iterable
        Baskets to scan; ``len(transactions)`` is the denominator.

    Returns
    -------
    float
        ``count / len(transactions)``, or ``0.0`` when there are no
        transactions (instead of raising ``ZeroDivisionError``).
    """
    total = len(transactions)
    if total == 0:
        # No baskets at all: nothing can be supported.
        return 0.0
    # Delegate the counting so both helpers share a single definition.
    return get_support_count(itemset, transactions) / total

In [3]:
# Item extraction and frequent 1-itemsets
all_items = sorted({item for basket in transactions_sets for item in basket})

# NOTE(review): mining uses a hard-coded 0.001 here rather than
# support_threshold (0.005) -- confirm the lower mining threshold is intended.
min_support = 0.001

# Support of every single item, in sorted item order.
item_supports = {
    frozenset([item]): get_support(frozenset([item]), transactions_sets)
    for item in all_items
}

# Keep only the single items whose support reaches min_support.
freq_1_itemsets = {
    single: sup for single, sup in item_supports.items() if sup >= min_support
}

# Frequent 2-itemsets (Apriori candidate pruning)
freq_2_itemsets = {}
if itemset_size >= 2:
    # Downward closure: a pair can only reach min_support if both of its
    # items do, so candidates are drawn from the frequent single items
    # instead of from every item in the data. all_items is sorted, so the
    # candidate order (and the dict insertion order) stays the same as when
    # enumerating all pairs and discarding the infrequent ones.
    frequent_items = [
        item for item in all_items if frozenset([item]) in freq_1_itemsets
    ]
    for combo in combinations(frequent_items, 2):
        itemset = frozenset(combo)
        sup = get_support(itemset, transactions_sets)
        if sup >= min_support:
            freq_2_itemsets[itemset] = sup


# Frequent 3-itemsets: candidate joining and support filtering
freq_3_itemsets = {}
if itemset_size >= 3 and freq_2_itemsets:
    candidate_3_itemsets = set()

    # Join step: two distinct frequent pairs whose union has three items.
    # Prune step: keep the triple only if each of its three pairs is frequent.
    for first_pair, second_pair in combinations(freq_2_itemsets, 2):
        union = first_pair | second_pair
        if len(union) == 3 and all(
            (union - frozenset([item])) in freq_2_itemsets for item in union
        ):
            candidate_3_itemsets.add(union)

    # Iteration order of a set of string frozensets can differ between
    # interpreter runs (string hashes are salted per process), so the
    # candidates are sorted by their sorted item lists. This makes the
    # insertion order of freq_3_itemsets, and therefore the tie order of
    # any later sort, reproducible.
    for itemset in sorted(candidate_3_itemsets, key=sorted):
        sup = get_support(itemset, transactions_sets)
        if sup >= min_support:
            freq_3_itemsets[itemset] = sup

# Single lookup table: itemset -> support, for sizes 1 to 3.
all_freq_itemsets = {**freq_1_itemsets, **freq_2_itemsets, **freq_3_itemsets}

In [4]:
# Association rules from frequent 2-itemsets
rules_from_2 = []
for itemset, sup in freq_2_itemsets.items():
    items = list(itemset)
    # Each pair yields two single-item rules: items[0] -> items[1] first,
    # then items[1] -> items[0].
    for premise_item, conclusion_item in ((items[0], items[1]), (items[1], items[0])):
        pre = frozenset([premise_item])
        conc = frozenset([conclusion_item])
        # confidence = support(pair) / support(premise)
        conf = sup / freq_1_itemsets[pre]
        # prior = support of the conclusion on its own
        prior = all_freq_itemsets[conc]
        rule = {
            'premise': pre,
            'conclusion': conc,
            'support': sup,
            'confidence': conf,
            'prior': prior,
            'interestingness': abs(conf - prior),
            'itemset_size': 2
        }
        rules_from_2.append(rule)


# Association rules from frequent 3-itemsets
rules_from_3 = []
for itemset, sup in freq_3_itemsets.items():
    items = list(itemset)
    # Premises are all non-empty proper subsets, smaller sizes first.
    premises = [
        frozenset(pre_tuple)
        for r in range(1, len(items))
        for pre_tuple in combinations(items, r)
    ]
    for pre in premises:
        if pre not in all_freq_itemsets:
            continue
        conc = itemset - pre
        # confidence = support(itemset) / support(premise)
        conf = sup / all_freq_itemsets[pre]
        # prior = support of the conclusion on its own
        prior = all_freq_itemsets[conc]
        rule = {
            'premise': pre,
            'conclusion': conc,
            'support': sup,
            'confidence': conf,
            'prior': prior,
            'interestingness': abs(conf - prior),
            'itemset_size': 3
        }
        rules_from_3.append(rule)

# 2-itemset rules first, then 3-itemset rules.
all_rules = rules_from_2 + rules_from_3

In [5]:
# Rule filtering
min_support_threshold = support_threshold
confidence_threshold_threshold = confidence_threshold

# Rules that meet the support threshold only. Kept under its own name so the
# diagnostics below do not overwrite the fully filtered list.
support_filtered_rules = [
    r for r in all_rules if r['support'] >= min_support_threshold
]

# Rules that meet both the support and the confidence threshold.
filtered_rules = [
    r for r in support_filtered_rules
    if r['confidence'] >= confidence_threshold_threshold
]

# Highest confidence among the support-qualified rules. max() returns the
# first rule attaining the maximum; default=None avoids a ValueError when no
# rule reaches the support threshold.
max_conf_rule = max(support_filtered_rules, key=lambda r: r['confidence'], default=None)
max_confidence = max_conf_rule['confidence'] if max_conf_rule is not None else None

print(max_confidence)

# Support-qualified rules, most confident first.
sorted_rules = sorted(support_filtered_rules, key=lambda r: r['confidence'], reverse=True)

0.15781710914454278


In [6]:
# Top 10 frequent itemsets by support
number_of_itemsets = 10

# Rank every frequent itemset (sizes 1 to 3) from highest to lowest support.
sorted_itemsets = sorted(
    all_freq_itemsets.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

top_itemsets = sorted_itemsets[:number_of_itemsets]
for i, (itemset, support) in enumerate(top_itemsets, start=1):
    print(f"{i}: {set(itemset)} Support: {support:.4f}")


1: {'whole milk'} Support: 0.1579
2: {'other vegetables'} Support: 0.1221
3: {'rolls/buns'} Support: 0.1100
4: {'soda'} Support: 0.0971
5: {'yogurt'} Support: 0.0859
6: {'root vegetables'} Support: 0.0696
7: {'tropical fruit'} Support: 0.0678
8: {'bottled water'} Support: 0.0607
9: {'sausage'} Support: 0.0603
10: {'citrus fruit'} Support: 0.0531


In [7]:
# Top 5 association rules by interestingness
number_of_rul_inte = 5

# Ranking is over all generated rules, not the threshold-filtered ones.
sorted_by_interestingness = sorted(
    all_rules,
    key=lambda rule: rule['interestingness'],
    reverse=True,
)

top_rules = sorted_by_interestingness[:number_of_rul_inte]
for i, r in enumerate(top_rules, start=1):
    print(f"{i}: {set(r['premise'])} -> {set(r['conclusion'])} Support: {r['support']:.4f} Confidence: {r['confidence']:.4f} Prior: {r['prior']:.4f} Interestingness: {r['interestingness']:.4f}")

1: {'yogurt', 'sausage'} -> {'whole milk'} Support: 0.0015 Confidence: 0.2558 Prior: 0.1579 Interestingness: 0.0979
2: {'sausage', 'whole milk'} -> {'yogurt'} Support: 0.0015 Confidence: 0.1642 Prior: 0.0859 Interestingness: 0.0783
3: {'specialty chocolate'} -> {'whole milk'} Support: 0.0013 Confidence: 0.0837 Prior: 0.1579 Interestingness: 0.0742
4: {'yogurt', 'whole milk'} -> {'sausage'} Support: 0.0015 Confidence: 0.1317 Prior: 0.0603 Interestingness: 0.0714
5: {'misc. beverages'} -> {'whole milk'} Support: 0.0014 Confidence: 0.0890 Prior: 0.1579 Interestingness: 0.0689


PCY Algorithm

In [8]:
import pandas as pd
import numpy as np
from itertools import combinations
from collections import defaultdict

# Parameters
support_threshold = 0.005      # minimum support for mining and rule filtering
confidence_threshold = 0.3     # minimum confidence applied when filtering rules
itemset_size = 3               # largest itemset size that gets mined

# Dataset load. The transaction id is attached in the same expression:
# member number and date, joined by an underscore.
df = pd.read_csv('groceriesdataset.csv').assign(
    transaction_id=lambda frame: (
        frame['Member_number'].astype(str) + '_' + frame['Date'].astype(str)
    )
)

In [9]:
# Basketing: gather the items of each transaction id, first as lists and
# then as sets so that subset tests can be used for support counting.
items_per_transaction = df.groupby('transaction_id')['itemDescription'].apply(list)
transactions = items_per_transaction.tolist()
transactions_sets = list(map(set, transactions))

# Support calculations

def get_support_count(itemset, transactions):
    """Count the transactions that contain every item of ``itemset``.

    Parameters
    ----------
    itemset : set or frozenset
        Items that must all be present; only ``issubset`` is called on it.
    transactions : iterable
        Baskets to scan; each one is passed to ``itemset.issubset``.

    Returns
    -------
    int
        Number of baskets ``t`` for which ``itemset.issubset(t)`` is true.
    """
    return sum(1 for t in transactions if itemset.issubset(t))


def get_support(itemset, transactions):
    """Return the fraction of transactions that contain ``itemset``.

    Parameters
    ----------
    itemset : set or frozenset
        Items that must all be present in a basket for it to count.
    transactions : sized iterable
        Baskets to scan; ``len(transactions)`` is the denominator.

    Returns
    -------
    float
        ``count / len(transactions)``, or ``0.0`` when there are no
        transactions (instead of raising ``ZeroDivisionError``).
    """
    total = len(transactions)
    if total == 0:
        # No baskets at all: nothing can be supported.
        return 0.0
    # Delegate the counting so both helpers share a single definition.
    return get_support_count(itemset, transactions) / total

In [10]:
# Pass 1

all_items = sorted({item for basket in transactions_sets for item in basket})
min_support = support_threshold

# Bucket hashing
num_buckets = 10000
hash_table = defaultdict(int)

# Frequent item count: support of every single item, then the threshold cut.
item_supports = {
    frozenset([item]): get_support(frozenset([item]), transactions_sets)
    for item in all_items
}
freq_1_itemsets = {
    single: sup for single, sup in item_supports.items() if sup >= min_support
}

# Frequency pair hashing: every unordered pair of a basket is put in
# ascending order, hashed into one of num_buckets buckets, and counted.
# NOTE(review): hash() of strings is salted per interpreter process, so
# bucket ids are only comparable within one kernel session.
for basket in transactions_sets:
    for item_i, item_j in combinations(list(basket), 2):
        pair = (item_i, item_j) if item_i < item_j else (item_j, item_i)
        hash_table[hash(pair) % num_buckets] += 1

# Frequent buckets: bucket count reaches min_support expressed as a number
# of transactions.
min_bucket_count = min_support * len(transactions_sets)
frequent_buckets = {
    bid for bid, count in hash_table.items() if count >= min_bucket_count
}

In [None]:
# Pass 2

freq_2_itemsets = {}

if itemset_size >= 2:
    # Raw occurrence counts of candidate pairs. They live in their own dict
    # so that freq_2_itemsets only ever maps itemset -> support.
    pair_counts = defaultdict(int)

    for basket in transactions_sets:
        # Only items that are frequent on their own can form a frequent pair.
        frequent_items_in_basket = [
            item for item in basket if frozenset([item]) in freq_1_itemsets
        ]

        for item_i, item_j in combinations(frequent_items_in_basket, 2):
            # Same ascending ordering as in pass 1, so the pair hashes to
            # the same bucket it was counted in.
            pair = (item_i, item_j) if item_i < item_j else (item_j, item_i)

            # PCY condition: count the pair only if its bucket was frequent.
            if hash(pair) % num_buckets in frequent_buckets:
                pair_counts[frozenset(pair)] += 1

    # Convert counts to supports and keep the pairs reaching min_support.
    num_transactions = len(transactions_sets)
    freq_2_itemsets = {
        itemset: count / num_transactions
        for itemset, count in pair_counts.items()
        if count / num_transactions >= min_support
    }

In [12]:
# Pass 3

freq_3_itemsets = {}
if itemset_size >= 3 and freq_2_itemsets:
    candidate_3_itemsets = set()

    # Join step: two distinct frequent pairs whose union has three items.
    # Prune step: keep the triple only if each of its three pairs is frequent.
    for first_pair, second_pair in combinations(freq_2_itemsets, 2):
        union = first_pair | second_pair
        if len(union) == 3 and all(
            (union - frozenset([item])) in freq_2_itemsets for item in union
        ):
            candidate_3_itemsets.add(union)

    # Iteration order of a set of string frozensets can differ between
    # interpreter runs (string hashes are salted per process), so the
    # candidates are sorted by their sorted item lists. This makes the
    # insertion order of freq_3_itemsets, and therefore the tie order of
    # any later sort, reproducible.
    for itemset in sorted(candidate_3_itemsets, key=sorted):
        sup = get_support(itemset, transactions_sets)
        if sup >= min_support:
            freq_3_itemsets[itemset] = sup

# Single lookup table: itemset -> support, for sizes 1 to 3.
all_freq_itemsets = {**freq_1_itemsets, **freq_2_itemsets, **freq_3_itemsets}

In [13]:
# Association rules
rules_from_2 = []
for itemset, sup in freq_2_itemsets.items():
    items = list(itemset)
    # Each pair yields two single-item rules: items[0] -> items[1] first,
    # then items[1] -> items[0].
    for premise_item, conclusion_item in ((items[0], items[1]), (items[1], items[0])):
        pre = frozenset([premise_item])
        conc = frozenset([conclusion_item])
        # confidence = support(pair) / support(premise)
        conf = sup / freq_1_itemsets[pre]
        # prior = support of the conclusion on its own
        prior = all_freq_itemsets[conc]
        rule = {
            'premise': pre,
            'conclusion': conc,
            'support': sup,
            'confidence': conf,
            'prior': prior,
            'interestingness': abs(conf - prior),
            'itemset_size': 2
        }
        rules_from_2.append(rule)

# Rules from frequent 3-itemsets
rules_from_3 = []
for itemset, sup in freq_3_itemsets.items():
    items = list(itemset)
    # Premises are all non-empty proper subsets, smaller sizes first.
    premises = [
        frozenset(pre_tuple)
        for r in range(1, len(items))
        for pre_tuple in combinations(items, r)
    ]
    for pre in premises:
        if pre not in all_freq_itemsets:
            continue
        conc = itemset - pre
        # confidence = support(itemset) / support(premise)
        conf = sup / all_freq_itemsets[pre]
        # prior = support of the conclusion on its own
        prior = all_freq_itemsets[conc]
        rule = {
            'premise': pre,
            'conclusion': conc,
            'support': sup,
            'confidence': conf,
            'prior': prior,
            'interestingness': abs(conf - prior),
            'itemset_size': 3
        }
        rules_from_3.append(rule)

# 2-itemset rules first, then 3-itemset rules.
all_rules = rules_from_2 + rules_from_3

In [14]:
# Threshold filtering
min_support_threshold, confidence_threshold_threshold = support_threshold, confidence_threshold

# Keep the rules that reach both the support and the confidence threshold,
# in their original order.
filtered_rules = []
for rule in all_rules:
    if rule['support'] >= min_support_threshold and rule['confidence'] >= confidence_threshold_threshold:
        filtered_rules.append(rule)

In [15]:
# Top 10 frequent itemsets by support
number_of_itemsets = 10

# Rank every frequent itemset (sizes 1 to 3) from highest to lowest support.
sorted_itemsets = sorted(
    all_freq_itemsets.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

top_itemsets = sorted_itemsets[:number_of_itemsets]
for i, (itemset, support) in enumerate(top_itemsets, start=1):
    print(f"{i}: {set(itemset)} Support: {support:.4f}")

1: {'whole milk'} Support: 0.1579
2: {'other vegetables'} Support: 0.1221
3: {'rolls/buns'} Support: 0.1100
4: {'soda'} Support: 0.0971
5: {'yogurt'} Support: 0.0859
6: {'root vegetables'} Support: 0.0696
7: {'tropical fruit'} Support: 0.0678
8: {'bottled water'} Support: 0.0607
9: {'sausage'} Support: 0.0603
10: {'citrus fruit'} Support: 0.0531


In [16]:
# Top 5 association rules by interestingness
number_of_rul_inte = 5

# Ranking is over all generated rules, not the threshold-filtered ones.
sorted_by_interestingness = sorted(
    all_rules,
    key=lambda rule: rule['interestingness'],
    reverse=True,
)

top_rules = sorted_by_interestingness[:number_of_rul_inte]
for i, r in enumerate(top_rules, start=1):
    print(f"{i}: {set(r['premise'])} -> {set(r['conclusion'])} Support: {r['support']:.4f} Confidence: {r['confidence']:.4f} Prior: {r['prior']:.4f} Interestingness: {r['interestingness']:.4f}")

1: {'root vegetables'} -> {'whole milk'} Support: 0.0076 Confidence: 0.1085 Prior: 0.1579 Interestingness: 0.0494
2: {'root vegetables'} -> {'other vegetables'} Support: 0.0053 Confidence: 0.0759 Prior: 0.1221 Interestingness: 0.0462
3: {'bottled water'} -> {'whole milk'} Support: 0.0072 Confidence: 0.1178 Prior: 0.1579 Interestingness: 0.0401
4: {'soda'} -> {'whole milk'} Support: 0.0116 Confidence: 0.1198 Prior: 0.1579 Interestingness: 0.0382
5: {'tropical fruit'} -> {'whole milk'} Support: 0.0082 Confidence: 0.1213 Prior: 0.1579 Interestingness: 0.0366
