In [37]:
import pandas as pd
from collections import defaultdict
from itertools import combinations
from tqdm import tqdm

# Sampling the dataset for testing
transaction_data = pd.read_csv("../data/user_with_courses.csv")

# Sample 1000 rows randomly for testing purposes
sampled_data = transaction_data.sample(n=1000, random_state=42)

# Convert the sampled data to transactions
def convert_to_transactions(transaction_data):
    transactions = []
    for _, row in tqdm(transaction_data.iterrows(), desc="Converting transaction data to transactions"):
        transaction = [item for item in row[1:] if pd.notnull(item)]
        transactions.append(transaction)
    return transactions

# Convert the sampled data into transactions
transactions = convert_to_transactions(sampled_data)

# Calculate absolute minimum support
min_support = 0.001
absolute_min_support = int(min_support * len(transactions))

def eclat(transactions, absolute_min_support):
    # Step 1: Convert transactions to vertical data format
    vertical_data = defaultdict(list)
    for tid, transaction in enumerate(tqdm(transactions, desc="Converting transactions to vertical data")):
        for item in transaction:
            vertical_data[item].append(tid)

    # Step 2: Get frequent single-item itemsets
    frequent_itemsets = []
    for item, tids in tqdm(vertical_data.items(), desc="Getting frequent single-item itemsets"):
        support = len(tids)
        if support >= absolute_min_support:  # Compare with absolute minimum support
            frequent_itemsets.append(([item], tids))
    print(frequent_itemsets)
    answer = []
    answer.extend(frequent_itemsets)

    # Step 3: Generate frequent itemsets
    k = 2
    while True:
        candidates = generate_candidates(frequent_itemsets, k, absolute_min_support)
        answer.extend(candidates)
        if not candidates:
            break
        frequent_itemsets = candidates
        print(frequent_itemsets)
        k += 1

    return answer, vertical_data

def generate_candidates(frequent_itemsets, k, absolute_min_support):
    candidates = []
    itemsets = [itemset for itemset, _ in frequent_itemsets]
    for itemset_pair in tqdm(combinations(itemsets, 2), desc=f"Generating candidates (k={k})"):
        itemset1, itemset2 = itemset_pair
        union_itemset = set(itemset1) | set(itemset2)
        if len(union_itemset) == k:
            candidate = list(set(sorted(union_itemset)))
            if candidate not in [itemset for itemset, _ in candidates]:
                tids1 = [tids for itemset, tids in frequent_itemsets if itemset == itemset1][0]
                tids2 = [tids for itemset, tids in frequent_itemsets if itemset == itemset2][0]
                candidate_tids = intersect(tids1, tids2)
                if len(candidate_tids) >= absolute_min_support:  # Compare with absolute minimum support
                    candidates.append((candidate, candidate_tids))
    return candidates


def intersect(tids1, tids2):
    return [tid for tid in tids1 if tid in tids2]

def generate_association_rules(frequent_itemsets, vertical_data, min_confidence):
    association_rules = []
    for itemset, tids in tqdm(frequent_itemsets, desc="Generating association rules"):
        if len(itemset) > 1:
            for i in range(1, len(itemset)):
                for antecedent in combinations(itemset, i):
                    antecedent = list(antecedent)
                    consequent = [item for item in itemset if item not in antecedent]
                    antecedent_tids = set([tid for item in antecedent for tid in vertical_data[item]])
                    itemset_tids = set(tids)
                    confidence = len(itemset_tids) / len(antecedent_tids) if antecedent_tids else 0
                    print(f"Antecedent: {antecedent}, Consequent: {consequent}, Confidence: {confidence}")
                    if confidence >= min_confidence:
                        association_rules.append((antecedent, consequent, confidence))
    return association_rules


# Run ECLAT on the sampled transactions
frequent_itemsets, vertical_data = eclat(transactions, min_support)

# Generate association rules
association_rules = generate_association_rules(frequent_itemsets, vertical_data, absolute_min_support)

# Print the frequent itemsets
print("Frequent Itemsets:")
for itemset, tids in frequent_itemsets:
    print(f"Itemset: {itemset}, Support: {len(tids)}")


    


Converting transaction data to transactions: 1000it [00:00, 18853.26it/s]
Converting transactions to vertical data: 100%|██████████| 1000/1000 [00:00<00:00, 1003662.12it/s]
Getting frequent single-item itemsets: 100%|██████████| 769/769 [00:00<00:00, 767957.09it/s]


[(['ML0122ENv1'], [0, 36, 60, 142, 154, 203, 218, 264, 289, 303, 304, 307, 362, 430, 451, 471, 567, 570, 623, 627, 716, 781, 795, 866, 919, 938, 997]), (['PY0101EN, ML0101ENv3, BD0111EN, DS0101EN, DS0105EN, DA0101EN, BD0101EN, BC0101EN, CC0103EN'], [1]), (['CO0101EN, ML0201EN, CO0201EN, BD0101EN, WA0101EN'], [2]), (['DS0103EN, DA0101EN, DS0301EN, DS0105EN, DS0101EN, ST0101EN, PY0101EN'], [3]), (['PY0101EN, DV0101EN, DV0151EN, DA0101EN, DB0101EN'], [4]), (['DS0103EN, DA0101EN, DS0101EN, DS0105EN, DV0101EN, PY0101EN'], [5]), (['CL0101EN, CB0103EN, CC0103EN, BD0101EN, CC0101EN'], [6]), (['BD0101EN, CO0201EN, WA0101EN, CO0401EN, DS0101EN, CO0101EN, ML0120EN, ML0115EN, PY0101EN, ML0101ENv3, ST0101EN, CC0201EN'], [7]), (['ML0101EN, PY0101EN, BD0101EN, BD0111EN, RP0101EN, DS0101EN, ML0115EN'], [8]), (['ML0120ENv2, DS0101EN, DS0105EN, PY0101EN, ML0101ENv3, DS0103EN, ML0115EN'], [9]), (['CNSC02EN'], [10, 85, 93, 97, 126, 135, 143, 146, 147, 202, 204, 211, 219, 221, 224, 228, 230, 251, 308, 324,

Generating candidates (k=2): 295296it [00:34, 8597.26it/s] 
Generating association rules: 100%|██████████| 769/769 [00:00<00:00, 767591.57it/s]

Frequent Itemsets:
Itemset: ['ML0122ENv1'], Support: 27
Itemset: ['PY0101EN, ML0101ENv3, BD0111EN, DS0101EN, DS0105EN, DA0101EN, BD0101EN, BC0101EN, CC0103EN'], Support: 1
Itemset: ['CO0101EN, ML0201EN, CO0201EN, BD0101EN, WA0101EN'], Support: 1
Itemset: ['DS0103EN, DA0101EN, DS0301EN, DS0105EN, DS0101EN, ST0101EN, PY0101EN'], Support: 1
Itemset: ['PY0101EN, DV0101EN, DV0151EN, DA0101EN, DB0101EN'], Support: 1
Itemset: ['DS0103EN, DA0101EN, DS0101EN, DS0105EN, DV0101EN, PY0101EN'], Support: 1
Itemset: ['CL0101EN, CB0103EN, CC0103EN, BD0101EN, CC0101EN'], Support: 1
Itemset: ['BD0101EN, CO0201EN, WA0101EN, CO0401EN, DS0101EN, CO0101EN, ML0120EN, ML0115EN, PY0101EN, ML0101ENv3, ST0101EN, CC0201EN'], Support: 1
Itemset: ['ML0101EN, PY0101EN, BD0101EN, BD0111EN, RP0101EN, DS0101EN, ML0115EN'], Support: 1
Itemset: ['ML0120ENv2, DS0101EN, DS0105EN, PY0101EN, ML0101ENv3, DS0103EN, ML0115EN'], Support: 1
Itemset: ['CNSC02EN'], Support: 49
Itemset: ['DB0101EN, CC0201EN, DS0103EN, DV0101EN, ML01




Converting data to transactions: 1000it [00:00, 4084.21it/s]
Converting transactions to vertical data: 100%|██████████| 1000/1000 [00:00<00:00, 132133.19it/s]
Finding frequent single-item itemsets: 100%|██████████| 769/769 [00:00<00:00, 380850.13it/s]
Generating candidates (k=2): 36it [00:00, 18003.45it/s]
Generating association rules: 100%|██████████| 9/9 [00:00<?, ?it/s]

Input Courses: ['DS0101EN', 'DA0101EN']
Recommended Courses: set()



