In [12]:
import sys
import random
from itertools import combinations
from collections import defaultdict
from tqdm import tqdm

# **apriori**

In [None]:


def apriori(baskets, support):
    k = 1
    result = []
    candidates = generate_singletons(baskets)
    while candidates:
        print(f"Generating frequent itemsets of size {k}")
        frequent_items = generate_frequent_itemsets(candidates, baskets, support)
        print('frequent_items',frequent_items)
        if frequent_items:
            result.append(frequent_items)
            candidates = generate_candidates(frequent_items, k + 1)
            k += 1
            if(k>3):
              break
        else:
            break
    return result

def generate_frequent_itemsets(candidates, baskets, support):
    item_counts = defaultdict(int)
    for basket in tqdm(baskets, desc="Generating frequent itemsets"):
        for candidate in candidates:
            if frozenset(candidate).issubset(basket):
                item_counts[frozenset(candidate)] += 1
    return [list(item) for item, count in item_counts.items() if count >= support]

def generate_candidates(frequent_items, k):
    candidates = []
    print(f"Generating candidates of size {k}")
    for i in tqdm(range(len(frequent_items)), desc="Generating candidates"):
        for j in range(i + 1, len(frequent_items)):
            candidate = frozenset(frequent_items[i]) | frozenset(frequent_items[j])
            if len(candidate) == k:
                candidates.append(candidate)
    return [list(candidate) for candidate in candidates]

def generate_singletons(baskets):
    items = {frozenset([item]) for basket in baskets for item in basket}
    return [list(item) for item in items]

def generate_singletons_pair(frequent_items):
    singletons = set()
    for sublist in frequent_items:
        for item in sublist:
            if isinstance(item, list):
                singletons.update(item)
            else:
                singletons.add(item)
    return [frozenset([item]) for item in singletons]




In [None]:
def sampling(baskets, probability):
    size = len(baskets)
    sample_size = int(probability * size)
    print(f"Sampling {sample_size} out of {size} baskets...")
    return random.sample(baskets, sample_size)

def generate_baskets(inputfile):
    with open(inputfile, 'r') as inputdata:
        baskets = [frozenset(line.strip().split()) for line in inputdata]
    print(f"Generated {len(baskets)} baskets from input file.")
    return baskets

In [None]:
def check_negative_border_items(negative_border_items, baskets, support):
    for items in tqdm(negative_border_items, desc="Checking negative border items"):
        count = sum(1 for basket in baskets if frozenset(items).issubset(basket))
        if count >= support:
            return True
    return False

In [None]:
def generate_negative_border(frequent_items):
    result = set()
    candidates = generate_singletons_pair(frequent_items)
    while candidates:
        new_candidates = set()
        for candidate in tqdm(candidates, desc="Generating negative border"):
            if any(frozenset(candidate).issubset(frozenset(fitem)) for fitem_group in frequent_items for fitem in fitem_group):
                continue
            if candidate not in result:
                result.add(candidate)
            new_candidates.update(frozenset(combo) for combo in combinations(candidate, len(candidate) - 1))
        candidates = new_candidates
    return list(result)

In [None]:
def filter_frequent_items(frequent_items, baskets, support):
    result = []
    for fi_group in frequent_items:
        ans = []
        for fi in tqdm(fi_group, desc="Filtering frequent items"):
            count = sum(1 for basket in baskets if frozenset(fi).issubset(basket))
            if count >= support:
                ans.append(fi)
        if ans:
            result.append(ans)
    return result

In [23]:
def pass_one(inputfile, support):
    print("Starting pass one...")
    result = []
    p = 0.1  # fraction of sample size
    baskets = generate_baskets(inputfile)
    sample_baskets = sampling(baskets, p)
    result.append(len(sample_baskets) * 1.0 / len(baskets))
    adjusted_support = 0.9 * p * support  # use lower threshold
    print(f"Adjusted support for sample: {adjusted_support}")
    frequent_items = apriori(sample_baskets, adjusted_support)
    print("Frequent itemsets in sample:", frequent_items)
    negative_border_items = generate_negative_border(frequent_items)
    print("Negative border items:", negative_border_items)
    result.append(frequent_items)
    result.append(negative_border_items)
    return result

In [24]:
def pass_two(inputfile, frequent_items, negative_border_items, support):
    print("Starting pass two...")
    baskets = generate_baskets(inputfile)
    ans = check_negative_border_items(negative_border_items, baskets, support)
    if ans:
        print("Negative border item found in pass two. Iterating again...")
        return [True]
    ffi = filter_frequent_items(frequent_items, baskets, support)
    print("Frequent itemsets after pass two:", ffi)
    return [False, ffi]

In [None]:
def calculate_support(itemset, baskets):
    count = sum(1 for basket in baskets if itemset.issubset(basket))
    return count / len(baskets)

In [30]:
def generate_association_rules(frequent_itemsets, baskets, min_confidence=0.5):
    rules = []
    for itemset_group in frequent_itemsets:
        for itemset in itemset_group:
            if len(itemset) > 1:
                subsets = list(combinations(itemset, len(itemset) - 1))
                for subset in subsets:
                    antecedent = frozenset(subset)
                    consequent = frozenset(itemset) - antecedent
                    support_antecedent = calculate_support(antecedent, baskets)
                    support_itemset = calculate_support(frozenset(itemset), baskets)
                    confidence = support_itemset / support_antecedent
                    if confidence >= min_confidence:
                        rules.append((antecedent, consequent, confidence))
    return rules

In [34]:
def toivonen(inputfile, support):
    iterations = 0
    result = []
    ck = True
    while ck:
        print(f"Iteration {iterations + 1} starting...")
        ans_one = pass_one(inputfile, support)
        ans_two = pass_two(inputfile, ans_one[1], ans_one[2], support)
        ck = ans_two[0]
        iterations += 1
    result.append(iterations)
    result.append(ans_one[0])
    result.append(ans_two[1])

    return result

In [35]:
results = toivonen('browsing.txt', 100)

Iteration 1 starting...
Starting pass one...
Generated 18929 baskets from input file.
Sampling 1892 out of 18929 baskets...
Adjusted support for sample: 9.000000000000002
Generating frequent itemsets of size 1


Generating frequent itemsets: 100%|██████████| 1892/1892 [00:01<00:00, 1066.42it/s]


frequent_items [['SNA45677'], ['DAI22896'], ['ELE99737'], ['GRO38814'], ['DAI85309'], ['SNA49107'], ['ELE78169'], ['GRO46854'], ['ELE42696'], ['GRO94758'], ['SNA93730'], ['SNA12663'], ['SNA90094'], ['FRO40251'], ['FRO78994'], ['DAI75645'], ['SNA80324'], ['ELE22970'], ['ELE85027'], ['DAI62779'], ['FRO94523'], ['GRO38983'], ['DAI89320'], ['SNA96271'], ['DAI64292'], ['ELE86561'], ['ELE56788'], ['DAI63921'], ['SNA71244'], ['ELE52966'], ['GRO81900'], ['SNA93860'], ['ELE30933'], ['GRO76157'], ['GRO99222'], ['FRO32293'], ['ELE91337'], ['SNA55952'], ['DAI87448'], ['ELE88031'], ['SNA29014'], ['DAI67621'], ['GRO29389'], ['DAI55148'], ['ELE26917'], ['GRO44993'], ['ELE66600'], ['DAI83948'], ['ELE69750'], ['GRO64900'], ['FRO57082'], ['GRO56726'], ['ELE38511'], ['GRO24246'], ['SNA61470'], ['DAI48891'], ['SNA55617'], ['DAI83733'], ['FRO26482'], ['FRO53271'], ['ELE49863'], ['FRO99756'], ['DAI73122'], ['GRO84328'], ['SNA55762'], ['GRO73461'], ['ELE87456'], ['FRO16142'], ['SNA42528'], ['SNA44190'], ['EL

Generating candidates: 100%|██████████| 451/451 [00:00<00:00, 1520.77it/s]


Generating frequent itemsets of size 2


Generating frequent itemsets: 100%|██████████| 1892/1892 [00:48<00:00, 38.61it/s]


frequent_items [['ELE99737', 'SNA45677'], ['SNA45677', 'GRO38814'], ['SNA45677', 'DAI85309'], ['ELE78169', 'SNA45677'], ['SNA45677', 'GRO46854'], ['SNA45677', 'GRO94758'], ['DAI22896', 'GRO38814'], ['ELE99737', 'DAI85309'], ['ELE99737', 'SNA49107'], ['ELE99737', 'GRO94758'], ['GRO38814', 'GRO94758'], ['SNA49107', 'DAI85309'], ['DAI85309', 'GRO46854'], ['GRO94758', 'DAI85309'], ['ELE78169', 'GRO94758'], ['GRO94758', 'GRO46854'], ['FRO40251', 'SNA12663'], ['FRO40251', 'SNA90094'], ['SNA80324', 'SNA90094'], ['FRO40251', 'DAI75645'], ['SNA80324', 'FRO40251'], ['SNA80324', 'DAI75645'], ['SNA45677', 'GRO38983'], ['SNA96271', 'SNA45677'], ['ELE86561', 'SNA45677'], ['SNA45677', 'ELE56788'], ['SNA45677', 'DAI63921'], ['SNA96271', 'GRO38983'], ['DAI62779', 'SNA45677'], ['ELE91337', 'SNA45677'], ['DAI62779', 'GRO46854'], ['FRO32293', 'GRO46854'], ['ELE91337', 'GRO46854'], ['DAI87448', 'GRO46854'], ['DAI62779', 'SNA93860'], ['DAI62779', 'GRO99222'], ['DAI62779', 'FRO32293'], ['ELE91337', 'DAI62779

Generating candidates: 100%|██████████| 771/771 [00:00<00:00, 3680.16it/s]


Generating frequent itemsets of size 3


Generating frequent itemsets: 100%|██████████| 1892/1892 [00:15<00:00, 124.70it/s]


frequent_items [['ELE99737', 'SNA45677', 'DAI85309'], ['ELE99737', 'SNA45677', 'GRO94758'], ['SNA45677', 'DAI85309', 'GRO46854'], ['SNA45677', 'GRO94758', 'DAI85309'], ['ELE78169', 'SNA45677', 'GRO94758'], ['DAI85309', 'ELE99737', 'SNA49107'], ['ELE99737', 'GRO94758', 'DAI85309'], ['SNA80324', 'FRO40251', 'SNA90094'], ['SNA80324', 'FRO40251', 'DAI75645'], ['ELE91337', 'DAI62779', 'SNA45677'], ['DAI62779', 'FRO32293', 'GRO46854'], ['DAI62779', 'DAI87448', 'GRO46854'], ['ELE91337', 'DAI62779', 'DAI87448'], ['DAI62779', 'DAI48891', 'DAI83733'], ['GRO56726', 'GRO73461', 'DAI85309'], ['GRO38814', 'GRO73461', 'DAI22896'], ['FRO40251', 'DAI22896', 'GRO38814'], ['DAI62779', 'DAI22896', 'GRO38814'], ['GRO38814', 'ELE17451', 'DAI22896'], ['FRO40251', 'GRO73461', 'GRO38814'], ['DAI62779', 'GRO73461', 'GRO38814'], ['GRO73461', 'ELE17451', 'GRO38814'], ['DAI62779', 'DAI22896', 'FRO40251'], ['FRO40251', 'DAI22896', 'GRO73461'], ['FRO40251', 'ELE17451', 'DAI22896'], ['DAI62779', 'DAI22896', 'GRO73461

Generating candidates: 100%|██████████| 1043/1043 [00:00<00:00, 2175.89it/s]


Frequent itemsets in sample: [[['SNA45677'], ['DAI22896'], ['ELE99737'], ['GRO38814'], ['DAI85309'], ['SNA49107'], ['ELE78169'], ['GRO46854'], ['ELE42696'], ['GRO94758'], ['SNA93730'], ['SNA12663'], ['SNA90094'], ['FRO40251'], ['FRO78994'], ['DAI75645'], ['SNA80324'], ['ELE22970'], ['ELE85027'], ['DAI62779'], ['FRO94523'], ['GRO38983'], ['DAI89320'], ['SNA96271'], ['DAI64292'], ['ELE86561'], ['ELE56788'], ['DAI63921'], ['SNA71244'], ['ELE52966'], ['GRO81900'], ['SNA93860'], ['ELE30933'], ['GRO76157'], ['GRO99222'], ['FRO32293'], ['ELE91337'], ['SNA55952'], ['DAI87448'], ['ELE88031'], ['SNA29014'], ['DAI67621'], ['GRO29389'], ['DAI55148'], ['ELE26917'], ['GRO44993'], ['ELE66600'], ['DAI83948'], ['ELE69750'], ['GRO64900'], ['FRO57082'], ['GRO56726'], ['ELE38511'], ['GRO24246'], ['SNA61470'], ['DAI48891'], ['SNA55617'], ['DAI83733'], ['FRO26482'], ['FRO53271'], ['ELE49863'], ['FRO99756'], ['DAI73122'], ['GRO84328'], ['SNA55762'], ['GRO73461'], ['ELE87456'], ['FRO16142'], ['SNA42528'], ['S

Generating negative border: 100%|██████████| 451/451 [00:00<00:00, 12397.15it/s]


Negative border items: []
Starting pass two...
Generated 18929 baskets from input file.


Checking negative border items: 0it [00:00, ?it/s]
Filtering frequent items: 100%|██████████| 451/451 [00:03<00:00, 145.91it/s]
Filtering frequent items: 100%|██████████| 771/771 [00:05<00:00, 136.20it/s]
Filtering frequent items: 100%|██████████| 1043/1043 [00:08<00:00, 118.74it/s]

Frequent itemsets after pass two: [[['SNA45677'], ['DAI22896'], ['ELE99737'], ['GRO38814'], ['DAI85309'], ['SNA49107'], ['ELE78169'], ['GRO46854'], ['ELE42696'], ['GRO94758'], ['SNA93730'], ['SNA12663'], ['SNA90094'], ['FRO40251'], ['FRO78994'], ['DAI75645'], ['SNA80324'], ['ELE22970'], ['ELE85027'], ['DAI62779'], ['FRO94523'], ['GRO38983'], ['DAI89320'], ['SNA96271'], ['DAI64292'], ['ELE86561'], ['ELE56788'], ['DAI63921'], ['SNA71244'], ['ELE52966'], ['GRO81900'], ['SNA93860'], ['ELE30933'], ['GRO76157'], ['GRO99222'], ['FRO32293'], ['ELE91337'], ['SNA55952'], ['DAI87448'], ['ELE88031'], ['SNA29014'], ['DAI67621'], ['DAI55148'], ['ELE26917'], ['GRO44993'], ['ELE66600'], ['DAI83948'], ['ELE69750'], ['GRO64900'], ['FRO57082'], ['GRO56726'], ['ELE38511'], ['GRO24246'], ['DAI48891'], ['SNA55617'], ['DAI83733'], ['FRO26482'], ['FRO53271'], ['ELE49863'], ['FRO99756'], ['DAI73122'], ['GRO84328'], ['SNA55762'], ['GRO73461'], ['ELE87456'], ['FRO16142'], ['SNA42528'], ['SNA44190'], ['ELE17451']




In [36]:
frequent_itemsets = results[2]
frequent_itemsets

[[['SNA45677'],
  ['DAI22896'],
  ['ELE99737'],
  ['GRO38814'],
  ['DAI85309'],
  ['SNA49107'],
  ['ELE78169'],
  ['GRO46854'],
  ['ELE42696'],
  ['GRO94758'],
  ['SNA93730'],
  ['SNA12663'],
  ['SNA90094'],
  ['FRO40251'],
  ['FRO78994'],
  ['DAI75645'],
  ['SNA80324'],
  ['ELE22970'],
  ['ELE85027'],
  ['DAI62779'],
  ['FRO94523'],
  ['GRO38983'],
  ['DAI89320'],
  ['SNA96271'],
  ['DAI64292'],
  ['ELE86561'],
  ['ELE56788'],
  ['DAI63921'],
  ['SNA71244'],
  ['ELE52966'],
  ['GRO81900'],
  ['SNA93860'],
  ['ELE30933'],
  ['GRO76157'],
  ['GRO99222'],
  ['FRO32293'],
  ['ELE91337'],
  ['SNA55952'],
  ['DAI87448'],
  ['ELE88031'],
  ['SNA29014'],
  ['DAI67621'],
  ['DAI55148'],
  ['ELE26917'],
  ['GRO44993'],
  ['ELE66600'],
  ['DAI83948'],
  ['ELE69750'],
  ['GRO64900'],
  ['FRO57082'],
  ['GRO56726'],
  ['ELE38511'],
  ['GRO24246'],
  ['DAI48891'],
  ['SNA55617'],
  ['DAI83733'],
  ['FRO26482'],
  ['FRO53271'],
  ['ELE49863'],
  ['FRO99756'],
  ['DAI73122'],
  ['GRO84328'],
  ['SNA5

In [37]:

baskets = generate_baskets('browsing.txt')
rules = generate_association_rules(frequent_itemsets, baskets, min_confidence=0.5)


print("\nAssociation Rules with confidence higher than 50%:")
for rule in rules:
    antecedent, consequent, confidence = rule
    print(f"{set(antecedent)} -> {set(consequent)} (confidence: {confidence:.2f})")


Generated 18929 baskets from input file.

Association Rules with confidence higher than 50%:
{'GRO85051'} -> {'FRO40251'} (confidence: 1.00)
{'ELE59028'} -> {'DAI62779'} (confidence: 0.59)
{'DAI55148'} -> {'DAI62779'} (confidence: 0.57)
{'FRO47962'} -> {'DAI75645'} (confidence: 0.80)
{'FRO19221'} -> {'DAI62779'} (confidence: 0.65)
{'DAI83031'} -> {'DAI94679'} (confidence: 0.52)
{'DAI88079'} -> {'FRO40251'} (confidence: 0.99)
{'DAI43868'} -> {'SNA82528'} (confidence: 0.95)
{'SNA82528'} -> {'DAI43868'} (confidence: 0.51)
{'FRO92469'} -> {'FRO40251'} (confidence: 0.99)
{'DAI93865'} -> {'FRO40251'} (confidence: 1.00)
{'ELE20847'} -> {'FRO40251'} (confidence: 0.53)
{'SNA53220'} -> {'DAI62779'} (confidence: 0.57)
{'SNA53220'} -> {'SNA93860'} (confidence: 0.56)
{'SNA30533'} -> {'SNA96271'} (confidence: 0.50)
{'ELE88583'} -> {'SNA24799'} (confidence: 0.59)
{'DAI43223'} -> {'ELE32164'} (confidence: 0.58)
{'GRO59710'} -> {'ELE32164'} (confidence: 0.51)
{'ELE32244'} -> {'ELE66600'} (confidence: 0