Suppose there are 9 baskets, each represented as a list. The items are labeled 1, 2, 3, 4 and 5. Itemsets that appear in at least two baskets (support >= 2) are frequent. The baskets are initialized as below:

In [7]:
# Each inner list is one basket, holding the ids of the items it contains.
baskets = [
    [1, 2, 5],
    [2, 4],
    [2, 3],
    [1, 2, 4],
    [1, 3],
    [2, 3],
    [1, 3],
    [1, 2, 3, 5],
    [1, 2, 3],
]
# Initial candidate itemsets: one single-item itemset per item id.
C_0 = [[item] for item in (1, 2, 3, 4, 5)]

In the first pass, count the support of each item:

In [9]:
def count_sup(baskets,C):
    """Return the support of every itemset in C.

    The support of an itemset is the number of baskets that contain all of
    its items. The returned list is aligned with C: entry i is the support
    of C[i].
    """
    return [
        # Each containing basket contributes True (== 1) to the sum.
        sum(set(candidate).issubset(basket) for basket in baskets)
        for candidate in C
    ]
# Support of each single-item candidate over all baskets.
sup_0 = count_sup(baskets=baskets, C=C_0)
print(sup_0)

[6, 7, 6, 2, 2]


Then, prune the non-frequent itemsets:

In [15]:
def prune(C,sup,freq = 2):
    """Split the itemsets in C into frequent and non-frequent ones.

    sup[i] is taken as the support of C[i]; an itemset is frequent when its
    support is at least freq. Returns a pair (frequent, non_frequent), each
    list keeping the original order of C.
    """
    kept, dropped = [], []
    for position, itemset in enumerate(C):
        target = kept if sup[position] >= freq else dropped
        target.append(itemset)
    return kept, dropped
# Keep the frequent single-item itemsets; remember the rejected ones.
L_0, pruned = prune(C=C_0, sup=sup_0)
print(L_0, pruned, sep="\n")

[[1], [2], [3], [4], [5]]
[]


In each iteration, the new itemsets are generated using the following function:

In [24]:
# Start the running list of non-frequent itemsets from the first pruning
# pass (this binds the same list object as `pruned`, not a copy).
pruned_sets = pruned
def generate(L,pruned_sets):
    """Build the next round of candidate itemsets from the itemsets in L.

    Every pair of itemsets in L is joined by set union. A union is kept as
    a candidate only if:

    * both joined itemsets have the same size k and the union has exactly
      k + 1 items (joining two k-itemsets that overlap in fewer than k - 1
      items would skip a level, so such unions are discarded);
    * it does not repeat a candidate already produced; and
    * it contains no itemset from pruned_sets, because a superset of a
      non-frequent itemset cannot be frequent.

    Args:
        L: itemsets to join, each a list of items.
        pruned_sets: itemsets already known to be non-frequent. Not modified.

    Returns:
        A list of candidates, each a sorted list of items, in the order in
        which they were first generated.
    """
    # Convert to sets once instead of rebuilding the check list per pair.
    members = [set(itemset) for itemset in L]
    blockers = [set(itemset) for itemset in pruned_sets]
    new_C = []
    for i, left in enumerate(members):
        for right in members[i + 1:]:
            # Only a k-itemset joined with a k-itemset into a (k+1)-itemset
            # is a valid next-level candidate.
            if len(left) != len(right):
                continue
            tmp = left | right
            if len(tmp) != len(left) + 1:
                continue
            # Rejects both repetitions and supersets of non-frequent sets.
            if any(blocker.issubset(tmp) for blocker in blockers):
                continue
            new_C.append(sorted(tmp))
            # Accepted candidates block later repetitions of themselves.
            blockers.append(tmp)
    return new_C
# Candidate pairs built from the frequent single-item itemsets.
print(generate(L=L_0, pruned_sets=pruned_sets))

[[1, 2], [1, 3], [1, 4], [1, 5], [2, 3], [2, 4], [2, 5], [3, 4], [3, 5], [4, 5]]


Each pass (count the supports, prune the non-frequent itemsets, generate the next candidates) is wrapped up in the following function:

In [32]:
# No itemset has been found non-frequent yet.
pruned_sets = list()
def update(baskets,C,pruned_sets,freq=2):
    """Run one pass over the candidates C.

    Counts the support of each candidate, separates frequent from
    non-frequent ones using the threshold freq, and generates the next
    candidates from the frequent ones.

    Returns a pair (new_C, pruned_sets): the next candidates, and the
    incoming pruned_sets concatenated with the itemsets rejected in this
    pass (the concatenation builds a new list).
    """
    frequent, rejected = prune(C, count_sup(baskets, C), freq)
    accumulated = pruned_sets + rejected
    return generate(frequent, accumulated), accumulated
# One pass starting from the single-item candidates.
C_1, pruned_sets = update(baskets, C_0, pruned_sets)
print(C_1, pruned_sets, sep="\n")

[[1, 2], [1, 3], [1, 4], [1, 5], [2, 3], [2, 4], [2, 5], [3, 4], [3, 5], [4, 5]]
[]


Therefore, the A-priori algorithm is implemented as follows:

In [34]:
baskets = [
    [1, 2, 5],
    [2, 4],
    [2, 3],
    [1, 2, 4],
    [1, 3],
    [2, 3],
    [1, 3],
    [1, 2, 3, 5],
    [1, 2, 3],
]
C = [[item] for item in (1, 2, 3, 4, 5)]
non_frequent = []
# Repeat passes, printing each new candidate list, until a pass yields no
# further candidates.
while True:
    C, non_frequent = update(baskets, C, non_frequent)
    print(C)
    if not C:
        break

[[1, 2], [1, 3], [1, 4], [1, 5], [2, 3], [2, 4], [2, 5], [3, 4], [3, 5], [4, 5]]
[[1, 2, 3], [1, 2, 5]]
[]


Using the example in the slides:

In [39]:
baskets = [
    ['m', 'c', 'b'],
    ['m', 'p', 'j'],
    ['m', 'c', 'b', 'n'],
    ['c', 'j'],
    ['m', 'p', 'b'],
    ['m', 'c', 'b', 'j'],
    ['c', 'b', 'j'],
    ['b', 'c'],
]
C = [['m'], ['b'], ['c'], ['p'], ['j'], ['n']]
freq = 3  # support threshold for this example
non_frequent = []
# Repeat passes, printing each new candidate list, until a pass yields no
# further candidates.
while True:
    C, non_frequent = update(baskets, C, non_frequent, freq)
    print(C)
    if not C:
        break

[['b', 'm'], ['c', 'm'], ['j', 'm'], ['b', 'c'], ['b', 'j'], ['c', 'j']]
[['b', 'c', 'm']]
[]
