Read the groceries.csv file data/groceries.csv"
Find the most frequent itemsets for a support threshold of 50, 100 and 200.
Find the interesting associations for thresholds above 0.7 (Note: the number of baskets in this case is the number of lines in the file)

In [30]:
import itertools

def readdata_hands_on(k, fname="data/groceries.csv"):
    
    with open(fname, "rt", encoding='latin1') as f:
        for line in f:
            C_k  = line.rstrip().split(',')
            for itemset in itertools.combinations(C_k, k):
                    yield frozenset(itemset) 

In [31]:
nitems = 20
for C_k in readdata_hands_on(k=2):
    print(C_k)
    
    nitems -= 1
    if nitems == 0: 
        break

frozenset({'semi-finished bread', 'citrus fruit'})
frozenset({'margarine', 'citrus fruit'})
frozenset({'ready soups', 'citrus fruit'})
frozenset({'semi-finished bread', 'margarine'})
frozenset({'semi-finished bread', 'ready soups'})
frozenset({'margarine', 'ready soups'})
frozenset({'yogurt', 'tropical fruit'})
frozenset({'coffee', 'tropical fruit'})
frozenset({'yogurt', 'coffee'})
frozenset({'yogurt', 'pip fruit'})
frozenset({'cream cheese', 'pip fruit'})
frozenset({'meat spreads', 'pip fruit'})
frozenset({'cream cheese', 'yogurt'})
frozenset({'yogurt', 'meat spreads'})
frozenset({'cream cheese', 'meat spreads'})
frozenset({'other vegetables', 'whole milk'})
frozenset({'condensed milk', 'other vegetables'})
frozenset({'long life bakery product', 'other vegetables'})
frozenset({'condensed milk', 'whole milk'})
frozenset({'long life bakery product', 'whole milk'})


In [32]:
import time


def get_C(k):

    start = time.time()
    C = {}
    for key in readdata_hands_on(k):  # False report
        if key not in C:
            C[key] = 1
        else:
            C[key] += 1
    print("Took {}s for k={}".format((time.time() - start), k))
    return C


C1 = get_C(1)
C2 = get_C(2)

Took 0.024425268173217773s for k=1
Took 0.07533621788024902s for k=2


In [33]:
%time
s = 50# support threshold
L2_1 = {}
for key, n in C2.items():
    if n >= s:
        L2_1[key] = n
print('{} items with >{} occurances'.format(len(L2_1), s))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs
605 items with >50 occurances


In [34]:
%time
s = 100# support threshold
L2_2 = {}
for key, n in C2.items():
    if n >= s:
        L2_2[key] = n
print('{} items with >{} occurances'.format(len(L2_2), s))

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.34 µs
207 items with >100 occurances


In [36]:
%time
s = 200# support threshold
L2_3 = {}
for key, n in C2.items():
    if n >= s:
        L2_3[key] = n
print('{} items with >{} occurances'.format(len(L2_3), s))

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 3.34 µs
60 items with >200 occurances


In [37]:
L2_1

{frozenset({'citrus fruit', 'margarine'}): 78,
 frozenset({'tropical fruit', 'yogurt'}): 288,
 frozenset({'coffee', 'tropical fruit'}): 70,
 frozenset({'coffee', 'yogurt'}): 96,
 frozenset({'pip fruit', 'yogurt'}): 177,
 frozenset({'cream cheese', 'pip fruit'}): 60,
 frozenset({'cream cheese', 'yogurt'}): 122,
 frozenset({'other vegetables', 'whole milk'}): 736,
 frozenset({'long life bakery product', 'other vegetables'}): 105,
 frozenset({'long life bakery product', 'whole milk'}): 133,
 frozenset({'butter', 'whole milk'}): 271,
 frozenset({'whole milk', 'yogurt'}): 551,
 frozenset({'butter', 'yogurt'}): 144,
 frozenset({'UHT-milk', 'other vegetables'}): 80,
 frozenset({'other vegetables', 'rolls/buns'}): 419,
 frozenset({'bottled beer', 'other vegetables'}): 159,
 frozenset({'UHT-milk', 'rolls/buns'}): 63,
 frozenset({'bottled beer', 'rolls/buns'}): 134,
 frozenset({'other vegetables', 'tropical fruit'}): 353,
 frozenset({'tropical fruit', 'white bread'}): 86,
 frozenset({'bottled wa

In [41]:
L2_1 = [elem for elem in list(L2_1) if len(elem) > 1] 

In [43]:
for i in range(len(L2_1)):

    A, B = list(L2_1[i])
    support_AB = C2[frozenset([A, B])]
    support_A = C1[frozenset([A])]
    conf_A_leads_to_B = support_AB / support_A

    support_B = C1[frozenset([B])]
    prob_B = support_B / 9835

    interest_A_leads_to_B = conf_A_leads_to_B - prob_B

    if interest_A_leads_to_B > 0.1:
        print("{} --> {} with interest {:3f}".format(A, B,
                                                     interest_A_leads_to_B))

yogurt --> tropical fruit with interest 0.104981
cream cheese --> yogurt with interest 0.173319
other vegetables --> whole milk with interest 0.131242
long life bakery product --> whole milk with interest 0.105897
butter --> whole milk with interest 0.241732
yogurt --> whole milk with interest 0.146087
butter --> yogurt with interest 0.124718
citrus fruit --> tropical fruit with interest 0.135855
citrus fruit --> whole milk with interest 0.113034
citrus fruit --> yogurt with interest 0.122169
tropical fruit --> whole milk with interest 0.147585
curd --> whole milk with interest 0.234942
flour --> whole milk with interest 0.229864
other vegetables --> root vegetables with interest 0.135878
flour --> other vegetables with interest 0.169080
sausage --> rolls/buns with interest 0.141823
hygiene articles --> napkins with interest 0.132821
root vegetables --> whole milk with interest 0.193178
sugar --> whole milk with interest 0.188928
pork --> whole milk with interest 0.128964
whipped/sour 

In [45]:
c1 = {}
for key in readdata_hands_on(k=1):
    if key not in c1:
        c1[key] = 1
    else:
        c1[key] += 1    
        
print("{} items".format(len(c1)))

169 items


In [47]:
# filter stage
N = 10
l1 = {}
for key, count in c1.items():
    if count >= N:
        l1[key] = count
print('{} items with >{} occurances'.format(len(l1), N))

157 items with >10 occurances


In [48]:
N = 20
l1 = {}
for key, count in c1.items():
    if count >= N:
        l1[key] = count
print('{} items with >{} occurances'.format(len(l1), N))

147 items with >20 occurances


In [50]:
c2_items = set([a.union(b) for a in l1.keys() for b in l1.keys()])

In [51]:
# find frequent 2-tuples
c2 = {}
for key in readdata_hands_on(k=2):
    # filter out non-frequent tuples
    if key not in c2_items:
        continue

    # record frequent tuples
    if key not in c2:
        c2[key] = 1
    else:
        c2[key] += 1
        
print("{} items".format(len(c2)))

8720 items


In [52]:
l2 = {}
for key, count in c2.items():
    if count >= N:
        l2[key] = count
print('A-priori: {} items with >{} occurances'.format(len(l2), N))

A-priori: 1674 items with >20 occurances


In [53]:
import numpy as np

In [54]:
# hash table
max_hash1 = 10 * 1000000
H1 = np.zeros((max_hash1, ), dtype=np.int)

for key in readdata_hands_on(k=2):
    hash_cell_1 = hash(key) % max_hash1
    H1[hash_cell_1] += 1

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  H1 = np.zeros((max_hash1, ), dtype=np.int)


In [57]:
C2 = {}
N = 100
for key in readdata_hands_on(k=2):
    # hash-based filtering stage from PCY
    hash_cell_1 = hash(key) % max_hash1
    if H1[hash_cell_1] < N:
        continue

    # filter out non-frequent tuples
    if key not in c2_items:
        continue

    # record frequent tuples
    if key not in c2:
        c2[key] = 1
    else:
        c2[key] += 1
        
print("{} items".format(len(c2)))

8720 items


In [58]:
N = 100
l2 = {}
for key, count in c2.items():
    if count >= N:
        l2[key] = count
print('{} items with >{} occurances'.format(len(l2), N))

207 items with >100 occurances
