## Student Names: Anh Ha, Dat Nguyen, Phuong Nguyen

## 1. Download the groceries.csv file from itslearning (also the file is in the folder homework inside the zip of this lecture)

In [1]:
# reading csv file and load all items in all baskets (each line is a basket)
with open("groceries.csv", "rt", encoding='latin1') as f:
    basket = []
    i = 0
    for line in f:
        line = line.replace("\n","")
        items = line.split(",")
        basket.append(items)
    print(len(basket))
# printing first 10 baskets
for i in basket[:10]:
    print(i)

9835
['citrus fruit', 'semi-finished bread', 'margarine', 'ready soups']
['tropical fruit', 'yogurt', 'coffee']
['whole milk']
['pip fruit', 'yogurt', 'cream cheese', 'meat spreads']
['other vegetables', 'whole milk', 'condensed milk', 'long life bakery product']
['whole milk', 'butter', 'yogurt', 'rice', 'abrasive cleaner']
['rolls/buns']
['other vegetables', 'UHT-milk', 'rolls/buns', 'bottled beer', 'liquor (appetizer)']
['potted plants']
['whole milk', 'cereals']


### Generate and count itemsets

In [2]:
import itertools

def readdata(k, fname="groceries.csv", report=True):
    C_k = []
    b = 0
    
    with open("groceries.csv", "rt", encoding='latin1') as f:
        lines = f.readlines()
        for line in lines:
            line = line.replace('\n', '')  # remove newline symbol
            for i in line.split(','):
                C_k.append(i)
        
            # end of basket, report all itemsets
            for itemset in itertools.combinations(C_k, k):
                yield frozenset(itemset)
            C_k = []
                
            # report progress
            # print every 1000th element to reduce clutter
            if report:
                if b % 5000 == 0 and b > 0:  
                    print('processing bin ', b)
                b += 1 

In [3]:
nitems = 11
for C_k in readdata(k=2):
    print(C_k)
    
    nitems -= 1
    if nitems == 0: 
        break

frozenset({'citrus fruit', 'semi-finished bread'})
frozenset({'citrus fruit', 'margarine'})
frozenset({'citrus fruit', 'ready soups'})
frozenset({'semi-finished bread', 'margarine'})
frozenset({'semi-finished bread', 'ready soups'})
frozenset({'margarine', 'ready soups'})
frozenset({'tropical fruit', 'yogurt'})
frozenset({'tropical fruit', 'coffee'})
frozenset({'coffee', 'yogurt'})
frozenset({'yogurt', 'pip fruit'})
frozenset({'cream cheese', 'pip fruit'})


## 2. Find the frequent pair of items (2-tuples) using the naïve, A-priori and PCY algorithms. For each of these compare the time of execution and results for supports s=10, 50, 100. Comment your results.

### Naive method

In [4]:
import time

def naive_method(k,s):
    t = time.time()
    C = {}
    for key in readdata(k,report=False):
        if key not in C:
            C[key] = 1
        else:
            C[key] += 1
            
    print("{} initial itemsets to be filtered".format(len(C)))
    
    
    L = {}
    for key, n in C.items():
        if n >= s:
            L[key] = n
    t1 = time.time() - t
    print('Naive method took {} seconds'.format(t1))
    print('{} candidates with >{} occurances'.format(len(L), s))
    print('')
    return L

### Apriori algorithm

In [5]:
def apriori(s):
    t = time.time()
    C1 = {}
    for key in readdata(1, report=False):
        if key not in C1:
            C1[key] = 1
        else:
            C1[key] += 1
    
    
    L1 = {}
    for key, count in C1.items():
        if count >= s:
            L1[key] = count
    
    
    C2_items = set([a.union(b) for a in L1.keys() for b in L1.keys()])
    C2 = {}
    for key in readdata(2, report=False):
        # filter out non-frequent tuples
        if key not in C2_items:
            continue

        # record frequent tuples
        if key not in C2:
            C2[key] = 1
        else:
            C2[key] += 1
    
            
    print("{} initial itemsets to be filtered".format(len(C2)))
    
    
    L2 = {}
    for key, count in C2.items():
        if count >= s:
            L2[key] = count
    t1 = time.time() - t
    
 
        
    print('Apriori algorithm took {} seconds'.format(t1))
    print('{} candidates with >{} occurances'.format(len(L2), s))
    print('')
    return L2

### PCY improvement Apriori

In [6]:
import numpy as np
def pcy(s):
    t = time.time()
    max_hash1 = 15000
    H1 = np.zeros((max_hash1,), dtype=np.int)
    for key in readdata(k=2, report=False):
        hash_cell_1 = hash(key) % max_hash1
        H1[hash_cell_1] += 1
    
    
    C2 = {}
    for key in readdata(k=2, report=False):
        # hash-based filtering stage from PCY
        hash_cell_1 = hash(key) % max_hash1
        if H1[hash_cell_1] < s:
            continue

        # filter out non-frequent tuples
#         if key not in C2_items:
#             continue

        # record frequent tuples
        if key not in C2:
            C2[key] = 1
        else:
            C2[key] += 1
    
            
    print("{} initial itemsets to be filtered".format(len(C2)))
    
    
    L2 = {}
    for key, count in C2.items():
        if count >= s:
            L2[key] = count
    t1 = time.time() - t
    print('PCY-algorithm took {} seconds'.format(t1))
    print('{} candidates with >{} occurances'.format(len(L2), s))
    print('')
    return L2

In [7]:
#Naive method with k=1
for i in [10,50,100]:
    naive_method(1,i)

169 initial itemsets to be filtered
Naive method took 0.06696081161499023 seconds
157 candidates with >10 occurances

169 initial itemsets to be filtered
Naive method took 0.06496024131774902 seconds
120 candidates with >50 occurances

169 initial itemsets to be filtered
Naive method took 0.051969051361083984 seconds
88 candidates with >100 occurances



In [8]:
naive_method(1,100)

169 initial itemsets to be filtered
Naive method took 0.06297135353088379 seconds
88 candidates with >100 occurances



{frozenset({'citrus fruit'}): 814,
 frozenset({'semi-finished bread'}): 174,
 frozenset({'margarine'}): 576,
 frozenset({'tropical fruit'}): 1032,
 frozenset({'yogurt'}): 1372,
 frozenset({'coffee'}): 571,
 frozenset({'whole milk'}): 2513,
 frozenset({'pip fruit'}): 744,
 frozenset({'cream cheese'}): 390,
 frozenset({'other vegetables'}): 1903,
 frozenset({'condensed milk'}): 101,
 frozenset({'long life bakery product'}): 368,
 frozenset({'butter'}): 545,
 frozenset({'rolls/buns'}): 1809,
 frozenset({'UHT-milk'}): 329,
 frozenset({'bottled beer'}): 792,
 frozenset({'potted plants'}): 170,
 frozenset({'white bread'}): 414,
 frozenset({'bottled water'}): 1087,
 frozenset({'chocolate'}): 488,
 frozenset({'curd'}): 524,
 frozenset({'flour'}): 171,
 frozenset({'dishes'}): 173,
 frozenset({'beef'}): 516,
 frozenset({'frankfurter'}): 580,
 frozenset({'soda'}): 1715,
 frozenset({'chicken'}): 422,
 frozenset({'sugar'}): 333,
 frozenset({'fruit/vegetable juice'}): 711,
 frozenset({'newspapers'})

In [9]:
# Runw ith s=10,50, 100
for i in [10,50,100]:
    naive_method(2,i)
    apriori(i)
    pcy(i)

9636 initial itemsets to be filtered
Naive method took 0.15721344947814941 seconds
2981 candidates with >10 occurances

9300 initial itemsets to be filtered
Apriori algorithm took 0.31263256072998047 seconds
2981 candidates with >10 occurances

4673 initial itemsets to be filtered
PCY-algorithm took 0.44062209129333496 seconds
2981 candidates with >10 occurances

9636 initial itemsets to be filtered
Naive method took 0.18289542198181152 seconds
605 candidates with >50 occurances

6621 initial itemsets to be filtered
Apriori algorithm took 0.2528722286224365 seconds
605 candidates with >50 occurances

1173 initial itemsets to be filtered
PCY-algorithm took 0.4036295413970947 seconds
605 candidates with >50 occurances

9636 initial itemsets to be filtered
Naive method took 0.15890908241271973 seconds
207 candidates with >100 occurances

3781 initial itemsets to be filtered
Apriori algorithm took 0.23510408401489258 seconds
207 candidates with >100 occurances

383 initial itemsets to be f

### Comments: 
1. Hash table was defined higher a little bit from 2^169, for  example: 15000 (Ref: (Park, Jong, Chen, Ming-syan, Yu, Philip, An Effective Hash-Based Algorithm for Mining Association Rules, 1997/08/18)
 
2. Execution time for PCY algorithm longer than Apriori, Naive method has teh shortest execution time. The reason we counted total execution time from readdata to C2-->L2. We also tested if we just check time for C2-->L2, PCY algorithm is shorter than Apriori algorithm. 

3. The number of pair itemsets to be filtered found in PCY algorithm less than A-Priori and and Näive method. Dimensions reduced, for example: 50% (s=10), 80% (s=50), 89% (s=100).   

4. s=100: the number of frequent items found 207, s= 50: the number of frequent items found 605, s=10: the number of frequent items found 2981. The higher support value the smaller possibility to find frequent items. 

### Naive method frequent pairs

In [10]:
naive_method(2,10)

9636 initial itemsets to be filtered
Naive method took 0.13739967346191406 seconds
2981 candidates with >10 occurances



{frozenset({'citrus fruit', 'semi-finished bread'}): 24,
 frozenset({'citrus fruit', 'margarine'}): 78,
 frozenset({'margarine', 'semi-finished bread'}): 20,
 frozenset({'tropical fruit', 'yogurt'}): 288,
 frozenset({'coffee', 'tropical fruit'}): 70,
 frozenset({'coffee', 'yogurt'}): 96,
 frozenset({'pip fruit', 'yogurt'}): 177,
 frozenset({'cream cheese', 'pip fruit'}): 60,
 frozenset({'cream cheese', 'yogurt'}): 122,
 frozenset({'meat spreads', 'yogurt'}): 18,
 frozenset({'cream cheese', 'meat spreads'}): 11,
 frozenset({'other vegetables', 'whole milk'}): 736,
 frozenset({'condensed milk', 'other vegetables'}): 25,
 frozenset({'long life bakery product', 'other vegetables'}): 105,
 frozenset({'condensed milk', 'whole milk'}): 24,
 frozenset({'long life bakery product', 'whole milk'}): 133,
 frozenset({'butter', 'whole milk'}): 271,
 frozenset({'whole milk', 'yogurt'}): 551,
 frozenset({'rice', 'whole milk'}): 46,
 frozenset({'abrasive cleaner', 'whole milk'}): 16,
 frozenset({'butte

In [11]:
naive_method(2,50)

9636 initial itemsets to be filtered
Naive method took 0.15850520133972168 seconds
605 candidates with >50 occurances



{frozenset({'citrus fruit', 'margarine'}): 78,
 frozenset({'tropical fruit', 'yogurt'}): 288,
 frozenset({'coffee', 'tropical fruit'}): 70,
 frozenset({'coffee', 'yogurt'}): 96,
 frozenset({'pip fruit', 'yogurt'}): 177,
 frozenset({'cream cheese', 'pip fruit'}): 60,
 frozenset({'cream cheese', 'yogurt'}): 122,
 frozenset({'other vegetables', 'whole milk'}): 736,
 frozenset({'long life bakery product', 'other vegetables'}): 105,
 frozenset({'long life bakery product', 'whole milk'}): 133,
 frozenset({'butter', 'whole milk'}): 271,
 frozenset({'whole milk', 'yogurt'}): 551,
 frozenset({'butter', 'yogurt'}): 144,
 frozenset({'UHT-milk', 'other vegetables'}): 80,
 frozenset({'other vegetables', 'rolls/buns'}): 419,
 frozenset({'bottled beer', 'other vegetables'}): 159,
 frozenset({'UHT-milk', 'rolls/buns'}): 63,
 frozenset({'bottled beer', 'rolls/buns'}): 134,
 frozenset({'other vegetables', 'tropical fruit'}): 353,
 frozenset({'tropical fruit', 'white bread'}): 86,
 frozenset({'bottled wa

In [12]:
naive_method(2,100)

9636 initial itemsets to be filtered
Naive method took 0.16180920600891113 seconds
207 candidates with >100 occurances



{frozenset({'tropical fruit', 'yogurt'}): 288,
 frozenset({'pip fruit', 'yogurt'}): 177,
 frozenset({'cream cheese', 'yogurt'}): 122,
 frozenset({'other vegetables', 'whole milk'}): 736,
 frozenset({'long life bakery product', 'other vegetables'}): 105,
 frozenset({'long life bakery product', 'whole milk'}): 133,
 frozenset({'butter', 'whole milk'}): 271,
 frozenset({'whole milk', 'yogurt'}): 551,
 frozenset({'butter', 'yogurt'}): 144,
 frozenset({'other vegetables', 'rolls/buns'}): 419,
 frozenset({'bottled beer', 'other vegetables'}): 159,
 frozenset({'bottled beer', 'rolls/buns'}): 134,
 frozenset({'other vegetables', 'tropical fruit'}): 353,
 frozenset({'bottled water', 'tropical fruit'}): 182,
 frozenset({'other vegetables', 'white bread'}): 135,
 frozenset({'bottled water', 'other vegetables'}): 244,
 frozenset({'chocolate', 'other vegetables'}): 125,
 frozenset({'citrus fruit', 'tropical fruit'}): 196,
 frozenset({'citrus fruit', 'whole milk'}): 300,
 frozenset({'citrus fruit', 

### Apriori frequent pairs

In [13]:
apriori(10)

9300 initial itemsets to be filtered
Apriori algorithm took 0.32376527786254883 seconds
2981 candidates with >10 occurances



{frozenset({'citrus fruit', 'semi-finished bread'}): 24,
 frozenset({'citrus fruit', 'margarine'}): 78,
 frozenset({'margarine', 'semi-finished bread'}): 20,
 frozenset({'tropical fruit', 'yogurt'}): 288,
 frozenset({'coffee', 'tropical fruit'}): 70,
 frozenset({'coffee', 'yogurt'}): 96,
 frozenset({'pip fruit', 'yogurt'}): 177,
 frozenset({'cream cheese', 'pip fruit'}): 60,
 frozenset({'cream cheese', 'yogurt'}): 122,
 frozenset({'meat spreads', 'yogurt'}): 18,
 frozenset({'cream cheese', 'meat spreads'}): 11,
 frozenset({'other vegetables', 'whole milk'}): 736,
 frozenset({'condensed milk', 'other vegetables'}): 25,
 frozenset({'long life bakery product', 'other vegetables'}): 105,
 frozenset({'condensed milk', 'whole milk'}): 24,
 frozenset({'long life bakery product', 'whole milk'}): 133,
 frozenset({'butter', 'whole milk'}): 271,
 frozenset({'whole milk', 'yogurt'}): 551,
 frozenset({'rice', 'whole milk'}): 46,
 frozenset({'abrasive cleaner', 'whole milk'}): 16,
 frozenset({'butte

In [14]:
apriori(50)

6621 initial itemsets to be filtered
Apriori algorithm took 0.26178836822509766 seconds
605 candidates with >50 occurances



{frozenset({'citrus fruit', 'margarine'}): 78,
 frozenset({'tropical fruit', 'yogurt'}): 288,
 frozenset({'coffee', 'tropical fruit'}): 70,
 frozenset({'coffee', 'yogurt'}): 96,
 frozenset({'pip fruit', 'yogurt'}): 177,
 frozenset({'cream cheese', 'pip fruit'}): 60,
 frozenset({'cream cheese', 'yogurt'}): 122,
 frozenset({'other vegetables', 'whole milk'}): 736,
 frozenset({'long life bakery product', 'other vegetables'}): 105,
 frozenset({'long life bakery product', 'whole milk'}): 133,
 frozenset({'butter', 'whole milk'}): 271,
 frozenset({'whole milk', 'yogurt'}): 551,
 frozenset({'butter', 'yogurt'}): 144,
 frozenset({'UHT-milk', 'other vegetables'}): 80,
 frozenset({'other vegetables', 'rolls/buns'}): 419,
 frozenset({'bottled beer', 'other vegetables'}): 159,
 frozenset({'UHT-milk', 'rolls/buns'}): 63,
 frozenset({'bottled beer', 'rolls/buns'}): 134,
 frozenset({'other vegetables', 'tropical fruit'}): 353,
 frozenset({'tropical fruit', 'white bread'}): 86,
 frozenset({'bottled wa

In [15]:
apriori(100)

3781 initial itemsets to be filtered
Apriori algorithm took 0.2320566177368164 seconds
207 candidates with >100 occurances



{frozenset({'tropical fruit', 'yogurt'}): 288,
 frozenset({'pip fruit', 'yogurt'}): 177,
 frozenset({'cream cheese', 'yogurt'}): 122,
 frozenset({'other vegetables', 'whole milk'}): 736,
 frozenset({'long life bakery product', 'other vegetables'}): 105,
 frozenset({'long life bakery product', 'whole milk'}): 133,
 frozenset({'butter', 'whole milk'}): 271,
 frozenset({'whole milk', 'yogurt'}): 551,
 frozenset({'butter', 'yogurt'}): 144,
 frozenset({'other vegetables', 'rolls/buns'}): 419,
 frozenset({'bottled beer', 'other vegetables'}): 159,
 frozenset({'bottled beer', 'rolls/buns'}): 134,
 frozenset({'other vegetables', 'tropical fruit'}): 353,
 frozenset({'bottled water', 'tropical fruit'}): 182,
 frozenset({'other vegetables', 'white bread'}): 135,
 frozenset({'bottled water', 'other vegetables'}): 244,
 frozenset({'chocolate', 'other vegetables'}): 125,
 frozenset({'citrus fruit', 'tropical fruit'}): 196,
 frozenset({'citrus fruit', 'whole milk'}): 300,
 frozenset({'citrus fruit', 

### PCY Algorithm frequent pairs

In [16]:
pcy(10)

4673 initial itemsets to be filtered
PCY-algorithm took 0.4042801856994629 seconds
2981 candidates with >10 occurances



{frozenset({'citrus fruit', 'semi-finished bread'}): 24,
 frozenset({'citrus fruit', 'margarine'}): 78,
 frozenset({'margarine', 'semi-finished bread'}): 20,
 frozenset({'tropical fruit', 'yogurt'}): 288,
 frozenset({'coffee', 'tropical fruit'}): 70,
 frozenset({'coffee', 'yogurt'}): 96,
 frozenset({'pip fruit', 'yogurt'}): 177,
 frozenset({'cream cheese', 'pip fruit'}): 60,
 frozenset({'cream cheese', 'yogurt'}): 122,
 frozenset({'meat spreads', 'yogurt'}): 18,
 frozenset({'cream cheese', 'meat spreads'}): 11,
 frozenset({'other vegetables', 'whole milk'}): 736,
 frozenset({'condensed milk', 'other vegetables'}): 25,
 frozenset({'long life bakery product', 'other vegetables'}): 105,
 frozenset({'condensed milk', 'whole milk'}): 24,
 frozenset({'long life bakery product', 'whole milk'}): 133,
 frozenset({'butter', 'whole milk'}): 271,
 frozenset({'whole milk', 'yogurt'}): 551,
 frozenset({'rice', 'whole milk'}): 46,
 frozenset({'abrasive cleaner', 'whole milk'}): 16,
 frozenset({'butte

In [17]:
pcy(50)

1173 initial itemsets to be filtered
PCY-algorithm took 0.36310863494873047 seconds
605 candidates with >50 occurances



{frozenset({'citrus fruit', 'margarine'}): 78,
 frozenset({'tropical fruit', 'yogurt'}): 288,
 frozenset({'coffee', 'tropical fruit'}): 70,
 frozenset({'coffee', 'yogurt'}): 96,
 frozenset({'pip fruit', 'yogurt'}): 177,
 frozenset({'cream cheese', 'pip fruit'}): 60,
 frozenset({'cream cheese', 'yogurt'}): 122,
 frozenset({'other vegetables', 'whole milk'}): 736,
 frozenset({'long life bakery product', 'other vegetables'}): 105,
 frozenset({'long life bakery product', 'whole milk'}): 133,
 frozenset({'butter', 'whole milk'}): 271,
 frozenset({'whole milk', 'yogurt'}): 551,
 frozenset({'butter', 'yogurt'}): 144,
 frozenset({'UHT-milk', 'other vegetables'}): 80,
 frozenset({'other vegetables', 'rolls/buns'}): 419,
 frozenset({'bottled beer', 'other vegetables'}): 159,
 frozenset({'UHT-milk', 'rolls/buns'}): 63,
 frozenset({'bottled beer', 'rolls/buns'}): 134,
 frozenset({'other vegetables', 'tropical fruit'}): 353,
 frozenset({'tropical fruit', 'white bread'}): 86,
 frozenset({'bottled wa

In [18]:
pcy(100)

383 initial itemsets to be filtered
PCY-algorithm took 0.4187474250793457 seconds
207 candidates with >100 occurances



{frozenset({'tropical fruit', 'yogurt'}): 288,
 frozenset({'pip fruit', 'yogurt'}): 177,
 frozenset({'cream cheese', 'yogurt'}): 122,
 frozenset({'other vegetables', 'whole milk'}): 736,
 frozenset({'long life bakery product', 'other vegetables'}): 105,
 frozenset({'long life bakery product', 'whole milk'}): 133,
 frozenset({'butter', 'whole milk'}): 271,
 frozenset({'whole milk', 'yogurt'}): 551,
 frozenset({'butter', 'yogurt'}): 144,
 frozenset({'other vegetables', 'rolls/buns'}): 419,
 frozenset({'bottled beer', 'other vegetables'}): 159,
 frozenset({'bottled beer', 'rolls/buns'}): 134,
 frozenset({'other vegetables', 'tropical fruit'}): 353,
 frozenset({'bottled water', 'tropical fruit'}): 182,
 frozenset({'other vegetables', 'white bread'}): 135,
 frozenset({'bottled water', 'other vegetables'}): 244,
 frozenset({'chocolate', 'other vegetables'}): 125,
 frozenset({'citrus fruit', 'tropical fruit'}): 196,
 frozenset({'citrus fruit', 'whole milk'}): 300,
 frozenset({'citrus fruit', 

## 3. For the PCY algorithm, create up to 5 compact hash tables. What is the difference in results and time of execution for 1,2,3,4 and 5 tables? Comment your results.

In [19]:
def pcy2(s):
    t = time.time()
    max_hash1 = 15000
    max_hash2 = 7000
    H1 = np.zeros((max_hash1,), dtype=np.int)
    H2 = np.zeros((max_hash2,), dtype=np.int)
    for key in readdata(k=2, report=False):
        hash_cell_1 = hash(key) % max_hash1
        H1[hash_cell_1] += 1
        hash_cell_2 = hash(key) % max_hash2
        H2[hash_cell_2] += 1
    
    C2 = {}
    for key in readdata(k=2, report=False):
        # hash-based filtering stage from PCY
        hash_cell_1 = hash(key) % max_hash1
        if H1[hash_cell_1] < s:
            continue
        hash_cell_2 = hash(key) % max_hash2
        if H2[hash_cell_2] < s:
            continue
        # filter out non-frequent tuples
#         if key not in C2_items:
#             continue

        # record frequent tuples
        if key not in C2:
            C2[key] = 1
        else:
            C2[key] += 1
    
            
    print("{} initial itemsets to be filtered".format(len(C2)))
    
    
    L2 = {}
    for key, count in C2.items():
        if count >= s:
            L2[key] = count
    t1 = time.time() - t
    print('PCY-algorithm took {} seconds'.format(t1))
    print('{} candidates with >{} occurances'.format(len(L2), s))
    print('')
    return L2

In [20]:
# PCY algorithm with k=2, 2 hash tables, and s=10,50,100
for i in [10,50,100]:
    pcy2(i)

3987 initial itemsets to be filtered
PCY-algorithm took 0.5550084114074707 seconds
2981 candidates with >10 occurances

802 initial itemsets to be filtered
PCY-algorithm took 0.5262117385864258 seconds
605 candidates with >50 occurances

241 initial itemsets to be filtered
PCY-algorithm took 0.529714822769165 seconds
207 candidates with >100 occurances



In [21]:
def pcy3(s):
    t = time.time()
    max_hash1 = 15000
    max_hash2 = 7000 
    max_hash3 = 3800
    H1 = np.zeros((max_hash1,), dtype=np.int)
    H2 = np.zeros((max_hash2,), dtype=np.int)
    H3 = np.zeros((max_hash3,), dtype=np.int)
    for key in readdata(k=2, report=False):
        hash_cell_1 = hash(key) % max_hash1
        H1[hash_cell_1] += 1
        hash_cell_2 = hash(key) % max_hash2
        H2[hash_cell_2] += 1
        hash_cell_3 = hash(key) % max_hash3
        H3[hash_cell_3] += 1
    
    C2 = {}
    for key in readdata(k=2, report=False):
        # hash-based filtering stage from PCY
        hash_cell_1 = hash(key) % max_hash1
        if H1[hash_cell_1] < s:
            continue
        hash_cell_2 = hash(key) % max_hash2
        if H2[hash_cell_2] < s:
            continue
        hash_cell_3 = hash(key) % max_hash3
        if H3[hash_cell_3] < s:
            continue    
            
        # filter out non-frequent tuples
#         if key not in C2_items:
#             continue

        # record frequent tuples
        if key not in C2:
            C2[key] = 1
        else:
            C2[key] += 1
    
            
    print("{} initial itemsets to be filtered".format(len(C2)))
    
   
    L2 = {}
    for key, count in C2.items():
        if count >= s:
            L2[key] = count
    t1 = time.time() - t
    print('PCY-algorithm took {} seconds'.format(t1))
    print('{} candidates with >{} occurances'.format(len(L2), s))
    print('')
    return L2

In [22]:
# PCY algorithm with k=2, 3 hash tables, and s=10,50,100
for i in [10,50,100]:
    pcy3(i)

3760 initial itemsets to be filtered
PCY-algorithm took 0.7712762355804443 seconds
2981 candidates with >10 occurances

692 initial itemsets to be filtered
PCY-algorithm took 0.7373056411743164 seconds
605 candidates with >50 occurances

214 initial itemsets to be filtered
PCY-algorithm took 1.1582920551300049 seconds
207 candidates with >100 occurances



In [23]:
def pcy4(s):
    t = time.time()
    max_hash1 = 15000
    max_hash2 = 7000 
    max_hash3 = 3800
    max_hash4 = 1600
    H1 = np.zeros((max_hash1,), dtype=np.int)
    H2 = np.zeros((max_hash2,), dtype=np.int)
    H3 = np.zeros((max_hash3,), dtype=np.int)
    H4 = np.zeros((max_hash4,), dtype=np.int)
    for key in readdata(k=2, report=False):
        hash_cell_1 = hash(key) % max_hash1
        H1[hash_cell_1] += 1
        hash_cell_2 = hash(key) % max_hash2
        H2[hash_cell_2] += 1
        hash_cell_3 = hash(key) % max_hash3
        H3[hash_cell_3] += 1
        hash_cell_4 = hash(key) % max_hash4
        H4[hash_cell_4] += 1
        
    
    C2 = {}
    for key in readdata(k=2, report=False):
        # hash-based filtering stage from PCY
        hash_cell_1 = hash(key) % max_hash1
        if H1[hash_cell_1] < s:
            continue
        hash_cell_2 = hash(key) % max_hash2
        if H2[hash_cell_2] < s:
            continue
        hash_cell_3 = hash(key) % max_hash3
        if H3[hash_cell_3] < s:
            continue    
        hash_cell_4 = hash(key) % max_hash4
        if H4[hash_cell_4] < s:
            continue 
        
        
        
        # filter out non-frequent tuples
#         if key not in C2_items:
#             continue

        # record frequent tuples
        if key not in C2:
            C2[key] = 1
        else:
            C2[key] += 1
    
            
    print("{} initial itemsets to be filtered".format(len(C2)))
    
    
    L2 = {}
    for key, count in C2.items():
        if count >= s:
            L2[key] = count
    t1 = time.time() - t
    print('PCY-algorithm took {} seconds'.format(t1))
    print('{} candidates with >{} occurances'.format(len(L2), s))
    print('')
    return L2

In [24]:
## PCY algorithm with k=2, 4 hash tables, and s=10,50,100
for i in [10,50,100]:
    pcy4(i)

3748 initial itemsets to be filtered
PCY-algorithm took 0.9172368049621582 seconds
2981 candidates with >10 occurances

683 initial itemsets to be filtered
PCY-algorithm took 1.05961275100708 seconds
605 candidates with >50 occurances

213 initial itemsets to be filtered
PCY-algorithm took 0.7713227272033691 seconds
207 candidates with >100 occurances



In [25]:
def pcy5(s):
    t = time.time()
    max_hash1 = 15000
    max_hash2 = 7000 
    max_hash3 = 3800
    max_hash4 = 1600
    max_hash5 = 1350
    H1 = np.zeros((max_hash1,), dtype=np.int)
    H2 = np.zeros((max_hash2,), dtype=np.int)
    H3 = np.zeros((max_hash3,), dtype=np.int)
    H4 = np.zeros((max_hash4,), dtype=np.int)
    H5 = np.zeros((max_hash5,), dtype=np.int)
    for key in readdata(k=2, report=False):
        hash_cell_1 = hash(key) % max_hash1
        H1[hash_cell_1] += 1
        hash_cell_2 = hash(key) % max_hash2
        H2[hash_cell_2] += 1
        hash_cell_3 = hash(key) % max_hash3
        H3[hash_cell_3] += 1
        hash_cell_4 = hash(key) % max_hash4
        H4[hash_cell_4] += 1
        hash_cell_5 = hash(key) % max_hash5
        H5[hash_cell_5] += 1
    
    C2 = {}
    for key in readdata(k=2, report=False):
        # hash-based filtering stage from PCY
        hash_cell_1 = hash(key) % max_hash1
        if H1[hash_cell_1] < s:
            continue
        hash_cell_2 = hash(key) % max_hash2
        if H2[hash_cell_2] < s:
            continue
        hash_cell_3 = hash(key) % max_hash3
        if H3[hash_cell_3] < s:
            continue    
        hash_cell_4 = hash(key) % max_hash4
        if H4[hash_cell_4] < s:
            continue 
        hash_cell_5 = hash(key) % max_hash5
        if H5[hash_cell_5] < s:
            continue 
        
        
        
        # filter out non-frequent tuples
#         if key not in C2_items:
#             continue

        # record frequent tuples
        if key not in C2:
            C2[key] = 1
        else:
            C2[key] += 1
    
            
    print("{} initial itemsets to be filtered".format(len(C2)))
    
    
    L2 = {}
    for key, count in C2.items():
        if count >= s:
            L2[key] = count
    t1 = time.time() - t
    print('PCY-algorithm took {} seconds'.format(t1))
    print('{} candidates with >{} occurances'.format(len(L2), s))
    print('')
    return L2

In [26]:
# PCY algorithm with k=2, 5 hash tables, and s=10,50,100
for i in [10,50,100]:
    pcy5(i)

3738 initial itemsets to be filtered
PCY-algorithm took 1.2049367427825928 seconds
2981 candidates with >10 occurances

677 initial itemsets to be filtered
PCY-algorithm took 1.7469983100891113 seconds
605 candidates with >50 occurances

211 initial itemsets to be filtered
PCY-algorithm took 1.3894636631011963 seconds
207 candidates with >100 occurances



### Comments: 
1. 5 Hash tables sizes were defined by the possible pairs 2^169 divide by 2, 4, 8, 16. (Ref: (Park, Jong, Chen, Ming-syan, Yu, Philip, An Effective Hash-Based Algorithm for Mining Association Rules, 1997/08/18)

2. PCY multistage can generate the smaller number of itemsets to be filtered, then can prevent the memory problems. 
For example: for s=50, k=2

1 hash table:  1173 initial itemsets to be filtered, took 0.4036295413970947 seconds

2 hash tables: 802 initial itemsets to be filtered, took 0.5262117385864258 seconds

3 hash tables: 692 initial itemsets to be filtered, took 0.7373056411743164 seconds

4 hash tables: 683 initial itemsets to be filtered, took 1.05961275100708 seconds

5 hash tables: 677 initial itemsets to be filtered, took 1.7469983100891113 seconds

However, it can be considered to stop at 3 hash tables, because the dimension reduction was not so much like between the case of 1 and 2 hash tables.

## 4. Find the final list of k-frequent items (k-tuples) for k=3,4 and 5. Experiment a bit and describe the best value for the support in each case. Warning: You can use any of the three algorithms, but be careful, because the algorithm can take too long if you don't chose it properly.

In [27]:
for i in [3,4,5]:
    for s in [10,20,30,40,50]:
        naive_method(i,s)

139424 initial itemsets to be filtered
Naive method took 0.8505136966705322 seconds
6831 candidates with >10 occurances

139424 initial itemsets to be filtered
Naive method took 0.5428187847137451 seconds
1991 candidates with >20 occurances

139424 initial itemsets to be filtered
Naive method took 0.6999096870422363 seconds
850 candidates with >30 occurances

139424 initial itemsets to be filtered
Naive method took 0.8715181350708008 seconds
432 candidates with >40 occurances

139424 initial itemsets to be filtered
Naive method took 0.9074804782867432 seconds
264 candidates with >50 occurances

780620 initial itemsets to be filtered
Naive method took 2.597119092941284 seconds
3137 candidates with >10 occurances

780620 initial itemsets to be filtered
Naive method took 2.606549024581909 seconds
395 candidates with >20 occurances

780620 initial itemsets to be filtered
Naive method took 2.430020570755005 seconds
98 candidates with >30 occurances

780620 initial itemsets to be filtered
Na

In [28]:
naive_method(3,10)

139424 initial itemsets to be filtered
Naive method took 0.5170619487762451 seconds
6831 candidates with >10 occurances



{frozenset({'coffee', 'tropical fruit', 'yogurt'}): 30,
 frozenset({'cream cheese', 'pip fruit', 'yogurt'}): 25,
 frozenset({'long life bakery product', 'other vegetables', 'whole milk'}): 56,
 frozenset({'butter', 'whole milk', 'yogurt'}): 92,
 frozenset({'butter', 'rice', 'whole milk'}): 15,
 frozenset({'rice', 'whole milk', 'yogurt'}): 18,
 frozenset({'butter', 'rice', 'yogurt'}): 10,
 frozenset({'UHT-milk', 'other vegetables', 'rolls/buns'}): 18,
 frozenset({'bottled beer', 'other vegetables', 'rolls/buns'}): 40,
 frozenset({'other vegetables', 'tropical fruit', 'white bread'}): 41,
 frozenset({'bottled water', 'other vegetables', 'tropical fruit'}): 61,
 frozenset({'chocolate', 'other vegetables', 'tropical fruit'}): 35,
 frozenset({'bottled water', 'tropical fruit', 'white bread'}): 11,
 frozenset({'bottled water', 'chocolate', 'tropical fruit'}): 13,
 frozenset({'bottled water', 'other vegetables', 'white bread'}): 16,
 frozenset({'chocolate', 'other vegetables', 'white bread'})

In [29]:
naive_method(4,10)

780620 initial itemsets to be filtered
Naive method took 1.8795795440673828 seconds
3137 candidates with >10 occurances



{frozenset({'butter', 'citrus fruit', 'tropical fruit', 'whole milk'}): 16,
 frozenset({'citrus fruit', 'curd', 'tropical fruit', 'whole milk'}): 16,
 frozenset({'citrus fruit', 'tropical fruit', 'whole milk', 'yogurt'}): 38,
 frozenset({'bottled water',
            'citrus fruit',
            'tropical fruit',
            'whole milk'}): 22,
 frozenset({'butter', 'citrus fruit', 'tropical fruit', 'yogurt'}): 11,
 frozenset({'citrus fruit', 'curd', 'tropical fruit', 'yogurt'}): 11,
 frozenset({'bottled water', 'citrus fruit', 'tropical fruit', 'yogurt'}): 16,
 frozenset({'butter', 'citrus fruit', 'curd', 'whole milk'}): 11,
 frozenset({'butter', 'citrus fruit', 'whole milk', 'yogurt'}): 20,
 frozenset({'bottled water', 'butter', 'citrus fruit', 'whole milk'}): 14,
 frozenset({'citrus fruit', 'curd', 'whole milk', 'yogurt'}): 21,
 frozenset({'bottled water', 'citrus fruit', 'whole milk', 'yogurt'}): 18,
 frozenset({'butter', 'curd', 'tropical fruit', 'whole milk'}): 14,
 frozenset({'but

In [30]:
naive_method(5,10)

2665499 initial itemsets to be filtered
Naive method took 7.913327217102051 seconds
376 candidates with >10 occurances



{frozenset({'citrus fruit',
            'curd',
            'tropical fruit',
            'whole milk',
            'yogurt'}): 10,
 frozenset({'butter', 'curd', 'tropical fruit', 'whole milk', 'yogurt'}): 12,
 frozenset({'domestic eggs',
            'root vegetables',
            'tropical fruit',
            'whole milk',
            'yogurt'}): 15,
 frozenset({'pastry',
            'root vegetables',
            'tropical fruit',
            'whole milk',
            'yogurt'}): 10,
 frozenset({'bottled water',
            'newspapers',
            'rolls/buns',
            'soda',
            'yogurt'}): 10,
 frozenset({'frozen vegetables',
            'other vegetables',
            'root vegetables',
            'whole milk',
            'yogurt'}): 15,
 frozenset({'fruit/vegetable juice',
            'other vegetables',
            'root vegetables',
            'whole milk',
            'yogurt'}): 20,
 frozenset({'frozen vegetables',
            'fruit/vegetable juice',
      

### Generate rules A -> B

In [31]:
C1 = {}
for key in readdata(k=1):
    if key not in C1:
        C1[key] = 1
    else:
        C1[key] += 1

processing bin  5000


In [32]:
C2 = {}
for key in readdata(k=2):
    if key not in C2:
        C2[key] = 1
    else:
        C2[key] += 1

processing bin  5000


In [33]:
print(len(C1),len(C2))

169 9636


In [34]:
s = 10 # support threshold
L2 = {}
for key, n in C2.items():
    if n >= s:
        L2[key] = n
print('{} items with >{} occurances'.format(len(L2), s))

2981 items with >10 occurances


In [35]:
L2 = [ elem for elem in list(L2) if len(elem)>1] # clean our list a bit. 


In [46]:
for i in range(len(L2)):

    A, B = list(L2[i])
    support_AB = C2[frozenset([A, B])]
    support_A = C1[frozenset([A])]
    conf_A_leads_to_B = support_AB / support_A
    
    
    support_B = C1[frozenset([B])]
    prob_B = support_B /9835 # len of baskets/transactions
    
    interest_A_leads_to_B = conf_A_leads_to_B - prob_B
    
    if interest_A_leads_to_B > 0.4:
        print("{} --> {} with interest {:3f}".format(A, B, interest_A_leads_to_B))
        print(conf_A_leads_to_B)

honey --> whole milk with interest 0.477817
0.7333333333333333
rubbing alcohol --> citrus fruit with interest 0.417234
0.5
rubbing alcohol --> butter with interest 0.444586
0.5
cream --> sausage with interest 0.444511
0.5384615384615384
frozen fruits --> other vegetables with interest 0.473174
0.6666666666666666
kitchen utensil --> whole milk with interest 0.494484
0.75


### Comments
1. With s=40, k=5, there are zero candidates found. The best value of support for k=5 can be 10 if we want to do further analysis, like association rules generation, or clustering assiociation rules. For example, we maintain the results around > 300 candidates. 
2. With k=4, the best value of support can be 20. 
3. With k=3, the best value of support can be 40.
4. We can found 3 top association rules: 
'kitchen utensil --> whole milk' with a confidence level 75%, but the interest is 50%.
'honey --> whole milk' wih a confidence level 73%, but the interest is 48%. 
'frozen fruits --> other vegetables' with a confidence level 66%, but the interest is 47%.
if we can accept the interest aroun 50%, we can think about the association rules: 'kitchen utensil, honey --> whole milk'


# 5. Using one of the results of the previous item, for one k (k=3,4 or 5) find the possible clusters using the 1-NN criteria. Comment your results.

## Clustering for frequent candidates of naive method and k=2

In [39]:
cluster1 = list(naive_method(2,100))
cluster1

9636 initial itemsets to be filtered
Naive method took 0.16608715057373047 seconds
207 candidates with >100 occurances



[frozenset({'tropical fruit', 'yogurt'}),
 frozenset({'pip fruit', 'yogurt'}),
 frozenset({'cream cheese', 'yogurt'}),
 frozenset({'other vegetables', 'whole milk'}),
 frozenset({'long life bakery product', 'other vegetables'}),
 frozenset({'long life bakery product', 'whole milk'}),
 frozenset({'butter', 'whole milk'}),
 frozenset({'whole milk', 'yogurt'}),
 frozenset({'butter', 'yogurt'}),
 frozenset({'other vegetables', 'rolls/buns'}),
 frozenset({'bottled beer', 'other vegetables'}),
 frozenset({'bottled beer', 'rolls/buns'}),
 frozenset({'other vegetables', 'tropical fruit'}),
 frozenset({'bottled water', 'tropical fruit'}),
 frozenset({'other vegetables', 'white bread'}),
 frozenset({'bottled water', 'other vegetables'}),
 frozenset({'chocolate', 'other vegetables'}),
 frozenset({'citrus fruit', 'tropical fruit'}),
 frozenset({'citrus fruit', 'whole milk'}),
 frozenset({'citrus fruit', 'yogurt'}),
 frozenset({'bottled water', 'citrus fruit'}),
 frozenset({'tropical fruit', 'whole

In [40]:
P1 = list(map(list, cluster1)) 
print(P1)

[['tropical fruit', 'yogurt'], ['yogurt', 'pip fruit'], ['cream cheese', 'yogurt'], ['whole milk', 'other vegetables'], ['long life bakery product', 'other vegetables'], ['whole milk', 'long life bakery product'], ['whole milk', 'butter'], ['whole milk', 'yogurt'], ['butter', 'yogurt'], ['rolls/buns', 'other vegetables'], ['bottled beer', 'other vegetables'], ['bottled beer', 'rolls/buns'], ['tropical fruit', 'other vegetables'], ['tropical fruit', 'bottled water'], ['white bread', 'other vegetables'], ['bottled water', 'other vegetables'], ['other vegetables', 'chocolate'], ['tropical fruit', 'citrus fruit'], ['whole milk', 'citrus fruit'], ['citrus fruit', 'yogurt'], ['citrus fruit', 'bottled water'], ['tropical fruit', 'whole milk'], ['tropical fruit', 'curd'], ['whole milk', 'curd'], ['whole milk', 'bottled water'], ['curd', 'yogurt'], ['bottled water', 'yogurt'], ['rolls/buns', 'frankfurter'], ['soda', 'frankfurter'], ['soda', 'rolls/buns'], ['newspapers', 'tropical fruit'], ['tro

In [41]:
result = []
if len(P1) > 1:
  tmp = [P1[0]]
  for i in range(1,len(P1)):
    if P1[i][0] == P1[i-1][1] or P1[i][1] == P1[i-1][0] or P1[i][1] == P1[i-1][1] or P1[i][0] == P1[i-1][0]:
      tmp.append(P1[i])
    else:
      result.append(tmp)
      tmp = [P1[i]]
  result.append(tmp)
else:
  result = P1

for elem in result:
  print(elem)

[['tropical fruit', 'yogurt'], ['yogurt', 'pip fruit'], ['cream cheese', 'yogurt']]
[['whole milk', 'other vegetables'], ['long life bakery product', 'other vegetables'], ['whole milk', 'long life bakery product'], ['whole milk', 'butter'], ['whole milk', 'yogurt'], ['butter', 'yogurt']]
[['rolls/buns', 'other vegetables'], ['bottled beer', 'other vegetables'], ['bottled beer', 'rolls/buns']]
[['tropical fruit', 'other vegetables'], ['tropical fruit', 'bottled water']]
[['white bread', 'other vegetables'], ['bottled water', 'other vegetables'], ['other vegetables', 'chocolate']]
[['tropical fruit', 'citrus fruit'], ['whole milk', 'citrus fruit'], ['citrus fruit', 'yogurt'], ['citrus fruit', 'bottled water']]
[['tropical fruit', 'whole milk'], ['tropical fruit', 'curd'], ['whole milk', 'curd'], ['whole milk', 'bottled water']]
[['curd', 'yogurt'], ['bottled water', 'yogurt']]
[['rolls/buns', 'frankfurter'], ['soda', 'frankfurter'], ['soda', 'rolls/buns']]
[['newspapers', 'tropical fruit

## Clustering for frequent candidates with naive method and k=3

In [42]:
cluster2 = list(naive_method(3,50))
cluster2

139424 initial itemsets to be filtered
Naive method took 0.4851841926574707 seconds
264 candidates with >50 occurances



[frozenset({'long life bakery product', 'other vegetables', 'whole milk'}),
 frozenset({'butter', 'whole milk', 'yogurt'}),
 frozenset({'bottled water', 'other vegetables', 'tropical fruit'}),
 frozenset({'citrus fruit', 'tropical fruit', 'whole milk'}),
 frozenset({'citrus fruit', 'tropical fruit', 'yogurt'}),
 frozenset({'butter', 'citrus fruit', 'whole milk'}),
 frozenset({'citrus fruit', 'whole milk', 'yogurt'}),
 frozenset({'bottled water', 'citrus fruit', 'whole milk'}),
 frozenset({'butter', 'tropical fruit', 'whole milk'}),
 frozenset({'curd', 'tropical fruit', 'whole milk'}),
 frozenset({'tropical fruit', 'whole milk', 'yogurt'}),
 frozenset({'bottled water', 'tropical fruit', 'whole milk'}),
 frozenset({'curd', 'tropical fruit', 'yogurt'}),
 frozenset({'bottled water', 'tropical fruit', 'yogurt'}),
 frozenset({'bottled water', 'butter', 'whole milk'}),
 frozenset({'curd', 'whole milk', 'yogurt'}),
 frozenset({'bottled water', 'whole milk', 'yogurt'}),
 frozenset({'other veget

In [43]:
P2 = list(map(list, cluster2)) 
print(P2)

[['whole milk', 'long life bakery product', 'other vegetables'], ['whole milk', 'butter', 'yogurt'], ['tropical fruit', 'bottled water', 'other vegetables'], ['whole milk', 'tropical fruit', 'citrus fruit'], ['tropical fruit', 'citrus fruit', 'yogurt'], ['whole milk', 'citrus fruit', 'butter'], ['whole milk', 'citrus fruit', 'yogurt'], ['whole milk', 'citrus fruit', 'bottled water'], ['tropical fruit', 'whole milk', 'butter'], ['tropical fruit', 'whole milk', 'curd'], ['tropical fruit', 'whole milk', 'yogurt'], ['tropical fruit', 'whole milk', 'bottled water'], ['tropical fruit', 'curd', 'yogurt'], ['tropical fruit', 'bottled water', 'yogurt'], ['whole milk', 'butter', 'bottled water'], ['whole milk', 'curd', 'yogurt'], ['whole milk', 'bottled water', 'yogurt'], ['tropical fruit', 'other vegetables', 'root vegetables'], ['tropical fruit', 'rolls/buns', 'root vegetables'], ['tropical fruit', 'rolls/buns', 'other vegetables'], ['rolls/buns', 'other vegetables', 'root vegetables'], ['soda

In [44]:
result2 = []
if len(P2) > 1:
  tmp = [P2[0]]
  for i in range(1,len(P2)):
    if P2[i][0] == P2[i-1][0] or P2[i][1] == P2[i-1][1] or P2[i][2] == P2[i-1][2] or P2[i][0] == P2[i-1][1] or P2[i][0] == P2[i-1][2] or P2[i][1] == P2[i-1][0] or P2[i][1] == P2[i-1][2] or P2[i][2] == P2[i-1][0] or P2[i][2] == P2[i-1][1]:
      tmp.append(P2[i])
    else:
      result2.append(tmp)
      tmp = [P2[i]]
  result2.append(tmp)
else:
  result2 = P2

cluster = []
for elem in result2:
  print(elem)
  cluster.append(elem)
    


[['whole milk', 'long life bakery product', 'other vegetables'], ['whole milk', 'butter', 'yogurt']]
[['tropical fruit', 'bottled water', 'other vegetables'], ['whole milk', 'tropical fruit', 'citrus fruit'], ['tropical fruit', 'citrus fruit', 'yogurt'], ['whole milk', 'citrus fruit', 'butter'], ['whole milk', 'citrus fruit', 'yogurt'], ['whole milk', 'citrus fruit', 'bottled water'], ['tropical fruit', 'whole milk', 'butter'], ['tropical fruit', 'whole milk', 'curd'], ['tropical fruit', 'whole milk', 'yogurt'], ['tropical fruit', 'whole milk', 'bottled water'], ['tropical fruit', 'curd', 'yogurt'], ['tropical fruit', 'bottled water', 'yogurt'], ['whole milk', 'butter', 'bottled water'], ['whole milk', 'curd', 'yogurt'], ['whole milk', 'bottled water', 'yogurt']]
[['tropical fruit', 'other vegetables', 'root vegetables'], ['tropical fruit', 'rolls/buns', 'root vegetables'], ['tropical fruit', 'rolls/buns', 'other vegetables'], ['rolls/buns', 'other vegetables', 'root vegetables'], ['so

In [45]:
print(len(cluster))

35


### Comments: 
1. Using frequent itemsets can help to clustering the large number of documents or transactions. It also can be helpful with clustered association rules in reducing the large number of association rules. 
2. In our case, k=3 and Näive method (s=50), we found 35 clusters based on 1-nn. 