# Olimov Bekhzod (올리모브벡조드)
## Kyungpook National University

### A toy dataset

In [1]:
import pandas as pd
import numpy as np

items = [["I1", "I2", "I3", "I5"], ["I1", "I3"], ["I2", "I7", "I8"], ["I3", "I4", "I9", "I10"], ["I2", "I3", "I8", "I10"],
        ["I8", "I9", "I10"], ["I2", "I3", "I5", "I6", "I8"], ["I1", "I2", "I4", "I6"], ["I2", "I3", "I6", "I7"], ["I1", "I3"]]
df = pd.DataFrame(items)
items

[['I1', 'I2', 'I3', 'I5'],
 ['I1', 'I3'],
 ['I2', 'I7', 'I8'],
 ['I3', 'I4', 'I9', 'I10'],
 ['I2', 'I3', 'I8', 'I10'],
 ['I8', 'I9', 'I10'],
 ['I2', 'I3', 'I5', 'I6', 'I8'],
 ['I1', 'I2', 'I4', 'I6'],
 ['I2', 'I3', 'I6', 'I7'],
 ['I1', 'I3']]

In [2]:
items = [[1,2,3,5], [1,3], [2,7,8], [3,4,9,10], [2,3,8,10], [8,9,10], [2,3,5,6,8],
        [1,2,4,6], [2,3,6,7], [1,3]]
dic = {'Items': items}
df = pd.DataFrame.from_dict(dic)
numbers = np.arange(1, 11, 1).astype('int').reshape(-1, 1)
df_numbers = pd.DataFrame(numbers)
pd.concat([df_numbers, df], axis=1)

Unnamed: 0,0,Items
0,1,"[1, 2, 3, 5]"
1,2,"[1, 3]"
2,3,"[2, 7, 8]"
3,4,"[3, 4, 9, 10]"
4,5,"[2, 3, 8, 10]"
5,6,"[8, 9, 10]"
6,7,"[2, 3, 5, 6, 8]"
7,8,"[1, 2, 4, 6]"
8,9,"[2, 3, 6, 7]"
9,10,"[1, 3]"


In [3]:
items = []
for i in range(len(df)): 
    for j in range(len(df['Items'][i])): 
        items.append(df['Items'][i][j])
items

[1,
 2,
 3,
 5,
 1,
 3,
 2,
 7,
 8,
 3,
 4,
 9,
 10,
 2,
 3,
 8,
 10,
 8,
 9,
 10,
 2,
 3,
 5,
 6,
 8,
 1,
 2,
 4,
 6,
 2,
 3,
 6,
 7,
 1,
 3]

<h2>Creating First Candidate (C1)</h2>

In [4]:
unique_item = set(items)
list_unique_item = list(unique_item)
count_unique = []
for value in (list_unique_item):
    count_unique.append((value, items.count(value)))
count_unique

[(1, 4),
 (2, 6),
 (3, 7),
 (4, 2),
 (5, 2),
 (6, 3),
 (7, 2),
 (8, 4),
 (9, 2),
 (10, 3)]

In [5]:
c1 = pd.DataFrame(count_unique, columns=["Itemsets", "Support"])
c1

Unnamed: 0,Itemsets,Support
0,1,4
1,2,6
2,3,7
3,4,2
4,5,2
5,6,3
6,7,2
7,8,4
8,9,2
9,10,3


<h2>Creating first Frequent Itemset (L1)</h2>

In [6]:
def filtering(c):
    threshold = 2
    filtering = c['Support'] >= threshold
    freq = c[filtering]
    return freq

In [7]:
l1 = filtering(c1)
l1

Unnamed: 0,Itemsets,Support
0,1,4
1,2,6
2,3,7
3,4,2
4,5,2
5,6,3
6,7,2
7,8,4
8,9,2
9,10,3


## Create the Second Candidate (C2)

In [8]:
def join(prev_c):
    join_c = []
    for i in range(len(prev_c['Itemsets'])):
        for j in range((i+1), len(prev_c['Itemsets'])):
            itemset_i = prev_c['Itemsets'][i]
            itemset_j = prev_c['Itemsets'][j]
            if(type(itemset_i) == np.int64 and type(itemset_j) == np.int64):
                itemset_i = {itemset_i}
                itemset_j = {itemset_j}
            union_c = itemset_i.union(itemset_j)
            if union_c not in join_c:
                join_c.append(union_c)
    return join_c

In [9]:
two_item_sets = join(c1)
two_item_sets

[{1, 2},
 {1, 3},
 {1, 4},
 {1, 5},
 {1, 6},
 {1, 7},
 {1, 8},
 {1, 9},
 {1, 10},
 {2, 3},
 {2, 4},
 {2, 5},
 {2, 6},
 {2, 7},
 {2, 8},
 {2, 9},
 {2, 10},
 {3, 4},
 {3, 5},
 {3, 6},
 {3, 7},
 {3, 8},
 {3, 9},
 {3, 10},
 {4, 5},
 {4, 6},
 {4, 7},
 {4, 8},
 {4, 9},
 {4, 10},
 {5, 6},
 {5, 7},
 {5, 8},
 {5, 9},
 {5, 10},
 {6, 7},
 {6, 8},
 {6, 9},
 {6, 10},
 {7, 8},
 {7, 9},
 {7, 10},
 {8, 9},
 {8, 10},
 {9, 10}]

In [10]:
count_c2 = []
for i in range(len(two_item_sets)):
    count_c2.append((two_item_sets[i], 0))
count_c2

[({1, 2}, 0),
 ({1, 3}, 0),
 ({1, 4}, 0),
 ({1, 5}, 0),
 ({1, 6}, 0),
 ({1, 7}, 0),
 ({1, 8}, 0),
 ({1, 9}, 0),
 ({1, 10}, 0),
 ({2, 3}, 0),
 ({2, 4}, 0),
 ({2, 5}, 0),
 ({2, 6}, 0),
 ({2, 7}, 0),
 ({2, 8}, 0),
 ({2, 9}, 0),
 ({2, 10}, 0),
 ({3, 4}, 0),
 ({3, 5}, 0),
 ({3, 6}, 0),
 ({3, 7}, 0),
 ({3, 8}, 0),
 ({3, 9}, 0),
 ({3, 10}, 0),
 ({4, 5}, 0),
 ({4, 6}, 0),
 ({4, 7}, 0),
 ({4, 8}, 0),
 ({4, 9}, 0),
 ({4, 10}, 0),
 ({5, 6}, 0),
 ({5, 7}, 0),
 ({5, 8}, 0),
 ({5, 9}, 0),
 ({5, 10}, 0),
 ({6, 7}, 0),
 ({6, 8}, 0),
 ({6, 9}, 0),
 ({6, 10}, 0),
 ({7, 8}, 0),
 ({7, 9}, 0),
 ({7, 10}, 0),
 ({8, 9}, 0),
 ({8, 10}, 0),
 ({9, 10}, 0)]

In [11]:
init_df_c = pd.DataFrame(count_c2, columns=['Itemsets', 'Support'])

In [12]:
def count_support(df, prev_c_list):
    init_df_c['Support'] = 0 
    count_prev_c = []

    for i in range(len(prev_c_list)):
        count_prev_c.append((prev_c_list[i], 0))
    
    df_c = pd.DataFrame(count_prev_c, columns=['Itemsets', 'Support'])
        
    for i in range(len(df)):
        for j in range(len(count_prev_c)):
            if (df_c['Itemsets'][j]).issubset(set(df['Items'][i])): 
                df_c.loc[j, 'Support'] += 1
            
    return df_c

In [14]:
c2 = count_support(df, two_item_sets)
c2

Unnamed: 0,Itemsets,Support
0,"{1, 2}",2
1,"{1, 3}",3
2,"{1, 4}",1
3,"{1, 5}",1
4,"{1, 6}",1
5,"{1, 7}",0
6,"{8, 1}",0
7,"{1, 9}",0
8,"{1, 10}",0
9,"{2, 3}",4


# Creating Second Frequent Itemset (L2)

In [15]:
l2 = filtering(c2)
l2 = l2.reset_index(drop=True)
l2

Unnamed: 0,Itemsets,Support
0,"{1, 2}",2
1,"{1, 3}",3
2,"{2, 3}",4
3,"{2, 5}",2
4,"{2, 6}",3
5,"{2, 7}",2
6,"{8, 2}",3
7,"{3, 5}",2
8,"{3, 6}",2
9,"{8, 3}",2


## Creating the Third Candidate (C3) 

In [16]:
join_l2 = join(l2)
print(join_l2)

[{1, 2, 3}, {1, 2, 5}, {1, 2, 6}, {1, 2, 7}, {8, 1, 2}, {1, 2, 3, 5}, {1, 2, 3, 6}, {8, 1, 2, 3}, {3, 1, 2, 10}, {8, 1, 2, 10}, {1, 2, 10, 9}, {1, 2, 3, 7}, {1, 3, 5}, {1, 3, 6}, {8, 1, 3}, {1, 10, 3}, {8, 1, 10, 3}, {1, 10, 3, 9}, {2, 3, 5}, {2, 3, 6}, {2, 3, 7}, {8, 2, 3}, {10, 2, 3}, {8, 10, 2, 3}, {10, 9, 2, 3}, {2, 5, 6}, {2, 5, 7}, {8, 2, 5}, {2, 3, 5, 6}, {8, 2, 3, 5}, {3, 2, 10, 5}, {8, 2, 10, 5}, {9, 2, 10, 5}, {2, 6, 7}, {8, 2, 6}, {8, 2, 3, 6}, {3, 2, 10, 6}, {8, 2, 10, 6}, {9, 2, 10, 6}, {8, 2, 7}, {2, 3, 5, 7}, {2, 3, 6, 7}, {8, 2, 3, 7}, {3, 2, 10, 7}, {8, 2, 10, 7}, {9, 2, 10, 7}, {8, 2, 10}, {8, 9, 2, 10}, {3, 5, 6}, {8, 3, 5}, {10, 3, 5}, {8, 10, 3, 5}, {9, 10, 3, 5}, {8, 3, 6}, {10, 3, 6}, {8, 10, 3, 6}, {9, 10, 3, 6}, {8, 10, 3}, {8, 9, 10, 3}, {9, 10, 3}, {8, 9, 10}]


## Pruning

In [18]:
def subsets(c):
    
    temp = []
    final = []
    for i in range(len(c)):
        for j in range(len(c)):
            if i != j:
                temp.append(c[j])
        temp_set = set(temp)
        final.append(temp_set)
        temp.clear()
    print('Subset from {} : {}'.format(c, final))
    return final

def pruning(c_set, prev_c):
    temp = []
    
    for idx, value in enumerate(c_set):
        list_candidate = list(value)
        temp_candidate = (subsets(list_candidate))
        
        for temp_item in temp_candidate:
            check = temp_item == prev_c['Itemsets']
            
            if any(check) == False:
                print(any(check))
                print('Val', value)
            else:
                print('\nAll of {} subset contained in \n{}'.format(c_set, prev_c))
                if value not in temp:
                    temp.append(value)
                
    return temp

In [19]:
subset = [{2, 3}, {1, 3}, {1, 2}]

In [20]:
for i in range(len(join_l2)):
    subsets(list(join_l2[i]))

Subset from [1, 2, 3] : [{2, 3}, {1, 3}, {1, 2}]
Subset from [1, 2, 5] : [{2, 5}, {1, 5}, {1, 2}]
Subset from [1, 2, 6] : [{2, 6}, {1, 6}, {1, 2}]
Subset from [1, 2, 7] : [{2, 7}, {1, 7}, {1, 2}]
Subset from [8, 1, 2] : [{1, 2}, {8, 2}, {8, 1}]
Subset from [1, 2, 3, 5] : [{2, 3, 5}, {1, 3, 5}, {1, 2, 5}, {1, 2, 3}]
Subset from [1, 2, 3, 6] : [{2, 3, 6}, {1, 3, 6}, {1, 2, 6}, {1, 2, 3}]
Subset from [8, 1, 2, 3] : [{1, 2, 3}, {8, 2, 3}, {8, 1, 3}, {8, 1, 2}]
Subset from [3, 1, 2, 10] : [{1, 2, 10}, {10, 2, 3}, {1, 10, 3}, {1, 2, 3}]
Subset from [8, 1, 2, 10] : [{1, 2, 10}, {8, 2, 10}, {8, 1, 10}, {8, 1, 2}]
Subset from [1, 2, 10, 9] : [{9, 2, 10}, {1, 10, 9}, {1, 2, 9}, {1, 2, 10}]
Subset from [1, 2, 3, 7] : [{2, 3, 7}, {1, 3, 7}, {1, 2, 7}, {1, 2, 3}]
Subset from [1, 3, 5] : [{3, 5}, {1, 5}, {1, 3}]
Subset from [1, 3, 6] : [{3, 6}, {1, 6}, {1, 3}]
Subset from [8, 1, 3] : [{1, 3}, {8, 3}, {8, 1}]
Subset from [1, 10, 3] : [{10, 3}, {1, 3}, {1, 10}]
Subset from [8, 1, 10, 3] : [{1, 10, 3},

In [22]:
for item in subset:
    print(item)
    check = item == l2['Itemsets']
    print('Check', any(check))

{2, 3}
Check True
{1, 3}
Check True
{1, 2}
Check True


In [23]:
c3 = pruning(join_l2, l2)

Subset from [1, 2, 3] : [{2, 3}, {1, 3}, {1, 2}]

All of [{1, 2, 3}, {1, 2, 5}, {1, 2, 6}, {1, 2, 7}, {8, 1, 2}, {1, 2, 3, 5}, {1, 2, 3, 6}, {8, 1, 2, 3}, {3, 1, 2, 10}, {8, 1, 2, 10}, {1, 2, 10, 9}, {1, 2, 3, 7}, {1, 3, 5}, {1, 3, 6}, {8, 1, 3}, {1, 10, 3}, {8, 1, 10, 3}, {1, 10, 3, 9}, {2, 3, 5}, {2, 3, 6}, {2, 3, 7}, {8, 2, 3}, {10, 2, 3}, {8, 10, 2, 3}, {10, 9, 2, 3}, {2, 5, 6}, {2, 5, 7}, {8, 2, 5}, {2, 3, 5, 6}, {8, 2, 3, 5}, {3, 2, 10, 5}, {8, 2, 10, 5}, {9, 2, 10, 5}, {2, 6, 7}, {8, 2, 6}, {8, 2, 3, 6}, {3, 2, 10, 6}, {8, 2, 10, 6}, {9, 2, 10, 6}, {8, 2, 7}, {2, 3, 5, 7}, {2, 3, 6, 7}, {8, 2, 3, 7}, {3, 2, 10, 7}, {8, 2, 10, 7}, {9, 2, 10, 7}, {8, 2, 10}, {8, 9, 2, 10}, {3, 5, 6}, {8, 3, 5}, {10, 3, 5}, {8, 10, 3, 5}, {9, 10, 3, 5}, {8, 3, 6}, {10, 3, 6}, {8, 10, 3, 6}, {9, 10, 3, 6}, {8, 10, 3}, {8, 9, 10, 3}, {9, 10, 3}, {8, 9, 10}] subset contained in 
   Itemsets  Support
0    {1, 2}        2
1    {1, 3}        3
2    {2, 3}        4
3    {2, 5}        2
4    {2, 6}        

<h2>Creating the Third Frequent Itemset (L3)</h2>

In [25]:
c3_df = count_support(df, c3)
c3_df

Unnamed: 0,Itemsets,Support
0,"{1, 2, 3}",1
1,"{1, 2, 5}",1
2,"{1, 2, 6}",1
3,"{1, 2, 7}",0
4,"{8, 1, 2}",0
5,"{1, 3, 5}",1
6,"{1, 3, 6}",0
7,"{8, 1, 3}",0
8,"{1, 10, 3}",0
9,"{2, 3, 5}",2


In [28]:
def filtering(c):
    threshold = 2
    filtering = c['Support'] >= threshold
    freq = c[filtering]
    return freq

In [29]:
l3 = filtering(c3_df)
l3

Unnamed: 0,Itemsets,Support
9,"{2, 3, 5}",2
10,"{2, 3, 6}",2
12,"{8, 2, 3}",2


## All Frequent Itemsets

In [30]:
print(f"l1: {l1}\n")
print(f"l2: {l2}\n")
print(f"l3: {l3}\n")

l1:    Itemsets  Support
0         1        4
1         2        6
2         3        7
3         4        2
4         5        2
5         6        3
6         7        2
7         8        4
8         9        2
9        10        3

l2:    Itemsets  Support
0    {1, 2}        2
1    {1, 3}        3
2    {2, 3}        4
3    {2, 5}        2
4    {2, 6}        3
5    {2, 7}        2
6    {8, 2}        3
7    {3, 5}        2
8    {3, 6}        2
9    {8, 3}        2
10  {10, 3}        2
11  {8, 10}        2
12  {9, 10}        2

l3:      Itemsets  Support
9   {2, 3, 5}        2
10  {2, 3, 6}        2
12  {8, 2, 3}        2

