#Frequent Pattern Mining
1. Analytical problem: Data Mining: Concepts and Techniques, 3rd ed., Exercise 6.3 (Ch. 6)

2. Dataset Repository: Frequent Itemset Mining Dataset Repository

a. Experiment on each of the following:

i. Retail

3. Implementation of Frequent Pattern Mining Algorithms

a. Part - 1: Exploratory Data Analysis

i. Print summary of each dataset

1. Total Number of Transactions

2. Avg. Transaction Length

b. Part - 2: Apriori Algorithm

i. Implement in Python3 / Java

ii. Pseudocode: Data Mining: Concepts and Techniques, 3rd ed. Jiawei Han, Micheline Kamber, and Jian Pei (Apriori - 6.2.1, FP-growth - 6.2.4) [See Next Page]

iii. Take input of min_sup and dataset_name from command line argument

iv. Print the L-1 to L-k itemsets; this output should also be written to a result_{dataset_name}.txt file

v. Print Total Elapsed Time (in seconds)

4. Code Structure

a. Data loader method: takes the name of the dataset as input and returns the data in a format suitable for applying Apriori

b. Main algorithm and supporting methods: According to pseudocode

In [None]:
# @title Libraries
import numpy as np
import pandas as pd
import itertools
import time
import urllib.request

In [None]:
# @title Download dataset
# Fetch the Retail dataset file itself. The bare directory URL
# ('http://fimi.uantwerpen.be/data/') would store the repository's index page
# under the name 'retail.dat' instead of the transactions.
urllib.request.urlretrieve('http://fimi.uantwerpen.be/data/retail.dat', 'retail.dat')
!ls

retail.dat  sample_data


In [None]:
# @title Reading the Dataset
# Load the raw file into a DataFrame for a quick visual inspection.
# NOTE(review): the download cell saves 'retail.dat', while this reads
# '/content/retail.data' — confirm which file is intended.
# NOTE(review): read_csv is called with its default header handling, so the
# first line of the file is used as the column names — confirm the file
# really starts with a header row and not with a transaction.
data = pd.read_csv('/content/retail.data')

# Bare expression: displays the DataFrame when it is the last statement of a
# notebook cell; it has no effect when run as a plain script.
data

Unnamed: 0,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
0,30 31 32
1,33 34 35
2,36 37 38 39 40 41 42 43 44 45 46
3,38 39 47 48
4,38 39 48 49 50 51 52 53 54 55 56 57 58
...,...
88156,39 875 2665 2962 12959 14070 14406 15518 16379
88157,39 41 101 346 393 413 479 522 586 635 695 799 ...
88158,2310 4267
88159,39 48 2528


In [None]:
# @title Function to Read the Dataset
# Ask for the dataset path and the minimum support count (parsed as an integer
# and later compared against raw occurrence counts).
print("Enter the filename:")
filename = input()
print("Enter the minimum support count:")
min_support = int(input())

# Load every line of the file, stripping surrounding whitespace
with open(filename) as f:
    content = [line.strip() for line in f.readlines()]

# One transaction per line: whitespace-separated item tokens, kept as strings
Transaction = [line.split() for line in content]

# Filled in later: maps each frequent itemset (as a tuple) to its support count
Frequent_items_value = {}

Enter the filename:
/content/retail1.data
Enter the minimum support count:
2


In [None]:
# @title Function to get Frequent One-Itemset
def frequent_one_item(Transaction, min_support):
    """Return the frequent 1-itemsets of the given transactions.

    Counts how many times each item occurs across all transactions and keeps
    the items whose count is at least ``min_support``.

    Args:
        Transaction: iterable of transactions, each an iterable of hashable
            items.
        min_support: minimum occurrence count for an item to be kept.

    Returns:
        A list of single-element lists, one per frequent item, in order of the
        item's first appearance in the transactions.

    Side effects:
        Stores ``(item,) -> count`` for every frequent item in the
        module-level ``Frequent_items_value`` dictionary.
    """
    occurrences = {}
    for basket in Transaction:
        for item in basket:
            occurrences[item] = occurrences.get(item, 0) + 1

    frequent = []
    for item, count in occurrences.items():
        if count < min_support:
            continue
        frequent.append([item])
        Frequent_items_value[(item,)] = count

    return frequent

# Compute the frequent 1-itemsets of the loaded transactions; the call also
# records each one's support count in Frequent_items_value.
values = frequent_one_item(Transaction, min_support)
print(values)
print(Frequent_items_value)

[['3'], ['32'], ['36'], ['38'], ['39'], ['41'], ['47'], ['48'], ['56'], ['65'], ['66'], ['78'], ['79'], ['89'], ['140'], ['146'], ['179'], ['186'], ['193'], ['225'], ['230'], ['237'], ['242'], ['256']]
{('3',): 2, ('32',): 5, ('36',): 6, ('38',): 12, ('39',): 32, ('41',): 11, ('47',): 2, ('48',): 23, ('56',): 2, ('65',): 3, ('66',): 2, ('78',): 2, ('79',): 3, ('89',): 2, ('140',): 2, ('146',): 2, ('179',): 2, ('186',): 2, ('193',): 2, ('225',): 2, ('230',): 2, ('237',): 2, ('242',): 3, ('256',): 2}


In [None]:
# @title Generate Subsets of Itemsets of Size K
def generate_k_subsets(dataset, length):
    """Return every ``length``-item combination of each itemset in ``dataset``.

    Combinations are produced itemset by itemset, in the order
    ``itertools.combinations`` yields them, and each one is returned as a
    list. Identical combinations coming from different itemsets are all kept.

    Args:
        dataset: iterable of itemsets (each an iterable of items).
        length: number of items per generated subset.

    Returns:
        A flat list of lists, one per generated combination.
    """
    return [
        list(combo)
        for itemset in dataset
        for combo in itertools.combinations(itemset, length)
    ]

In [None]:
# @title Apriori Generate Function to Generate Ck
def apriori_generate(dataset, k):
    """Generate the candidate k-itemsets from the frequent (k-1)-itemsets.

    Join step: two itemsets that agree on their first ``k - 2`` items are
    merged into one sorted candidate. Prune step: a candidate survives only
    if every one of its (k-1)-item subsets is itself present in ``dataset``.

    The input is normalised first (each itemset sorted, duplicates dropped,
    first-occurrence order kept). Without this, a repeated itemset would be
    joined with itself and yield a "candidate" of only k-1 items, and the
    same candidate could be emitted several times. For input that is already
    sorted and duplicate-free the result is unchanged.

    Args:
        dataset: list of frequent (k-1)-itemsets, each an iterable of
            mutually comparable, hashable items.
        k: size of the candidates to generate.

    Returns:
        A tuple ``(ck, final_ck)``: the joined candidates before pruning and
        the candidates that survive pruning, each as a list of sorted lists.
    """
    # Normalise: sorted tuples, de-duplicated while preserving order.
    itemsets = list(dict.fromkeys(tuple(sorted(itemset)) for itemset in dataset))

    # Join step
    ck = []
    emitted = set()
    for i, first in enumerate(itemsets):
        prefix = first[:k - 2]
        for second in itemsets[i + 1:]:
            if second[:k - 2] != prefix:
                continue
            candidate = tuple(sorted(set(first) | set(second)))
            # Keep only genuine, not-yet-seen k-itemsets.
            if len(candidate) != k or candidate in emitted:
                continue
            emitted.add(candidate)
            ck.append(list(candidate))

    # Prune step: set lookup instead of scanning the whole list per subset.
    known = set(itemsets)
    final_ck = [
        candidate
        for candidate in ck
        if all(subset in known
               for subset in itertools.combinations(candidate, k - 1))
    ]

    return ck, final_ck

In [None]:
# @title Function to Generate Lk
def generateL(ck, min_support):
    """Return the candidates from ``ck`` that are frequent in ``Transaction``.

    A candidate's support is the number of transactions in the module-level
    ``Transaction`` list that contain all of its items.

    Args:
        ck: iterable of candidate itemsets (each an iterable of hashable
            items).
        min_support: minimum support count for a candidate to be kept.

    Returns:
        A list of frequent itemsets, each as a sorted list, ordered by the
        first transaction in which the candidate was found.

    Side effects:
        Stores ``tuple(candidate) -> support`` for every frequent candidate
        in the module-level ``Frequent_items_value`` dictionary.
    """
    tally = {}
    for basket in Transaction:
        for candidate in ck:
            if set(basket) >= set(candidate):
                key = tuple(candidate)
                tally[key] = tally.get(key, 0) + 1

    frequent_item = []
    for key, support in tally.items():
        if support < min_support:
            continue
        frequent_item.append(sorted(key))
        Frequent_items_value[key] = support

    return frequent_item

In [None]:
# @title Main Apriori Algorithm Function
def apriori(L1, min_support):
    """Run the level-wise Apriori loop starting from the frequent 1-itemsets.

    For k = 2, 3, ... the candidates C_k are generated from L_(k-1) with
    ``apriori_generate`` and filtered by support with ``generateL``; the loop
    stops once a level yields no frequent itemsets. Each C_k and L_k is
    printed as it is produced.

    Every level, including k = 2, goes through ``generateL``. The earlier
    special case for k = 2 appended the frequent pairs onto the full list of
    candidate pairs, so every candidate pair was reported as frequent (with
    duplicates), and pair supports were never recorded in
    ``Frequent_items_value``.

    Args:
        L1: list of frequent 1-itemsets (each a single-element list).
        min_support: minimum support count, forwarded to ``generateL``.

    Returns:
        A tuple ``(L, elapsed)`` where ``L[k]`` is the list of frequent
        k-itemsets (``L[0]`` is the placeholder ``0`` so indices line up with
        k, and the last entry is the empty level that ended the loop) and
        ``elapsed`` is the wall-clock time of the loop in seconds.

    Side effects:
        Through ``generateL``, reads the module-level ``Transaction`` list
        and records supports in ``Frequent_items_value``.
    """
    L = [0, L1]  # index 0 is a placeholder so that L[k] holds the k-itemsets
    k = 2

    start = time.time()
    while L[k - 1]:
        # Only the pruned candidates are needed; the raw join result is unused.
        _, final_ck = apriori_generate(L[k - 1], k)
        print("C%d" % k)
        print(final_ck)

        frequent_k = generateL(final_ck, min_support)
        print("Frequent %d item" % k)
        print(frequent_k)

        L.append(frequent_k)
        k += 1
    end = time.time()

    return L, (end - start)

In [None]:
# @title Output File
# Run Apriori starting from the frequent 1-itemsets and report the results.
# time_taken is the elapsed wall-clock time in seconds returned by apriori().
# NOTE(review): the cell is titled "Output File", but nothing here writes a
# file — the results are only printed.
L_value, time_taken = apriori(values, min_support)
print("Time Taken is:")
print(time_taken)
print("All frequent itemsets with their support count:")
print(Frequent_items_value)

C2
[['3', '32'], ['3', '36'], ['3', '38'], ['3', '39'], ['3', '41'], ['3', '47'], ['3', '48'], ['3', '56'], ['3', '65'], ['3', '66'], ['3', '78'], ['3', '79'], ['3', '89'], ['140', '3'], ['146', '3'], ['179', '3'], ['186', '3'], ['193', '3'], ['225', '3'], ['230', '3'], ['237', '3'], ['242', '3'], ['256', '3'], ['32', '36'], ['32', '38'], ['32', '39'], ['32', '41'], ['32', '47'], ['32', '48'], ['32', '56'], ['32', '65'], ['32', '66'], ['32', '78'], ['32', '79'], ['32', '89'], ['140', '32'], ['146', '32'], ['179', '32'], ['186', '32'], ['193', '32'], ['225', '32'], ['230', '32'], ['237', '32'], ['242', '32'], ['256', '32'], ['36', '38'], ['36', '39'], ['36', '41'], ['36', '47'], ['36', '48'], ['36', '56'], ['36', '65'], ['36', '66'], ['36', '78'], ['36', '79'], ['36', '89'], ['140', '36'], ['146', '36'], ['179', '36'], ['186', '36'], ['193', '36'], ['225', '36'], ['230', '36'], ['237', '36'], ['242', '36'], ['256', '36'], ['38', '39'], ['38', '41'], ['38', '47'], ['38', '48'], ['38', '5