# Q5 Frequent Itemset Mining

You are required to write programs to mine the frequent itemsets (`min sup = 100`) in the provided data. Based on the frequent itemsets you mined, please write program to mine the closed frequent itemsets and maximal frequent itemsets.

## Reference

https://fp-growth.readthedocs.io/en/latest/readme.html#getting-started

https://github.com/evandempsey/fp-growth

In [57]:
import pyfpgrowth
import time
import re
import pandas as pd

## Task 1: Mine the frequent itemsets (min sup = 100)

In [7]:
transactions = [] # origin data
with open("freq_items_dataset.txt","r") as f:
    for line in f.readlines():
        transactions.append(line.strip().split())

In [14]:
t0 = time.time()
patterns = pyfpgrowth.find_frequent_patterns(transactions, 100)
t1 = time.time()

In [15]:
print("Task 1 running time = %s" % (t1-t0))

Task 1 running time = 179.21781706809998


In [69]:
# Transfer the form of string according to the output requirement
def transfer_to_output(x):
    return x.replace('(', '').replace(')', '').replace(',', ' ').replace("'", "")

In [70]:
# Generate the task1_output.csv
freq_items = [i for i in patterns.keys()]
frequency = [i for i in patterns.values()]
res = [i for i in zip(freq_items, frequency)]

df_res = pd.DataFrame(res, columns=['freq_items', 'frequency'])
df_res['freq_items'] = df_res['freq_items'].apply(lambda x: transfer_to_output(str(x)))
df_res.to_csv("task1_output.csv", index=False)

In [82]:
print("The number of frequent itemsets = %s" % len(patterns))

The number of frequent itemsets = 22438


## Task 2: Mine the closed frequent itemsets

In [37]:
closed_freq = []
t2 = time.time()
total = len(patterns)
i = 0
for item, sup in patterns.items():
    if i%1000 == 0:
        print("processing: %s/%s" % (i, total))
    flag = 1
    for k, v in patterns.items():
        if len(k) > len(item) and set(item).issubset(k) and sup == v:
            flag = 0
            break
    if flag == 1:
        closed_freq.append(item)
    i += 1
t3 = time.time()
print("Task 2 running time = %s" % (t3-t2))

processing: 0/22438
processing: 1000/22438
processing: 2000/22438
processing: 3000/22438
processing: 4000/22438
processing: 5000/22438
processing: 6000/22438
processing: 7000/22438
processing: 8000/22438
processing: 9000/22438
processing: 10000/22438
processing: 11000/22438
processing: 12000/22438
processing: 13000/22438
processing: 14000/22438
processing: 15000/22438
processing: 16000/22438
processing: 17000/22438
processing: 18000/22438
processing: 19000/22438
processing: 20000/22438
processing: 21000/22438
processing: 22000/22438
Task 2 running time = 259.6320242881775


In [42]:
print("The number of closed frequent itemsets = %s" % len(closed_freq))

The number of closed frequent itemsets = 21797


In [78]:
# Generate the task2_output.csv
res = [i for i in zip(closed_freq)]
df_res = pd.DataFrame(res, columns=['closed_items'])
df_res['closed_items'] = df_res['closed_items'].apply(lambda x: transfer_to_output(str(x)))
df_res.to_csv("task2_output.csv", index=False)

## Task 3: Mine the maximal frequent itemsets.

In [40]:
max_freq = []
t4 = time.time()
total = len(patterns)
i = 0
for item, sup in patterns.items():
    if i%1000 == 0:
        print("processing: %s/%s" % (i, total))
    flag = 1
    for k, v in patterns.items():
        if len(k) > len(item) and set(item).issubset(k):
            flag = 0
            break
    if flag == 1:
        max_freq.append(item)
    i += 1
t5 = time.time()
print("Task 3 running time = %s" % (t5-t4))

processing: 0/22438
processing: 1000/22438
processing: 2000/22438
processing: 3000/22438
processing: 4000/22438
processing: 5000/22438
processing: 6000/22438
processing: 7000/22438
processing: 8000/22438
processing: 9000/22438
processing: 10000/22438
processing: 11000/22438
processing: 12000/22438
processing: 13000/22438
processing: 14000/22438
processing: 15000/22438
processing: 16000/22438
processing: 17000/22438
processing: 18000/22438
processing: 19000/22438
processing: 20000/22438
processing: 21000/22438
processing: 22000/22438
Task 3 running time = 132.08923029899597


In [43]:
print("The number of maximal frequent itemsets = %s" % len(max_freq))

The number of closed frequent itemsets = 4054


In [81]:
# Generate the task3_output.csv
res = [i for i in zip(max_freq)]
df_res = pd.DataFrame(res, columns=['maximal_items'])
df_res['maximal_items'] = df_res['maximal_items'].apply(lambda x: transfer_to_output(str(x)))
df_res.to_csv("task3_output.csv", index=False)