## Imports

In [None]:
from time import perf_counter, time

import matplotlib.pyplot as plt

from apriori import A_Priori

In [None]:
# Load the baskets from sale_data.dat: every line of the file becomes one
# basket, stored as the set of integers found on that line.

with open('sale_data.dat') as f:
    data: list[set[int]] = [set(map(int, line.split())) for line in f]

n_baskets = len(data)
print(f'The number of baskets is: {n_baskets}')

# Show the first four baskets as a quick sanity check.
print(data[:4])

## Find frequent itemsets

In [None]:
# Mining parameters.
s = 500  # support threshold as an absolute basket count (the mlxtend check uses s/n_baskets)
c = 0.5  # confidence threshold passed to the rule miner
k = 3    # maximum itemset size (the mlxtend check passes it as max_len)

# Time the search with perf_counter: it is monotonic and high-resolution,
# whereas time() follows the system clock, which can be adjusted mid-run.
start = perf_counter()
frequent_itemsets = A_Priori.get_frequent_itemsets(data, k, s)
delta_t = perf_counter() - start

print(f'The itemsets until k={k} have been found in {delta_t} seconds')

## Mine frequent rules

In [None]:
# Time the rule mining with perf_counter, the clock intended for measuring
# durations (time() follows the system clock, which can be adjusted mid-run).
start = perf_counter()
rules = A_Priori.mine_frequent_rules(frequent_itemsets, c)
delta_t = perf_counter() - start

print(f'The rules have been found in {delta_t} seconds')

## Check the results with an existing library

In [None]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules

# Encode the baskets as a table whose columns are the ones reported by the encoder.
te = TransactionEncoder()
te.fit(data)
te_ary = te.transform(data)
df = pd.DataFrame(te_ary, columns=te.columns_)  # type: ignore

# s is an absolute basket count, so it is divided by n_baskets before being
# passed as min_support; k and c are passed as max_len and min_threshold.
ml_fp = fpgrowth(df, min_support=s/n_baskets, use_colnames=True, max_len=k)
ml_rules = association_rules(ml_fp, metric="confidence", min_threshold=c)

# Compare the itemset counts of the two implementations.
print(f'The number of frequent itemsets found by the mlxtend library is: {len(ml_fp)}')
n_itemsets_apriori = sum(len(itemsets) for itemsets in frequent_itemsets.values())
print(f'The number of frequent itemsets found by the apriori algorithm is: {n_itemsets_apriori}\n')

# Compare the rule counts of the two implementations.
print(f'The number of rules found by the mlxtend library is: {len(ml_rules)}')
n_rules_apriori = sum(len(found) for found in rules.values())
print(f'The number of rules found by the apriori algorithm is: {n_rules_apriori} \n')

## Some plots

In [None]:
# Parameter sweeps that produce the three series plotted further down:
# itemset count vs s, running time vs k, and rule count vs c.
# NOTE: the loops rebind the notebook-level names s, k, c, frequent_itemsets
# and rules; s and k are reset explicitly between sweeps.

# fix k and change s, record number of frequent itemsets
k = 4
s_vals = [100, 250, 500, 1000, 5000]
n_frequent_itemsets = []
for s in s_vals:
    frequent_itemsets = A_Priori.get_frequent_itemsets(data, k, s)
    n_frequent_itemsets.append(sum(len(v) for v in frequent_itemsets.values()))

# fix s and change k, record time
# (perf_counter is monotonic; time() follows the adjustable system clock)
s = 500
k_vals = [2, 3, 4, 5]
time_spent = []
for k in k_vals:
    start = perf_counter()
    A_Priori.get_frequent_itemsets(data, k, s)
    time_spent.append(perf_counter() - start)

# fix s and k, change c and record the number of rules it finds
k = 4
s = 500
c_vals = [0.5, 0.6, 0.7, 0.8, 0.9]
# the itemsets do not depend on c, so they are computed once for the whole sweep
fi = A_Priori.get_frequent_itemsets(data, k, s)
n_rules = []
for c in c_vals:
    rules = A_Priori.mine_frequent_rules(fi, c)
    n_rules.append(sum(len(found) for found in rules.values()))

In [None]:
# Total number of rules currently held in `rules`. Each key is an itemset and
# its value holds one antecedent per rule, so summing the sizes counts the
# rules without formatting a throwaway string for each one.
print(sum(len(antecedents) for antecedents in rules.values()))

In [None]:
# Plot the three series recorded by the parameter sweeps. The titles state
# the values the sweeps actually used: k=4 for the s sweep, and s=500 for the
# k and c sweeps.

# Number of frequent itemsets as the support threshold grows.
plt.plot(s_vals, n_frequent_itemsets, '-x')
plt.title('Number of frequent itemsets vs s, using k=4')
plt.xlabel('s')
plt.ylabel('number of frequent itemsets')
plt.grid()
plt.show()

# Running time of the itemset search as the maximum itemset size grows.
plt.plot(k_vals, time_spent, '-x')
plt.title('Time spent vs k, using s=500')
plt.xlabel('k')
plt.ylabel('time')
plt.grid()
plt.show()

# Number of rules as the confidence threshold grows.
plt.plot(c_vals, n_rules, '-x')
plt.title('Number of rules vs c, using s=500 and k=4')
plt.xlabel('c')
plt.ylabel('number of rules')
plt.grid()
plt.show()
