In [4]:
import numpy as np

In [5]:
class Itemset:
    """
    A transaction's items together with their count.

    The size is captured once at construction time; it is not refreshed if
    the underlying list is mutated afterwards.
    """

    def __init__(self, items: list):
        # Keep a reference to the caller's list (no copy is made).
        self.size = len(items)
        self.items = items

    def getSize(self):
        """Return the number of items recorded at construction."""
        return self.size

    def getItems(self):
        """Return the stored item list."""
        return self.items

In [6]:
def read_transactions(filename):
    """
    Read all transactions from a comma-separated text file.

    Each non-blank line is one transaction; its comma-separated fields are
    returned as strings. Blank (or whitespace-only) lines are skipped.

    @param filename: path of the file to read
    @return: list of transactions, each a list of item strings
    """
    transactions = []
    with open(filename, 'r') as f:
        for line in f:
            stripped = line.strip()
            # A blank line would otherwise become [''], a bogus one-item
            # transaction that breaks the later int() conversion.
            if not stripped:
                continue
            transactions.append(stripped.split(','))
    return transactions

In [7]:
def convert_to_int(txn):
    """
    Cast every item of every transaction to int, in place.

    @param txn: list of transactions (each a mutable list of int-like values)
    @return: the same list object that was passed in, now holding ints
    """
    for row in txn:
        for pos, raw in enumerate(row):
            row[pos] = int(raw)
    return txn


In [8]:
def convert_to_objects(txn):
    """
    Wrap each transaction in an Itemset.

    @param txn: list of transactions
    @return: new list with one Itemset per transaction, in the same order
    """
    return [Itemset(items) for items in txn]

In [9]:
def vec2str(vec):
    """
    Serialise a sequence as a space-separated string, e.g. [1, 20] -> '1 20'.

    @param vec: sequence of items (each is passed through str())
    @return: the joined string with trailing whitespace removed
    """
    return ' '.join(str(item) for item in vec).rstrip()

def str2vec(str: str):
    """
    Parse a whitespace-separated string of integers, e.g. '1 20' -> [1, 20].

    Inverse of vec2str for integer items; in particular the empty string
    (what vec2str([]) returns) parses to [] instead of raising.

    @param str: string of integers separated by whitespace
    @return: list of ints, in the order they appear
    @raise ValueError: if a token is not a valid integer literal
    """
    # NOTE: the parameter shadows the builtin `str` inside this function; the
    # name is kept because callers may pass it by keyword.
    # split() with no argument is evaluated once and tolerates empty input
    # and repeated/leading/trailing whitespace, where split(' ') yields ''
    # tokens that int() rejects.
    return [int(token) for token in str.split()]

def count_spaces(str: str):
    """
    Count the space characters in a string.

    For a non-empty key built by vec2str from items that contain no spaces,
    this is the number of items minus one.

    @param str: string to inspect
    @return: number of ' ' characters
    """
    return sum(1 for ch in str if ch == ' ')

In [10]:
# Load the grocery transactions and cast every item to int.
txns = convert_to_int(read_transactions('data/grocery.txt'))
# Object view of the same transactions (one Itemset per transaction).
txns_itemsets = convert_to_objects(txns)

In [11]:
# txns
# txns_itemsets  # TODO: find a better name for this variable

In [12]:
# Sample code table (itemset key -> support) used to exercise the
# Standard Cover Order sort.
d = {'1': 2, '10': 2, '5': 2, '20': 1, '1 20': 1, '5 7': 1, '9 10': 1,'1 21': 1}
d.items()


dict_items([('1', 2), ('10', 2), ('5', 2), ('20', 1), ('1 20', 1), ('5 7', 1), ('9 10', 1), ('1 21', 1)])

In [13]:
# Standard Cover Order (SCO): cardinality descending, then support
# descending, then lexicographic ascending on the integer items.
# TODO: try to test this more thoroughly.
# One sort on a composite key gives the same order as three successive
# stable sorts applied from the least to the most significant criterion.
d = dict(sorted(
    d.items(),
    key=lambda kv: (-count_spaces(kv[0]), -kv[1], str2vec(kv[0])),
))

In [14]:
# Display the sample table in its sorted order.
d

{'1 20': 1, '1 21': 1, '5 7': 1, '9 10': 1, '1': 2, '5': 2, '10': 2, '20': 1}

In [15]:
def getStandardCodeTable(txns: list):
    """
    <algorithm 1>

    Build the standard code table: one entry per distinct item in the database.

    @param txns: database / list of transactions (each an iterable of items)
    @return: standard code table SCT
    SCT = { itemName:(code,frequency) , ...}
    Entries are inserted in Standard Cover Order and each code is the 0-based
    position in that order; frequency is the number of occurrences of the
    item over all transactions.
    """
    # tally every occurrence of every item, keyed by its string form
    frequency = {}
    for tx in txns:
        for item in tx:
            name = vec2str([item])
            frequency[name] = frequency.get(name, 0) + 1

    # Standard Cover Order: cardinality descending, support descending,
    # then lexicographic ascending. A single sort on a composite key is
    # equivalent to three stable sorts run least-significant-first.
    ordered = sorted(
        frequency.items(),
        key=lambda kv: (-count_spaces(kv[0]), -kv[1], str2vec(kv[0])),
    )

    # the code of an item is its rank in that order
    return {name: (code, freq) for code, (name, freq) in enumerate(ordered)}


# Module-level tables shared by the cells below. (A `global` statement is
# only meaningful inside a function, so none is needed here.)
SCT = getStandardCodeTable(txns)
# Working code table: starts as a shallow copy of the standard one.
CT = SCT.copy()

In [16]:
# SCT

In [17]:
def support(X, code_table=None, n_transactions=None):
    """
    Relative support of an itemset: its frequency divided by the database size.

    @param X: itemset, either as a code-table key string ('1 20') or as a
              list/tuple of items ([1, 20])
    @param code_table: table { key: (code, frequency) } to look X up in;
                       defaults to the notebook-level SCT
    @param n_transactions: database size; defaults to len(txns)
    @return: frequency / number of transactions
    @raise KeyError: if X has no entry in the table
    @raise ZeroDivisionError: if the database size is 0
    """
    # TODO: look into an OOP-based approach instead of notebook-level globals.
    # The globals are only consulted when the caller passes nothing, so
    # existing support(X) calls behave exactly as before.
    table = SCT if code_table is None else code_table
    total = len(txns) if n_transactions is None else n_transactions
    # Table keys are strings; a list cannot be used as a dict key directly.
    key = vec2str(list(X)) if isinstance(X, (list, tuple)) else X
    return table[key][1] / total

In [18]:
def _sco_parts(X):
    '''Return (items as a list, code-table key string) for an itemset.'''
    if isinstance(X, str):
        return str2vec(X), X
    items = list(X)
    return items, vec2str(items)


def compareSCO(X1, X2):
    '''
    Three-way comparison of two itemsets in the **Standard Cover Order**:
    more items first, then higher support first, then lexicographically
    smaller items first.

    @param X1: itemset as a key string ('1 20') or a sequence of ints ([1, 20])
    @param X2: itemset, same accepted forms as X1
    @return: -1 if X1 comes before X2, 1 if it comes after, 0 if they tie
    '''
    # Compare on parsed items, never on the raw key: len('10') is 2 although
    # it is one item, and '10' < '5' as text although 10 > 5.
    items1, key1 = _sco_parts(X1)
    items2, key2 = _sco_parts(X2)

    # 1) cardinality, descending
    if len(items1) != len(items2):
        return -1 if len(items1) > len(items2) else 1

    # 2) support, descending; support() is looked up by the string key
    support1 = support(key1)
    support2 = support(key2)
    if support1 != support2:
        return -1 if support1 > support2 else 1

    # 3) lexicographic, ascending (list comparison is element-wise)
    if items1 != items2:
        return -1 if items1 < items2 else 1
    return 0

In [32]:
def getStandardCover(txn: list, CT: dict):
    '''
    <algorithm 2>

    Get the standard cover of a transaction: repeatedly take the first
    element of CT (in its iteration order, expected to be the Standard
    Cover Order) that fits inside the still-uncovered items.

    @param txn: A transaction (iterable of int items)
    @param CT:  code table CT; only its keys are read, in iteration order
    @return: a set containing the items of all chosen code table elements
    @raise ValueError: if some item of txn is not covered by any element of CT
    CT = { (str)"items": value, ...}
    '''
    # NOTE(review): the chosen elements are merged into one flat set of items,
    # so a successful result always equals set(txn). If callers need to know
    # *which* code table elements were used, the return shape must change.
    uncovered = set(txn)
    cover = set()
    if not uncovered:
        return cover

    # One pass over CT is enough: `uncovered` only shrinks, so an element
    # that did not fit earlier can never fit later, and an element that was
    # used no longer fits once its items are removed.
    for key in CT:
        items = set(str2vec(key))
        if items and items.issubset(uncovered):
            cover |= items
            uncovered -= items
            if not uncovered:
                return cover

    # Previously this case recursed on the same remainder until Python
    # raised RecursionError.
    raise ValueError('code table cannot cover items: %r' % (uncovered,))

In [33]:
# Smoke test: cover the first transaction with the current code table.
# NOTE(review): the trace lines saved as this cell's output are not produced
# by the getStandardCover defined above (it contains no print calls); re-run
# the cell to refresh the output.
test = getStandardCover(txns[0], CT)

{1, 18, 102, 17}
{1}
3
{17, 18, 102}
{17}
2
{18, 102}
{18}
1


In [34]:
# Show the cover computed for the first transaction.
test

{1, 17, 18, 102}