In [44]:
import numpy as np
from itertools import chain, combinations

In [45]:
class Itemset:
    """A transaction wrapped as an object: its item list plus its length."""

    def __init__(self, items: list):
        # The size is captured once, at construction time; it is not
        # refreshed if `items` is mutated afterwards.
        self.size = len(items)
        self.items = items

    def getSize(self):
        """Return the number of items recorded when the object was built."""
        return self.size

    def getItems(self):
        """Return the stored item list (the same object, not a copy)."""
        return self.items

In [46]:
def read_transactions(filename):
    """
    Read all transactions from a comma-separated text file.

    Every non-blank line is one transaction; its comma-separated fields are
    the items. Whitespace around each item is removed. Empty fields (for
    example from a trailing comma) and blank lines are skipped, so that no
    '' item is produced that a later int() conversion would reject.

    @param filename: path of the file to read
    @return: list of transactions, each a list of item strings
    """
    transactions = []
    with open(filename, 'r') as f:
        for line in f:
            fields = (field.strip() for field in line.split(','))
            items = [field for field in fields if field]
            # A line without any item is not a transaction.
            if items:
                transactions.append(items)
    return transactions

In [47]:
def convert_to_int(txns):
    """
    Convert every item of every transaction to int and sort each transaction.

    The conversion is done in place: the inner lists of `txns` are modified
    and the same outer list is returned.

    @param txns: list of transactions (lists of values accepted by int())
    @return: the same `txns` object, now holding sorted lists of ints
    """
    for txn in txns:
        for position, raw in enumerate(txn):
            txn[position] = int(raw)
        # ascending order within the transaction
        txn.sort()
    return txns


In [48]:
def convert_to_objects(txn):
    """
    Wrap each transaction in an Itemset object.

    @param txn: list of transactions
    @return: list of Itemset objects, one per transaction, in the same order
    """
    return [Itemset(items) for items in txn]

In [49]:
def vec2str(vec):
    """
    Serialise an itemset into its string key, e.g. [1, 20] -> '1 20'.

    @param vec: iterable of items (list, tuple, generator, ...); each item is
                rendered with str()
    @return: the items joined by single spaces, without trailing whitespace;
             an empty iterable gives ''
    """
    # str.join builds the key in one pass (no repeated string concatenation)
    # and works on any iterable, not only on indexable sequences.
    # rstrip() keeps the key free of trailing whitespace.
    return ' '.join(str(item) for item in vec).rstrip()

def str2vec(str: str):
    """
    Parse an itemset key back into a list of ints, e.g. '1 20' -> [1, 20].

    The key is split on runs of whitespace, so '' gives [] (the inverse of
    vec2str([])) and repeated or surrounding spaces are tolerated.

    @param str: whitespace-separated integers
                (the name shadows the builtin `str` inside this function; it
                is kept because it is part of the signature)
    @return: list of ints in the order they appear in the key
    @raise ValueError: if a token is not a valid integer literal
    """
    # Split once instead of re-splitting the string on every iteration.
    return [int(token) for token in str.split()]

def count_spaces(str: str):
    """
    Count the space characters in a key.

    For a key such as '1 20' this is the number of items minus one.

    @param str: the key to inspect
    @return: number of ' ' characters
    """
    return sum(1 for char in str if char == ' ')

In [50]:
# Load the transactions (lists of item strings) from the grocery data file.
txns = read_transactions('data/grocery_500.txt')
# Keep only the first 100 transactions.
txns = txns[:100]
# Turn every item into an int and sort each transaction
# (convert_to_int modifies the lists in place and returns them).
txns = convert_to_int(txns)
# Same transactions, each wrapped in an Itemset object.
txns_itemsets = convert_to_objects(txns)

In [51]:
vec2str("12 15")

'1 2   1 5'

In [52]:
txns
# txn_itemsets # find a better name for this

[[1, 17, 18, 102],
 [1, 23, 79],
 [24, 27],
 [18, 95],
 [37, 61],
 [8, 44],
 [1, 4, 18],
 [1, 37],
 [7, 82],
 [8, 37, 73],
 [3, 74],
 [1, 10],
 [0, 34],
 [15, 52],
 [4, 18],
 [21, 38],
 [35, 129],
 [4],
 [76, 84],
 [24, 94],
 [3, 95],
 [0, 1, 2],
 [4, 13, 31],
 [3, 70],
 [1, 4, 12, 13],
 [1, 21, 23],
 [4],
 [69, 73],
 [4, 16, 151],
 [1, 158],
 [70, 147],
 [1, 4, 8, 9, 13, 33, 49],
 [20, 50, 62, 106],
 [0, 17, 21, 37, 55, 67, 68],
 [17, 36, 43],
 [0, 17],
 [23, 145],
 [100, 150],
 [2, 26, 63],
 [75, 131],
 [14, 16],
 [8, 16],
 [2, 4, 137],
 [23, 30],
 [1, 3, 8, 17, 36, 39, 137],
 [4, 6, 16, 64],
 [8, 17],
 [0, 4, 21, 65, 70],
 [19, 74],
 [1, 40],
 [1, 35],
 [56, 68, 82, 130],
 [21, 128],
 [3, 122],
 [16, 58, 113],
 [1, 64],
 [0, 74],
 [73, 122],
 [1, 17, 18, 49],
 [24, 101],
 [15, 20],
 [1, 4],
 [1, 6],
 [7, 79],
 [4, 11, 13],
 [49, 63],
 [9, 31],
 [2, 127],
 [2, 4, 8, 72, 128],
 [19, 81],
 [3, 4],
 [7, 17],
 [3, 21],
 [37, 50, 70],
 [44, 118],
 [48, 68],
 [15, 21],
 [25, 153],
 [20, 31

In [53]:
# sample dictionary: itemset key -> count, used to try out the sort order
# NOTE: this first assignment is overwritten by the next line and has no effect.
d = {'1': 24, '10': 86, '5': 33, '20': 34, '1 20': 34, '5 7': 36, '9 10': 37,'1 21': 37}
d = {'1': 2, '10': 2, '5': 2, '20': 1, '1 20': 1, '5 7': 1, '9 10': 1,'1 21': 1}
d.items()


dict_items([('1', 2), ('10', 2), ('5', 2), ('20', 1), ('1 20', 1), ('5 7', 1), ('9 10', 1), ('1 21', 1)])

In [54]:
# SCO (Standard Cover Order) = decreasing by count (number of items in the key),
# then decreasing by support, then lexicographic.
# The three sorts below run from the least to the most significant criterion.
# Python's sort is stable (also with reverse=True), so each later sort keeps
# the order set by the earlier ones among entries with equal keys.
# TODO: test this more thoroughly
# lexicographic, on the parsed integers (the plain string sort below would
# compare the keys character by character instead):
# d = { k:v for k,v in sorted(d.items(), key=lambda x: x[0])}
d = { k:v for k,v in sorted(d.items(), key=lambda x: str2vec(x[0]))}
# by support
d = { k:v for k,v in sorted(d.items(), key=lambda x: x[1], reverse=True)}
# by count (count_spaces = number of items in the key minus one)
d = { k:v for k,v in sorted(d.items(), key=lambda x: count_spaces(x[0]), reverse=True)}

In [55]:
d

{'1 20': 1, '1 21': 1, '5 7': 1, '9 10': 1, '1': 2, '5': 2, '10': 2, '20': 1}

In [56]:
def getStandardCodeTable(txns: list):
    """
    <algorithm 1>

    Build the standard code table: one entry per distinct item of the database.

    @param txns: database / list of transactions
    @return: standard code table SCT
    SCT = { "item": number of occurrences in txns, ... }
    The entries are ordered by decreasing number of items in the key (always
    one here), then decreasing occurrence count, then increasing item number.
    No actual code is stored, only the count.
    """
    # count how often every item occurs in the transactions
    occurrences = {}
    for txn in txns:
        for item in txn:
            name = vec2str([item])
            occurrences[name] = occurrences.get(name, 0) + 1

    def cover_order(entry):
        # One composite key instead of three successive stable sorts:
        # key length (descending), count (descending), item number (ascending).
        name, count = entry
        return (-count_spaces(name), -count, str2vec(name))

    return dict(sorted(occurrences.items(), key=cover_order))


# NOTE: a `global` statement at module level has no effect; SCT is a
# module-level name either way. support() and LCTD() read it directly.
global SCT
SCT = getStandardCodeTable(txns)
# CT = SCT.copy()

In [57]:
SCT

{'4': 19,
 '1': 18,
 '17': 11,
 '21': 9,
 '0': 8,
 '3': 8,
 '18': 8,
 '8': 7,
 '24': 6,
 '37': 6,
 '70': 6,
 '73': 6,
 '2': 5,
 '16': 5,
 '7': 4,
 '10': 4,
 '13': 4,
 '23': 4,
 '35': 4,
 '44': 4,
 '49': 4,
 '68': 4,
 '9': 3,
 '11': 3,
 '15': 3,
 '20': 3,
 '31': 3,
 '50': 3,
 '63': 3,
 '64': 3,
 '74': 3,
 '79': 3,
 '82': 3,
 '6': 2,
 '12': 2,
 '14': 2,
 '19': 2,
 '25': 2,
 '26': 2,
 '36': 2,
 '43': 2,
 '61': 2,
 '75': 2,
 '94': 2,
 '95': 2,
 '113': 2,
 '122': 2,
 '128': 2,
 '129': 2,
 '137': 2,
 '27': 1,
 '28': 1,
 '30': 1,
 '33': 1,
 '34': 1,
 '38': 1,
 '39': 1,
 '40': 1,
 '41': 1,
 '48': 1,
 '52': 1,
 '55': 1,
 '56': 1,
 '58': 1,
 '59': 1,
 '60': 1,
 '62': 1,
 '65': 1,
 '67': 1,
 '69': 1,
 '72': 1,
 '76': 1,
 '77': 1,
 '81': 1,
 '84': 1,
 '100': 1,
 '101': 1,
 '102': 1,
 '106': 1,
 '112': 1,
 '118': 1,
 '127': 1,
 '130': 1,
 '131': 1,
 '136': 1,
 '145': 1,
 '147': 1,
 '150': 1,
 '151': 1,
 '153': 1,
 '158': 1}

In [58]:
def support(X):
    """
    Relative support of X: its count in the global SCT divided by the number
    of transactions in the global `txns`.

    @param X: a key of SCT
    @return: SCT[X] / len(txns)
    @raise KeyError: if X is not a key of SCT
    """
    # [TODO] SCT and txns are read from module scope; an OOP based approach
    # could pass them in properly.
    count = SCT[X]
    return count / len(txns)

In [59]:
def compareSCO(X1, X2):
    '''
    Compare two transactions in the **Standard Cover Order**.

    Order of the criteria: longer first, then higher support first, then
    element-wise smaller first.

    @return: -1 if X1 comes before X2, 1 if it comes after, 0 if they are equal

    NOTE(review): X1 and X2 are used both as sized, indexable sequences and
    as arguments to support() -- confirm which representation callers pass.
    '''
    size1, size2 = len(X1), len(X2)
    if size1 != size2:
        return -1 if size1 > size2 else 1

    support1, support2 = support(X1), support(X2)
    if support1 != support2:
        return -1 if support1 > support2 else 1

    # same length and same support: the first differing element decides
    for left, right in zip(X1, X2):
        if left != right:
            return -1 if left < right else 1
    return 0

In [60]:
def strset2intset(strset: set):
    """
    Flatten a set of itemset keys into the set of items they mention,
    e.g. {'1 20', '5'} -> {1, 5, 20}.

    The result holds plain Python ints, so it combines directly with sets
    built from transactions (no detour through a float numpy array).

    @param strset: iterable of keys as produced by vec2str
    @return: set of int items; an empty input gives an empty set
    """
    return {item for key in strset for item in str2vec(key)}

In [61]:
def getCover(txn: list, CT: dict):
    '''
    <algorithm 2>

    Get the standard cover of a transaction.

    The code table is scanned once, in its iteration order. Every entry whose
    items are all still uncovered is added to the cover and its items are
    removed from the remainder. This picks the same entries as repeatedly
    restarting the scan from the top: an entry that did not fit the
    remainder cannot fit a smaller remainder later on.

    @param txn: A transaction
    @param CT:  code table CT; only its keys and their order are used
    @return: a set, standard cover (the CT keys that cover txn)
    @raise ValueError: if some item of txn is not covered by any entry of CT
    CT = { (str)"items": value, ...}
    '''
    remaining = set(txn)
    cover = set()

    for key in CT:
        # t \ S = ∅ : nothing left to cover
        if not remaining:
            break
        items = set(str2vec(key))
        # S ← next element X of CT in Standard Cover Order for which X ⊆ t
        if items and items <= remaining:
            cover.add(key)
            remaining -= items

    if remaining:
        # Without this check an uncoverable item would never be removed and
        # the cover could not terminate.
        raise ValueError(
            "items %r are not covered by any code table entry" % (remaining,)
        )
    return cover

In [62]:
test = getCover(txns[5], SCT)
test

{'44', '8'}

In [63]:
def properPowerset(iterable):
    '''
    All subsets with at least two elements, smallest subsets first.

    properPowerset([1,2,3]) --> (1,2) (1,3) (2,3) (1,2,3)
    The empty set and the single-element subsets are not included.
    '''
    pool = list(iterable)
    subsets = []
    for size in range(2, len(pool) + 1):
        subsets.extend(combinations(pool, size))
    return subsets

In [64]:
properPowerset([1,2,4])

[(1, 2), (1, 4), (2, 4), (1, 2, 4)]

In [65]:
def getCandidateSet(txns: list):
    """
    Candidate set F: every itemset of two or more items that occurs in a
    transaction, in Standard Candidate Order.

    @param txns: database / list of transactions
    @return: candidate set F
    F = { "items": count, ... }
    where count is the number of times the itemset was generated from a
    transaction. Entries are ordered by decreasing count, then decreasing
    number of items, then lexicographically on the item numbers.
    """
    counts = {}
    for txn in txns:
        # every subset of size >= 2 of this transaction counts once
        for subset in properPowerset(txn):
            key = vec2str(subset)
            counts[key] = counts.get(key, 0) + 1

    def candidate_order(entry):
        # One composite key instead of three successive stable sorts:
        # count (descending), key length (descending), item numbers (ascending).
        key, count = entry
        return (-count, -count_spaces(key), str2vec(key))

    return dict(sorted(counts.items(), key=candidate_order))


F = getCandidateSet(txns)

In [66]:
# F

In [67]:
def LDCT(txns, encoding, xdict):
    """
    L(D | CT): encoded length of the database.

    For every transaction, add log(total usage / usage(X)) (natural log) for
    each code table element X in the transaction's cover.

    @param txns: database / list of transactions
    @param encoding: dict mapping vec2str(txn) to the cover of txn
    @param xdict: dict mapping code table element X to its usage
    @return: the summed code lengths (0 if there is nothing to encode)
    """
    total_usage = sum(xdict.values())
    length = 0
    for txn in txns:
        for X in encoding[vec2str(txn)]:
            length += np.log(total_usage / xdict[X])
    return length
            

In [68]:
def LCTD(txns, encoding, xdict):
    """
    L(CT | D): encoded length of the code table.

    For every element X in xdict, add
      - the length of X under the standard code table: for each item of X,
        log(sum of all SCT counts / SCT[item]), and
      - the length of X's own code: log(total usage / usage(X)).
    Natural logarithms are used. SCT is read from module scope.

    @param txns: not used
    @param encoding: not used
    @param xdict: dict mapping code table element X to its usage
    @return: the summed lengths (0 for an empty xdict)
    """
    total_usage = sum(xdict.values())
    length = 0
    for X, usage in xdict.items():
        sct_total = sum(SCT.values())
        standard_length = 0
        for item in X.strip().split():
            standard_length += np.log(sct_total / SCT[item])
        length += standard_length
        length += np.log(total_usage / usage)
    return length

In [69]:
def getL(txns: list, encoding, xdict):
    """
    Total encoded length: L(D | CT) + L(CT | D).

    @param txns: database / list of transactions
    @param encoding: dict mapping vec2str(txn) to the cover of txn
    @param xdict: dict mapping code table element X to its usage
    @return: LDCT(...) + LCTD(...)
    """
    database_length = LDCT(txns, encoding, xdict)
    table_length = LCTD(txns, encoding, xdict)
    return database_length + table_length

In [70]:
def coverall(CT: dict, txns: list):
    '''
    Cover every transaction with the code table CT.

    @param CT: code table
    @param txns: database / list of transactions
    @return a dict mapping vec2str(txn) to the cover of txn, and a dict of
            cover stats {X: usage}

    @definition usage: the number of transactions that contain X in their cover
    '''
    usage = {}
    covers = {}
    for txn in txns:
        txn_cover = getCover(txn, CT)
        covers[vec2str(txn)] = txn_cover
        for X in txn_cover:
            usage[X] = usage.get(X, 0) + 1

    return covers, usage

In [78]:
# a,b = coverall(SCT, txns)

In [77]:
# b

In [73]:
def krimp(txns: list, F: dict, SCT: dict):
    """
    <algorithm 3>
    Our main KRIMP algorithm

    Starting from the standard code table, every candidate of F is tried in
    turn. A candidate is kept only if the total encoded length getL() of the
    database plus code table becomes strictly smaller with it.

    @param txns: database / list of transactions
    @param F: candidate set F, { "items": support, ... }, iterated in its
              own order (Standard Candidate Order)
    @param SCT: standard code table SCT, { "item": count, ... }; not modified
    @return: compressed code table CT, a new dict in Standard Cover Order
             whenever at least one candidate was accepted
    """

    def cover_order(entry):
        # Standard Cover Order: more items first, then higher support,
        # then lexicographic on the item numbers.
        key, value = entry
        return (-count_spaces(key), -value, str2vec(key))

    CT = SCT.copy()
    # L(D, CT) for the current table; computed on first use and refreshed
    # only when CT changes, instead of re-covering the database every time.
    ct_length = None

    for key, value in F.items():
        if ct_length is None:
            encoding_ct, dictx_ct = coverall(CT, txns)
            ct_length = getL(txns, encoding_ct, dictx_ct)

        # CTc ← (CT ∪ F) in Standard Cover Order
        CTc = CT.copy()
        CTc[key] = value
        # The cover takes the first matching entry of the table. A candidate
        # left at the end, behind the single items, would never be used, so
        # the table is re-sorted to put it at its proper position.
        CTc = dict(sorted(CTc.items(), key=cover_order))

        # cover every txn and get cover stats
        encoding_ctc, dictx_ctc = coverall(CTc, txns)
        ctc_length = getL(txns, encoding_ctc, dictx_ctc)
        if ctc_length < ct_length:
            CT = CTc
            ct_length = ctc_length

    return CT

In [74]:
ans = krimp(txns, F, SCT)

In [75]:
ans

{'4': 19,
 '1': 18,
 '17': 11,
 '21': 9,
 '0': 8,
 '3': 8,
 '18': 8,
 '8': 7,
 '24': 6,
 '37': 6,
 '70': 6,
 '73': 6,
 '2': 5,
 '16': 5,
 '7': 4,
 '10': 4,
 '13': 4,
 '23': 4,
 '35': 4,
 '44': 4,
 '49': 4,
 '68': 4,
 '9': 3,
 '11': 3,
 '15': 3,
 '20': 3,
 '31': 3,
 '50': 3,
 '63': 3,
 '64': 3,
 '74': 3,
 '79': 3,
 '82': 3,
 '6': 2,
 '12': 2,
 '14': 2,
 '19': 2,
 '25': 2,
 '26': 2,
 '36': 2,
 '43': 2,
 '61': 2,
 '75': 2,
 '94': 2,
 '95': 2,
 '113': 2,
 '122': 2,
 '128': 2,
 '129': 2,
 '137': 2,
 '27': 1,
 '28': 1,
 '30': 1,
 '33': 1,
 '34': 1,
 '38': 1,
 '39': 1,
 '40': 1,
 '41': 1,
 '48': 1,
 '52': 1,
 '55': 1,
 '56': 1,
 '58': 1,
 '59': 1,
 '60': 1,
 '62': 1,
 '65': 1,
 '67': 1,
 '69': 1,
 '72': 1,
 '76': 1,
 '77': 1,
 '81': 1,
 '84': 1,
 '100': 1,
 '101': 1,
 '102': 1,
 '106': 1,
 '112': 1,
 '118': 1,
 '127': 1,
 '130': 1,
 '131': 1,
 '136': 1,
 '145': 1,
 '147': 1,
 '150': 1,
 '151': 1,
 '153': 1,
 '158': 1}

In [76]:
st = '12 105'
st.strip().split()

['12', '105']