In [157]:
import numpy as np
from itertools import chain, combinations

In [158]:
class Itemset:
    def __init__(self, items: list):
        self.items = items
        self.size = len(items)
    
    def getItems(self):
        return self.items
    def getSize(self):
        return self.size

In [159]:
# read all transactions from comma-separated file
def read_transactions(filename):
    transactions = []
    with open(filename, 'r') as f:
        for line in f:
            transactions.append(line.strip().split(','))
    return transactions

In [160]:
# convert all elements in txn from string to int
def convert_to_int(txns):
    for i in range(len(txns)):
        for j in range(len(txns[i])):
            txns[i][j] = int(txns[i][j])
        txns[i].sort()
    return txns


In [161]:
def convert_to_objects(txn):
    ret = []
    for i in range(len(txn)):
        ret.append(Itemset(txn[i]))
    return ret

In [162]:
def vec2str(vec):
    ret = ''
    for i in range(len(vec)):
        ret += str(vec[i]) + ' '
    ret = ret.rstrip()
    return ret

def str2vec(str: str):
    ret = []
    for i in range(len(str.split(' '))):
        ret.append(int(str.split(' ')[i]))
    return ret

def count_spaces(str: str):
    return str.count(' ')

In [163]:
txns = read_transactions('data/grocery.txt')
txns = convert_to_int(txns)
txns_itemsets = convert_to_objects(txns)

In [164]:
# txns
# txn_itemsets # find a better name for this

In [165]:
# sample dictionary
d = {'1': 24, '10': 86, '5': 33, '20': 34, '1 20': 34, '5 7': 36, '9 10': 37,'1 21': 37}
d = {'1': 2, '10': 2, '5': 2, '20': 1, '1 20': 1, '5 7': 1, '9 10': 1,'1 21': 1}
d.items()


dict_items([('1', 2), ('10', 2), ('5', 2), ('20', 1), ('1 20', 1), ('5 7', 1), ('9 10', 1), ('1 21', 1)])

In [166]:
# SCO = decreasing by count, decreasing by support, lexicographic
# Try to test more thoroughly
# lexicographic
# d = { k:v for k,v in sorted(d.items(), key=lambda x: x[0])}
d = { k:v for k,v in sorted(d.items(), key=lambda x: str2vec(x[0]))}
# by support
d = { k:v for k,v in sorted(d.items(), key=lambda x: x[1], reverse=True)}
#by count
d = { k:v for k,v in sorted(d.items(), key=lambda x: count_spaces(x[0]), reverse=True)}

In [167]:
d

{'1 20': 1, '1 21': 1, '5 7': 1, '9 10': 1, '1': 2, '5': 2, '10': 2, '20': 1}

In [168]:
def getStandardCodeTable(txns: list):
    """
    <algorithm 1>
    
    Get the standard code table from the database.

    @param txns: database / list of transactions
    @return: standard code table SCT
    SCT = { itemName:(code,frequency) , ...}
    """
    # get the unique items in transactions
    d = {}
    for tx in txns:
        for item in tx:
            key = vec2str([item])
            if(key not in d):
                d[key] = 1
            else:
                d[key]+=1
    
    # sort items by their frequency
    d = { k:v for k,v in sorted(d.items(), key=lambda x: str2vec(x[0]))}
    # by support
    d = { k:v for k,v in sorted(d.items(), key=lambda x: x[1], reverse=True)}
    #by count
    d = { k:v for k,v in sorted(d.items(), key=lambda x: count_spaces(x[0]), reverse=True)}
    # print(d)

    # get the standard code table, we don't need the actual code rn
    code_table = {}
    # actual_code = 0
    for key, value in d.items():
        code_table[key] = value
        # code_table[key] = (actual_code,value)
        # actual_code+=1

    return code_table


global SCT
SCT = getStandardCodeTable(txns)
# CT = SCT.copy()

In [169]:
SCT

{'1': 2363,
 '3': 1827,
 '4': 1646,
 '37': 1453,
 '17': 1285,
 '21': 1041,
 '0': 1014,
 '16': 908,
 '18': 903,
 '6': 795,
 '23': 774,
 '2': 734,
 '70': 712,
 '24': 702,
 '49': 678,
 '73': 654,
 '43': 582,
 '8': 565,
 '19': 563,
 '22': 555,
 '68': 555,
 '10': 527,
 '11': 509,
 '7': 508,
 '44': 504,
 '69': 482,
 '26': 473,
 '74': 419,
 '9': 417,
 '82': 359,
 '59': 354,
 '13': 353,
 '50': 353,
 '113': 331,
 '20': 327,
 '25': 326,
 '63': 320,
 '65': 303,
 '79': 281,
 '114': 277,
 '118': 268,
 '34': 265,
 '15': 263,
 '28': 256,
 '56': 252,
 '35': 251,
 '48': 248,
 '52': 239,
 '27': 236,
 '57': 227,
 '72': 223,
 '58': 220,
 '39': 216,
 '64': 215,
 '75': 210,
 '14': 209,
 '95': 205,
 '99': 180,
 '103': 177,
 '55': 175,
 '36': 158,
 '31': 157,
 '40': 152,
 '127': 150,
 '33': 146,
 '102': 142,
 '135': 135,
 '61': 134,
 '38': 129,
 '12': 127,
 '45': 121,
 '77': 121,
 '5': 117,
 '100': 115,
 '141': 106,
 '60': 103,
 '94': 102,
 '88': 100,
 '53': 98,
 '90': 92,
 '122': 92,
 '153': 92,
 '71': 89,
 

In [170]:
def support(X):
    # [TODO] How do we get SCT here properly?
    # look into OOP based approach
    return SCT[X]/len(txns)

In [171]:
def compareSCO(X1, X2):
    '''
    Compare two transactions in the **Standard Cover Order**
    '''
    if len(X1) != len(X2):
        if len(X1) > len(X2):
            return -1
        else:
            return 1
    elif support(X1) != support(X2):
        if support(X1) > support(X2):
            return -1
        else:
            return 1
    else:
        for i in range(len(X1)):
            if X1[i] != X2[i]:
                if X1[i] < X2[i]:
                    return -1
                else:
                    return 1
        return 0

In [172]:
def strset2intset(strset: set):
    ret = np.array([])
    for el in strset:
        ret = np.append(ret,str2vec(el))
    return set(np.ravel(ret))

In [173]:
def getCover(txn: list, CT: dict):
    '''
    <algorithm 2>

    Get the standard cover of a transaction.
    
    @param txn: A transaction
    @param CT:  code table CT
    @return: a set, standard cover
    CT = { (str)"items":(int)code, ...}
    '''
    # get the standard code
    code = set()
    
    # S ← smallest element X of CT in Standard Cover Order for which X ⊆ t
    for k,v in CT.items():
        if set(str2vec(k)).issubset(set(txn)):
            # add string to set <can this be better?>
            code.add(k)
            # code = set(str2vec(k))
            break
    # if t \ S = ∅ then
    if len(set(txn) - strset2intset(code)) == 0:
        return code
    else:
        # Res ← {S} ∪ StandardCover(t \ S, CT )
        return code.union(getCover(list(set(txn) - strset2intset(code)), CT))

In [174]:
test = getCover(txns[0], SCT)
test

{'1', '102', '17', '18'}

In [175]:
def properPowerset(iterable):
    '''
    powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    '''
    s = list(iterable)
    return list(chain.from_iterable(combinations(s, r) for r in range(2,len(s)+1)))

In [176]:
properPowerset([1,2,4])

[(1, 2), (1, 4), (2, 4), (1, 2, 4)]

In [177]:
def getCandidateSet(txns: list):
    """
    
    candidate set F = D in Standard Candidate Order.

    @param txns: database / list of transactions
    @return: standard code table SCT
    SCT = { itemName:(code,frequency) , ...}
    """
    # get the unique items in transactions
    d = {}
    for tx1 in txns:
        # calculate support
        for ps in properPowerset(tx1):
            key = vec2str(ps)
            if(key not in d):
                d[key] = 1
            else:
                d[key] += 1
    
    # Standard Candidate Order =  decreasing by support, decreasing by count, lexicographic
    # sort items by their frequency
    d = { k:v for k,v in sorted(d.items(), key=lambda x: str2vec(x[0]))}
    #by count
    d = { k:v for k,v in sorted(d.items(), key=lambda x: count_spaces(x[0]), reverse=True)}
    # by support(X) = number of transactions that contain X
    d = { k:v for k,v in sorted(d.items(), key=lambda x: x[1], reverse=True)}
    # print(d)

    return d


F = getCandidateSet(txns)

In [178]:
# F

In [179]:
def LDCT(txns, encoding, xdict):
    ret = 0
    for txn in txns:
        for X in encoding[vec2str(txn)]:
            ret += np.log(sum(xdict.values())/xdict[X])

    return ret
            

In [180]:
def LCTD(txns, encoding, xdict):
    # L(CT | D) = sum                         < L(code_ST(X))(->????!!!!!) + L(code_CT(X)) >
    #                (for X ∈ CT :usage(X)!=0)  
    ret = 0
    for X in xdict.keys():
        LcodeSTX = 0
        for item in X:
            LcodeSTX += np.log(sum(SCT.values())/SCT[item])
        ret += LcodeSTX
        ret += np.log(sum(xdict.values())/xdict[X])

    return ret

In [181]:
def getL(txns: list, encoding, xdict):
    return LDCT(txns, encoding, xdict) + LCTD(txns, encoding, xdict)

In [182]:
def coverall(CT: dict, txns: list):
    '''
    @return a dict of items and their cover, a dict of cover stats {X: (usage)}

    @definition usage: the number of transactions that contain X in their cover
    '''
    xCov = {}
    covDict = {}
    for txn in txns:
        cov = getCover(txn, CT)
        # print(cov)
        # print(txn)
        # txn.sort()
        # print(txn)
        # is tuple() better than vec2str() ?
        covDict[vec2str(txn)] = cov
        for X in cov:
            if X not in xCov:
                xCov[X] = 1
            else:
                xCov[X] += 1
    
    return covDict, xCov

In [185]:
a,b = coverall(SCT, txns)

In [188]:
LDCT(txns, SCT, b)

KeyError: '1 17 18 102'

In [189]:
def krimp(txns: list, F: dict, SCT: dict):
    """
    <algorithm 3>
    Our main KRIMP algorithm
    @param txns: database / list of transactions
    @param F: candidate set F
    @param SCT: standard code table SCT
    @return: compressed code table CT
    """

    CT = SCT.copy()
    for key, value in F.items():
        # CTc ← (CT ∪ F) in Standard Cover Order
        CTc = CT.copy()
        CTc[key] = value
        # cover every txn and get cover stats
        encoding_ctc, dictx_ctc = coverall(CTc, txns)
        encoding_ct, dictx_ct = coverall(CT, txns)
        if getL(txns, encoding_ctc, dictx_ctc) < getL(encoding_ct, dictx_ct, CT):
            CT = CTc

    return CT

In [190]:
ans = krimp(txns, F, SCT)

KeyError: '1   1 7   1 8   1 0 2'