In [10]:
import numpy as np
import pandas as pd
from functools import reduce

In [11]:
def supports_vectorized(itemsets, matrix, minSup=0):
    """Compute itemset supports column-wise and keep those reaching ``minSup``.

    For every itemset, the columns ``matrix[item]`` of its items are combined
    with ``&`` and the combined column is summed to give the support count.

    Parameters
    ----------
    itemsets : re-iterable collection of hashable, non-empty itemsets
        Each itemset is iterated to obtain the column labels it contains.
    matrix : object indexable by item label
        ``matrix[item]`` must return an iterable column that supports ``&``
        (the rest of this notebook passes a pandas DataFrame).
    minSup : number, default 0
        Absolute support threshold; itemsets with a smaller count are dropped.

    Returns
    -------
    dict
        Maps each retained itemset to its support count.
    """
    counts = dict.fromkeys(itemsets, 0)
    for candidate in itemsets:
        columns = [matrix[member] for member in candidate]
        # AND the item columns together, then count the surviving rows.
        counts[candidate] = sum(reduce(lambda left, right: left & right, columns))
    return {
        candidate: count
        for candidate, count in counts.items()
        if not count < minSup
    }

In [12]:
def supports(itemsets, matrix, minSup=0):
    """Row-by-row support counting (slow counterpart of ``supports_vectorized``).

    A row counts towards an itemset when AND-ing the row's values for all of
    the itemset's items yields 1.

    Parameters
    ----------
    itemsets : iterable of hashable, non-empty itemsets
        Each itemset is iterated to obtain the column labels it contains.
        Repeated itemsets are counted once, not once per occurrence.
    matrix : pandas.DataFrame
        One row per transaction, one column per item; values are expected to
        be 0/1 so that ``&`` acts as a logical AND.
    minSup : number, default 0
        Absolute support threshold; itemsets with a smaller count are dropped.

    Returns
    -------
    dict
        Maps each retained itemset to the number of rows containing it.
    """
    supports_dict = {iset: 0 for iset in itemsets}
    # iterrows() fetches each row exactly once (the row does not depend on the
    # itemset being tested) and walks rows by position, so repeated index
    # labels still yield one row at a time.
    for _, row in matrix.iterrows():
        # Loop over the dict keys rather than the raw input so an itemset that
        # appears twice in ``itemsets`` is not incremented twice per row.
        for iset in supports_dict:
            if reduce((lambda x, y: x & y), [row[item] for item in iset]) == 1:
                supports_dict[iset] += 1
    return {iset: count for iset, count in supports_dict.items() if count >= minSup}

In [13]:
def powerset_one_less(iterable):
    """Return every subset obtained by leaving out exactly one element.

    Parameters
    ----------
    iterable : iterable of hashable elements
        Any iterable, including generators and iterators without ``len()``.

    Returns
    -------
    list of frozenset
        One frozenset per element of the input, in iteration order, each
        missing the element at that position. Empty input gives ``[]``.
    """
    s = list(iterable)
    # Size the loop from the materialised list, not the argument, so inputs
    # that do not support len() (generators, iterators) work as well.
    return [frozenset(s[:i] + s[i + 1:]) for i in range(len(s))]

In [14]:
def candidate_gen(set_list):
    """Build candidate itemsets one element larger than those in ``set_list``.

    Two itemsets are joined when their union has exactly one element more
    than the first of them. A union is kept only if each of its
    one-element-smaller subsets is either one of the two joined sets or a
    member of ``set_list``.

    Parameters
    ----------
    set_list : re-iterable collection of frozensets
        Must support ``in``; ``apriori`` passes a dict keyed by itemset.

    Returns
    -------
    list of frozenset
        Distinct candidates in the order they were first produced.
    """
    candidates = []
    for first in set_list:
        for second in set_list:
            merged = first.union(second)
            if len(merged) != len(first) + 1 or merged in candidates:
                continue
            # Prune when some sub-itemset other than the two parents is absent.
            has_missing_subset = any(
                sub != first and sub != second and sub not in set_list
                for sub in powerset_one_less(merged)
            )
            if not has_missing_subset:
                candidates.append(merged)
    return candidates

In [15]:
def apriori(matrix, total_itemset, minRelativeSup):
    """Level-wise frequent-itemset search over ``matrix``.

    Parameters
    ----------
    matrix : table with a ``shape`` attribute, indexable by item label
        Passed straight to ``supports_vectorized``; ``shape[0]`` is used as
        the number of transactions.
    total_itemset : sized iterable of item labels
        Every item considered; its length bounds the largest itemset size.
    minRelativeSup : number
        Minimum support as a fraction of the row count.

    Returns
    -------
    (dict, list)
        ``dict`` maps an itemset size to the ``{itemset: support}`` dict
        returned by ``supports_vectorized`` for that size (the last size
        tried may map to an empty dict). ``list`` holds the "skyline"
        itemsets: frequent itemsets that were never seen as a
        one-element-smaller subset of a larger frequent itemset.
    """
    frequent_by_size = {}
    skyline_flags = {}
    min_count = minRelativeSup * matrix.shape[0]

    singletons = [frozenset([item]) for item in total_itemset]
    frequent_by_size[1] = supports_vectorized(singletons, matrix, min_count)
    skyline_flags.update((single, 1) for single in frequent_by_size[1])

    size = 2
    while size <= len(total_itemset) and len(frequent_by_size[size - 1]) > 0:
        candidates = candidate_gen(frequent_by_size[size - 1])
        frequent_by_size[size] = supports_vectorized(candidates, matrix, min_count)
        for frequent_set in frequent_by_size[size]:
            skyline_flags[frequent_set] = 1
            # Its immediate subsets are now covered by a larger frequent set.
            for smaller in powerset_one_less(frequent_set):
                skyline_flags[smaller] = 0
        size += 1

    skyline_list = [itemset for itemset, flag in skyline_flags.items() if flag == 1]
    return frequent_by_size, skyline_list

In [16]:
def association_rules(supports_dict, skyline_list, minConfidence=0, verbose=False):
    """Derive single-consequent association rules from skyline itemsets.

    For every skyline itemset with at least two items, each item in turn is
    used as the right-hand side and the remaining items as the left-hand
    side. Confidence is ``support(itemset) / support(left-hand side)``.

    Parameters
    ----------
    supports_dict : dict
        Maps itemset size to a ``{frozenset: support}`` dict.
    skyline_list : iterable of frozenset
        Itemsets to generate rules from.
    minConfidence : number, default 0
        Only rules with confidence strictly greater than this are kept.
    verbose : bool, default False
        Accepted but not used by this function.

    Returns
    -------
    dict
        Maps ``(left_hand_frozenset, right_hand_item)`` to the confidence.
    """
    rules = {}
    for itemset in skyline_list:
        if len(itemset) < 2:
            continue
        for consequent in itemset:
            antecedent = itemset.difference(frozenset([consequent]))
            joint_support = supports_dict[len(itemset)][itemset]
            antecedent_support = supports_dict[len(antecedent)][antecedent]
            ratio = joint_support / antecedent_support
            if ratio > minConfidence:
                rules[(antecedent, consequent)] = ratio
    return rules

In [17]:
def goods_lookup(item_id, goods):
    """Return a "<flavor> <food>" display name for one row of ``goods``.

    The row is selected with ``goods.loc[item_id]``; the first and last
    character of its "Flavor" and "Food" values are stripped (presumably
    surrounding quote characters - verify against the goods file).
    """
    record = goods.loc[item_id]
    flavor, food = (record[column][1:-1] for column in ("Flavor", "Food"))
    return flavor + " " + food

In [62]:
def mine_bakery_rules(filename, minRelativeSup, minConf, goods_filename, output_file=''):
    """Mine skyline frequent itemsets and association rules from a bakery CSV.

    Parameters
    ----------
    filename : str
        Path of the bakery transactions CSV. Its columns are named after the
        index of the goods table.
    minRelativeSup : number
        Minimum support as a fraction of the number of rows read.
    minConf : number
        Confidence threshold passed to ``association_rules``.
    goods_filename : str
        Path of the goods CSV; must provide "Flavor" and "Food" columns.
    output_file : str, default ''
        When non-empty, a human-readable report is written to this path.

    Returns
    -------
    str
        A summary ``"<n> Skyline, <m> A-Rules"``. The formatted itemsets and
        rules are only written to ``output_file``; they are not returned.
    """
    # reading in goods and bakery datasets
    goods = pd.read_csv(goods_filename)
    bakery = pd.read_csv(filename, names=goods.index)

    # apriori and association rules
    supports_dict, skyline_list = apriori(bakery, list(goods.index), minRelativeSup)
    association_rules_dict = association_rules(supports_dict, skyline_list, minConf)

    if output_file != '':
        n_rows = bakery.shape[0]

        # formatting skyline frequent item sets
        skylines_formatted = []
        for entry in skyline_list:
            skyline_set = [goods_lookup(i, goods) for i in entry]
            support = supports_dict[len(entry)][entry] / n_rows
            skylines_formatted.append((skyline_set, support))

        # formatting association rules
        arules_formatted = []
        for (left_set, right_item), confidence in association_rules_dict.items():
            left_side = [goods_lookup(i, goods) for i in left_set]
            right_side = goods_lookup(right_item, goods)
            full_itemset = left_set.union([right_item])
            support = supports_dict[len(full_itemset)][full_itemset] / n_rows
            arules_formatted.append((left_side, right_side, support, confidence))

        # `with` guarantees the report file is closed even if a write fails.
        with open(output_file, "w") as f:
            f.write("Skyline Frequent Item Sets: \n")
            for skyline_set, support in skylines_formatted:
                f.write("{}, support={}\n".format(skyline_set, support))
            f.write("\nAssociation Rules:\n")
            # Number rules from 1 and print each rule's own support and
            # confidence (not the whole tuple or a value left over from the
            # formatting loop above).
            for rule_no, (left_side, right_side, support, confidence) in enumerate(arules_formatted, start=1):
                f.write("Rule {}, {} -> {} [support={}, confidence={}]\n"
                        .format(rule_no, left_side, right_side, support, confidence))
    return "{} Skyline, {} A-Rules"\
                .format(len(skyline_list), len(association_rules_dict.keys()))

In [63]:
# Write a skyline/rule report for every bakery dataset size, using a minimum
# relative support of 0.02 and a confidence threshold of 0.9. The summary
# string returned by each call is discarded.
for size in ("5000", "20000", "75000"):
    mine_bakery_rules(
        size + "-out2.csv",
        .02,
        .9,
        "goods.csv",
        size + "-output.txt",
    )

In [30]:
# Mine the 1000-receipt bakery data (min relative support 0.1, confidence
# threshold 0) and print the skyline itemsets followed by the rules.
# NOTE(review): the mine_bakery_rules definition visible in this notebook
# returns a single summary string, which does not match this two-name
# unpacking; the printed output presumably came from an earlier version that
# returned (skylines, rules) - confirm before re-running on a fresh kernel.
skylines_final_1000, rules_final_1000 = mine_bakery_rules("out2.csv", .1, 0, "goods.csv")
print("Skylines:" + "\n")
for item in skylines_final_1000:
    print(item)
print()
print("Association Rules:" + "\n")
for item in rules_final_1000:
    print(item)

Skylines:

(['Lemon Cake', 'Single Espresso'], 0.127)
(['Napoleon Cake', 'Gongolais Cookie'], 0.181)
(['Apple Danish', 'Blackberry Tart'], 0.139)
(['Blueberry Tart', 'Apple Tart', 'Berry Tart'], 0.257)

Association Rules:

(['Single Espresso'], 'Lemon Cake', 0.127, 0.7839506172839507)
(['Lemon Cake'], 'Single Espresso', 0.127, 0.8141025641025641)
(['Gongolais Cookie'], 'Napoleon Cake', 0.181, 0.8418604651162791)
(['Napoleon Cake'], 'Gongolais Cookie', 0.181, 0.8044444444444444)
(['Blackberry Tart'], 'Apple Danish', 0.139, 0.7513513513513513)
(['Apple Danish'], 'Blackberry Tart', 0.139, 0.7988505747126436)
(['Apple Tart', 'Berry Tart'], 'Blueberry Tart', 0.257, 0.9589552238805971)
(['Blueberry Tart', 'Berry Tart'], 'Apple Tart', 0.257, 0.9961240310077519)
(['Blueberry Tart', 'Apple Tart'], 'Berry Tart', 0.257, 0.9922779922779923)


In [20]:
# Mine the 5000-receipt bakery data (min relative support 0.01, confidence
# threshold 0.9) and print the skyline itemsets followed by the rules.
# NOTE(review): the mine_bakery_rules definition visible in this notebook
# returns a single summary string, which does not match this two-name
# unpacking; the printed output presumably came from an earlier version that
# returned (skylines, rules) - confirm before re-running on a fresh kernel.
skylines_final_5000, rules_final_5000 = mine_bakery_rules("5000-out2.csv", .01, .9, "goods.csv")
print("Skylines:" + "\n")
for item in skylines_final_5000:
    print(item)
print()
print("Association Rules:" + "\n")
for item in rules_final_5000:
    print(item)

Skylines:

(['Chocolate Eclair'], 0.0382)
(['Vanilla Eclair'], 0.046)
(['Almond Tart'], 0.0386)
(['Apricot Tart'], 0.0422)
(['Pecan Tart'], 0.0444)
(['Ganache Cookie'], 0.0388)
(['Chocolate Meringue'], 0.0452)
(['Vanilla Meringue'], 0.0398)
(['Almond Croissant'], 0.0456)
(['Chocolate Croissant'], 0.0432)
(['Almond Bear Claw'], 0.0428)
(['Blueberry Danish'], 0.04)
(['Lemon Cake', 'Lemon Tart'], 0.0336)
(['Napoleon Cake', 'Strawberry Cake'], 0.0422)
(['Truffle Cake', 'Gongolais Cookie'], 0.0472)
(['Bottled Water', 'Berry Tart'], 0.0366)
(['Marzipan Cookie', 'Tuile Cookie'], 0.0496)
(['Cheese Croissant', 'Orange Juice'], 0.043)
(['Chocolate Cake', 'Casino Cake', 'Chocolate Coffee'], 0.0312)
(['Apricot Danish', 'Cherry Tart', 'Opera Cake'], 0.0408)
(['Single Espresso', 'Blackberry Tart', 'Coffee Eclair'], 0.0286)
(['Blueberry Tart', 'Apricot Croissant', 'Hot Coffee'], 0.0328)
(['Chocolate Tart', 'Walnut Cookie', 'Vanilla Frappuccino'], 0.0266)
(['Almond Twist', 'Coffee Eclair', 'Apple Pie'

In [21]:
# Mine the 20000-receipt bakery data (min relative support 0.01, confidence
# threshold 0.5) and print the skyline itemsets followed by the rules.
# NOTE(review): the mine_bakery_rules definition visible in this notebook
# returns a single summary string, which does not match this two-name
# unpacking; the printed output presumably came from an earlier version that
# returned (skylines, rules) - confirm before re-running on a fresh kernel.
skylines_final_20000, rules_final_20000 = mine_bakery_rules("20000-out2.csv", .01, .5, "goods.csv")
print("Skylines:" + "\n")
for item in skylines_final_20000:
    print(item)
print()
print("Association Rules:" + "\n")
for item in rules_final_20000:
    print(item)

Skylines:

(['Chocolate Eclair'], 0.0426)
(['Vanilla Eclair'], 0.0427)
(['Almond Tart'], 0.04055)
(['Apricot Tart'], 0.04275)
(['Pecan Tart'], 0.04155)
(['Ganache Cookie'], 0.0433)
(['Chocolate Meringue'], 0.0445)
(['Vanilla Meringue'], 0.0424)
(['Almond Croissant'], 0.04205)
(['Chocolate Croissant'], 0.0446)
(['Almond Bear Claw'], 0.04425)
(['Blueberry Danish'], 0.04115)
(['Lemon Cake', 'Lemon Tart'], 0.037)
(['Napoleon Cake', 'Strawberry Cake'], 0.04455)
(['Truffle Cake', 'Gongolais Cookie'], 0.04335)
(['Bottled Water', 'Berry Tart'], 0.0357)
(['Marzipan Cookie', 'Tuile Cookie'], 0.04855)
(['Cheese Croissant', 'Orange Juice'], 0.0439)
(['Chocolate Cake', 'Casino Cake', 'Chocolate Coffee'], 0.0339)
(['Apricot Danish', 'Cherry Tart', 'Opera Cake'], 0.041)
(['Single Espresso', 'Blackberry Tart', 'Coffee Eclair'], 0.02695)
(['Blueberry Tart', 'Apricot Croissant', 'Hot Coffee'], 0.0326)
(['Chocolate Tart', 'Walnut Cookie', 'Vanilla Frappuccino'], 0.02825)
(['Almond Twist', 'Coffee Eclair'

In [22]:
# Mine the 75000-receipt bakery data (min relative support 0.01, confidence
# threshold 0.5) and print the skyline itemsets followed by the rules.
# NOTE(review): the mine_bakery_rules definition visible in this notebook
# returns a single summary string, which does not match this two-name
# unpacking; the printed output presumably came from an earlier version that
# returned (skylines, rules) - confirm before re-running on a fresh kernel.
skylines_final_75000, rules_final_75000 = mine_bakery_rules("75000-out2.csv", .01, .5, "goods.csv")
print("Skylines:" + "\n")
for item in skylines_final_75000:
    print(item)
print()
print("Association Rules:" + "\n")
for item in rules_final_75000:
    print(item)

Skylines:

(['Chocolate Eclair'], 0.04237333333333333)
(['Vanilla Eclair'], 0.04252)
(['Almond Tart'], 0.04204)
(['Apricot Tart'], 0.04236)
(['Pecan Tart'], 0.04337333333333333)
(['Ganache Cookie'], 0.04324)
(['Chocolate Meringue'], 0.041933333333333336)
(['Vanilla Meringue'], 0.04238666666666667)
(['Almond Croissant'], 0.04273333333333333)
(['Chocolate Croissant'], 0.04324)
(['Almond Bear Claw'], 0.04244)
(['Blueberry Danish'], 0.04409333333333333)
(['Lemon Cake', 'Lemon Tart'], 0.036853333333333335)
(['Napoleon Cake', 'Strawberry Cake'], 0.043146666666666667)
(['Truffle Cake', 'Gongolais Cookie'], 0.04392)
(['Bottled Water', 'Berry Tart'], 0.0378)
(['Marzipan Cookie', 'Tuile Cookie'], 0.05092)
(['Cheese Croissant', 'Orange Juice'], 0.04306666666666667)
(['Chocolate Cake', 'Casino Cake', 'Chocolate Coffee'], 0.03338666666666667)
(['Apricot Danish', 'Cherry Tart', 'Opera Cake'], 0.041106666666666666)
(['Single Espresso', 'Blackberry Tart', 'Coffee Eclair'], 0.0272)
(['Blueberry Tart', 

In [33]:
def mine_reader_rules(filename, minRelativeSup, minConf, author_filename, verbose=False):
    """Mine skyline frequent author sets and association rules from reader baskets.

    Parameters
    ----------
    filename : str
        Comma-separated file; each line is a reader id followed by the
        integer author ids that reader has read.
    minRelativeSup : number
        Minimum support as a fraction of the number of readers.
    minConf : number
        Confidence threshold passed to ``association_rules``.
    author_filename : str
        Pipe-separated file; each line starts with an integer author id
        followed by the author name.
    verbose : bool, default False
        Accepted but not used by this function.

    Returns
    -------
    (list, list)
        ``skylines_formatted``: ``(author_names, relative_support)`` tuples.
        ``arules_formatted``: ``(left_names, right_name, relative_support,
        confidence)`` tuples.
    """
    # reading in reader dataset as dictionary
    read_dict = {}
    with open(filename, "r") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            tokens = [tok.strip() for tok in line.split(",")]
            read_dict[int(tokens[0])] = [int(tok) for tok in tokens[1:]]

    # reading in author dataset as dictionary
    author_dict = {}
    with open(author_filename, "r") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            tokens = [tok.strip() for tok in line.split("|")]
            author_dict[int(tokens[0])] = tokens[1]

    # converting to binary vector format
    # NOTE(review): position idx-1 is paired with the idx-th key of
    # author_dict, so this assumes author ids run 1..N in file order - confirm.
    read_df_dict = {}
    for reader, author_ids in read_dict.items():
        binary_vector = np.zeros(len(author_dict))
        for idx in author_ids:
            binary_vector[idx-1] = 1
        read_df_dict[reader] = binary_vector
    reader_df = pd.DataFrame(read_df_dict, index=list(author_dict.keys()), dtype="int").transpose()
    # drop the NA column (author id 1); the keyword form is required because
    # newer pandas no longer accepts the axis as a positional argument
    reader_df = reader_df.drop(columns=1)

    # apriori and association rules
    supports_dict, skyline_list = apriori(reader_df, list(reader_df.columns), minRelativeSup)
    association_rules_dict = association_rules(supports_dict, skyline_list, minConf)

    n_readers = reader_df.shape[0]

    # formatting skyline frequent item sets
    skylines_formatted = []
    for entry in skyline_list:
        skyline_set = [author_dict[i] for i in entry]
        support = supports_dict[len(entry)][entry] / n_readers
        skylines_formatted.append((skyline_set, support))

    # formatting association rules
    arules_formatted = []
    for (left_set, right_item), confidence in association_rules_dict.items():
        left_side = [author_dict[i] for i in left_set]
        right_side = author_dict[right_item]
        total_itemset = left_set.union([right_item])
        support = supports_dict[len(total_itemset)][total_itemset] / n_readers
        arules_formatted.append((left_side, right_side, support, confidence))
    return skylines_formatted, arules_formatted

In [39]:
# Mine the reader baskets (min relative support 0.075, confidence threshold
# 0.5) and print the skyline author sets followed by the association rules.
skylines_final, rules_final = mine_reader_rules("bingoBaskets.csv", .075, .5, "authorlist.psv")
print("Skylines:" + "\n")
for item in skylines_final:
    print(item)
print()
print("Association Rules:" + "\n")
for item in rules_final:
    print(item)

Skylines:

(['Aaronovitch, Ben'], 0.10699588477366255)
(['Abraham, Daniel / Hanover, M. L. N. / Corey, James S. A.'], 0.07818930041152264)
(['Anders, Charlie Jane'], 0.11522633744855967)
(['Atwood, Margaret'], 0.09876543209876543)
(['Bardugo, Leigh'], 0.102880658436214)
(['Bear, Elizabeth'], 0.11934156378600823)
(['Beaulieu, Bradley P.'], 0.10699588477366255)
(['Bennett, Robert Jackson'], 0.12757201646090535)
(['Brown, Pierce'], 0.102880658436214)
(['Bujold, Lois McMaster'], 0.09053497942386832)
(['Butler, Octavia E.'], 0.09053497942386832)
(['Carriger, Gail'], 0.08641975308641975)
(['Drake, Darrell'], 0.11934156378600823)
(['Elliott, Kate / Rasmussen, Alis A.'], 0.08641975308641975)
(['Erikson, Steven'], 0.08641975308641975)
(['Huff, Tanya'], 0.0823045267489712)
(['Jones, Diana Wynne'], 0.11522633744855967)
(['Kowal, Mary Robinette'], 0.1111111111111111)
(['Lee, Yoon Ha'], 0.09876543209876543)
(['Liu, Ken'], 0.102880658436214)
(['Liu, Marjorie'], 0.08641975308641975)
(['Malerman, Josh