In [361]:
#Read the data into the program as a list of records
def ReadIndex(filename):
    file = open(filename,'r')
    record_index = []
    doc = file.readlines()
    for line in doc:
        line = line.strip('\n ')
        line_lst = line.split(' ')
        record_index.append(line_lst)
    return record_index

In [362]:
#Create a frequency index for each of the items
def getOccIndex(record_index):
    freq_index = {}
    for record_num in range(len(record_index)):
        for item in record_index[record_num]:
            if item not in freq_index:
                freq_index[item] = []
                freq_index[item].append(record_num)
            else:
                freq_index[item].append(record_num)
    return freq_index

In [363]:
#Creates an empty list to fill in with frequent itemsets
def getFreqSets(num_terms):
    freq_itemsets = []
    for i in range(0,num_terms+1):
        temp = []
        freq_itemsets.append(temp)
    return freq_itemsets

In [364]:
#Calculates the support of every itemset
def GetSup(itemset,occ_index,d_sup_vals):
    if tuple(itemset) in d_sup_vals:
        return d_sup_vals[tuple(itemset)]
    inv_lists = []
    for term in itemset:
        inv_lists.append(occ_index[term])
    else:
        sup = len(Intersect(inv_lists))
        d_sup_vals[tuple(itemset)] = sup
    return sup

In [365]:
#Returns the intersect of the terms of inv_lists
def Intersect(inv_lists):
    sect_terms = inv_lists[0]
    for term in inv_lists:
        sect_terms = set(sect_terms).intersection(term)
    return sect_terms

In [366]:
#Determine if two sets need to be joined
def NeedJoin(set1,set2):
    for i in range(0,len(set1)-1):
        if set1[i] != set2[i]:
            return False
    return True

In [367]:
#Joins two sets
def Join(set1,set2):
    set3 = list(set1)
    set3.append(set2[-1])
    return set3

In [368]:
def getCodeMap(code_filename):
    code_map = {}
    file = open(code_filename,'r')
    codes = file.readlines()
    for code in codes:
        code = code.split(" ")
        code_map[code[1].strip("\n")] = code[0]
    return code_map

In [369]:
def parseCodeMap(code_map):
    item_codes = []
    item_names = []
    for name in code_map:
        item_codes.append(code_map[name])
        item_names.append(name)
    return item_codes, item_names

In [375]:
def getCleanIndex(unclean_index,item_codes,item_names,code_map):
    clean_index = []
    for record_num in range(len(unclean_index)):
        #print(record)
        #Split on ';' if needed
        if len(unclean_index[record_num]) == 1:
            if ';' in unclean_index[record_num][0]:
                unclean_index[record_num] = unclean_index[record_num][0].strip("\n").split(';')[0:-1]
                
        #Replace item names with item codes
        for item_num in range(len(unclean_index[record_num])):
            if unclean_index[record_num][item_num] in code_map:
                unclean_index[record_num][item_num] = code_map[unclean_index[record_num][item_num]]
                
        #Delete records with invalid item codes
        num_invalid = 0
        for item_num in range(len(unclean_index[record_num])):
            if unclean_index[record_num][item_num] not in item_codes:
                num_invalid += 1
        if num_invalid == 0:
            clean_index.append(unclean_index[record_num])
        
    return clean_index    

In [376]:
code_map = getCodeMap("codeprodmap.txt")
print(code_map)

{'Avocados': 'P01', 'BakingPowder': 'P02', 'BakingSoda': 'P03', 'BrownieMix': 'P04', 'Catfish': 'P05', 'Crackers': 'P06', 'Fruit': 'P07', 'Glue': 'P08', 'GranolaBars': 'P09', 'HairGel': 'P10', 'IceCream': 'P11', 'InsectRepellent': 'P12', 'JuiceConcentrate': 'P13', 'Ketchup': 'P14', 'LipBalm': 'P15', 'MoisturizingLotion': 'P16', 'Non-stickSpray': 'P17', 'Notepad': 'P18', 'PeanutButter': 'P19', 'Pork': 'P20', 'Relish': 'P21', 'Shortening': 'P22', 'Shrimp': 'P23', 'SlicedBread': 'P24', 'SodaPop': 'P25', 'SteakSauce': 'P26'}


In [377]:
#Create list of codes and items
item_codes, item_names = parseCodeMap(code_map)
print(item_codes)
print(item_names)

['P01', 'P02', 'P03', 'P04', 'P05', 'P06', 'P07', 'P08', 'P09', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15', 'P16', 'P17', 'P18', 'P19', 'P20', 'P21', 'P22', 'P23', 'P24', 'P25', 'P26']
['Avocados', 'BakingPowder', 'BakingSoda', 'BrownieMix', 'Catfish', 'Crackers', 'Fruit', 'Glue', 'GranolaBars', 'HairGel', 'IceCream', 'InsectRepellent', 'JuiceConcentrate', 'Ketchup', 'LipBalm', 'MoisturizingLotion', 'Non-stickSpray', 'Notepad', 'PeanutButter', 'Pork', 'Relish', 'Shortening', 'Shrimp', 'SlicedBread', 'SodaPop', 'SteakSauce']


In [378]:
unclean_index = ReadIndex('trans_unclean.txt')

[['AvocadosBakingPowder;Brownie;Catfish;Crackers;Fruit;HairGel;JuiceConcentrate;Non-stickSpray;Notepad;Pork;SodaPop;'], ['Avocados;BakingPowder;BrownieMix;Catfish;Crackers;Fruit;HairGel;JuiceConcentrate;Non-stickSpray;Notepad;Pork;SodaPop;'], ['Avocados;BakingPowder;BrownieMix;GranolaBars;HairGel;JuiceConcentrate;Ketchup;MoisturizingLotion;Non-stickSpray;Notepad;PeanutButter;Pork;'], ['Avocados;BakingPowder;IceCream;Ketchup;LipBalm;MoisturizingLotion;Notepad;Relish;Shortening;Shrimp;SlicedBread;'], ['Avocados;BakingStuff;BrownieMix;GranolaBars;HairGel;JuiceConcentrate;Ketchup;MoisturizingLotion;Non-stickSpray;Notepad;PeanutButter;Park;'], ['Avocados;Fruit;GranolaBars;JuiceConcentrate;LipBalm;Relish;Shortening;'], ['Avocados;FruitMango;GranolaTars;JuiceConcentrate;LipBalm;Relish;Shortening;'], ['Avocados;PI7;IceCream;Ketchup;LipBalm;MoisturizingLotion;Notepad;Relish;Shortening;Shrimp;SlicedBread;'], ['BakingPowder;BakingSoda;Glue;Ketchup;LipBalm;MoisturizingLotion;PeanutButter;Relish;Sh

In [379]:
clean_index = getCleanIndex(unclean_index,item_codes,item_names,code_map)
print(clean_index)

[['P01', 'P02', 'P04', 'P05', 'P06', 'P07', 'P10', 'P13', 'P17', 'P18', 'P20', 'P25'], ['P01', 'P02', 'P04', 'P09', 'P10', 'P13', 'P14', 'P16', 'P17', 'P18', 'P19', 'P20'], ['P01', 'P02', 'P11', 'P14', 'P15', 'P16', 'P18', 'P21', 'P22', 'P23', 'P24'], ['P01', 'P07', 'P09', 'P13', 'P15', 'P21', 'P22'], ['P02', 'P03', 'P08', 'P14', 'P15', 'P16', 'P19', 'P21', 'P23', 'P26'], ['P02', 'P04', 'P07', 'P09', 'P10', 'P12', 'P13', 'P17', 'P20', 'P21', 'P24', 'P25'], ['P02', 'P18'], ['P02', 'P23'], ['P03', 'P05', 'P08', 'P10', 'P11', 'P14', 'P26'], ['P03', 'P10', 'P12', 'P14', 'P17', 'P25', 'P26'], ['P04', 'P05', 'P07', 'P08', 'P09', 'P12', 'P18', 'P19', 'P22'], ['P07', 'P16', 'P17', 'P23'], ['P09', 'P12', 'P19'], ['P10', 'P12', 'P21', 'P22', 'P24', 'P25'], ['P17', 'P19', 'P24'], ['P01', 'P02', 'P07', 'P08', 'P19', 'P21', 'P22', 'P23', 'P26'], ['P01', 'P03', 'P04', 'P05', 'P10', 'P15', 'P16', 'P17', 'P18', 'P19', 'P20', 'P24'], ['P01', 'P03', 'P06', 'P08', 'P09', 'P13', 'P18', 'P20', 'P23', 'P24'

In [380]:
print(len(clean_index))

30


In [382]:
record_index = ReadIndex('trans_clean.txt')

#Get the occurrence based index
occ_index = getOccIndex(record_index)

#Get the total number of records and number of unique terms
num_records = len(record_index)
num_unique_items = len(occ_index)

#get Minimunm support value
min_sup = 2

#Create a dictionary of itemsets and their support values
d_sup_vals = {}

print(occ_index)

{'P01': [0, 1, 2, 3, 4, 5, 6], 'P02': [0, 1, 2, 3, 7, 8, 9, 10, 11, 12, 13], 'P04': [0, 1, 4, 8, 14, 20, 21], 'P05': [0, 4, 15, 16, 20, 22], 'P06': [0, 5, 9, 14, 23, 24], 'P07': [0, 2, 6, 8, 10, 14, 20, 21, 25], 'P10': [0, 1, 4, 8, 15, 17, 18, 26, 28], 'P13': [0, 1, 5, 6, 8, 16, 24], 'P17': [0, 1, 4, 8, 9, 11, 14, 18, 19, 22, 25, 29], 'P18': [0, 1, 3, 4, 5, 9, 12, 16, 17, 20, 21], 'P20': [0, 1, 4, 5, 8, 9, 10, 14, 17], 'P25': [0, 5, 8, 9, 18, 28], 'P09': [1, 5, 6, 8, 20, 22, 23, 26, 27], 'P14': [1, 3, 7, 15, 16, 17, 18, 19], 'P16': [1, 3, 4, 7, 10, 16, 25], 'P19': [1, 2, 4, 7, 16, 20, 21, 23, 27, 29], 'P08': [2, 5, 7, 14, 15, 16, 17, 20], 'P21': [2, 3, 6, 7, 8, 9, 14, 16, 23, 24, 28], 'P22': [2, 3, 6, 11, 16, 17, 20, 28], 'P23': [2, 3, 5, 7, 13, 14, 17, 19, 22, 25], 'P26': [2, 5, 7, 9, 15, 16, 18, 21, 22, 26], 'P11': [3, 15, 17, 19, 21, 22, 23], 'P15': [3, 4, 6, 7], 'P24': [3, 4, 5, 8, 11, 16, 28, 29], 'P03': [4, 5, 7, 14, 15, 16, 17, 18, 19], 'P12': [8, 16, 18, 20, 21, 22, 27, 28]}


In [229]:
#Initialise empty set of frequent itemsets
freq_itemsets = getFreqSets(num_unique_items)

In [230]:
#Find all the frequent 1 itemsets
for item in occ_index:
    #print(item)
    itemset = [item]
    item_sup = GetSup(itemset,occ_index,d_sup_vals)
    #print(item_sup)
    if item_sup > min_sup:
        freq_itemsets[0].append(itemset)
        
#print(freq_itemsets)

In [231]:
print(freq_itemsets)

[[['P01'], ['P02'], ['P04'], ['P05'], ['P06'], ['P07'], ['P10'], ['P13'], ['P17'], ['P18'], ['P20'], ['P25'], ['P09'], ['P14'], ['P16'], ['P19'], ['P08'], ['P21'], ['P22'], ['P23'], ['P26'], ['P11'], ['P15'], ['P24'], ['P03'], ['P12']], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]


In [232]:
#Find frequent 2 to K-itemsets
for k in range(1,num_unique_items+1):
    prev_itemsets = freq_itemsets[k-1]
    n = len(prev_itemsets)
    for i in range(1,n-1):
        for j in range(i+1,n):
            set1 = prev_itemsets[i]
            set2 = prev_itemsets[j]
            if NeedJoin(set1,set2):
                set3 = Join(set1,set2)
                if GetSup(set3,occ_index,d_sup_vals)>min_sup:
                    freq_itemsets[k].append(set3)
    if len(freq_itemsets) == 1:
        break

In [233]:
print(freq_itemsets)

[[['P01'], ['P02'], ['P04'], ['P05'], ['P06'], ['P07'], ['P10'], ['P13'], ['P17'], ['P18'], ['P20'], ['P25'], ['P09'], ['P14'], ['P16'], ['P19'], ['P08'], ['P21'], ['P22'], ['P23'], ['P26'], ['P11'], ['P15'], ['P24'], ['P03'], ['P12']], [['P02', 'P04'], ['P02', 'P07'], ['P02', 'P10'], ['P02', 'P13'], ['P02', 'P17'], ['P02', 'P18'], ['P02', 'P20'], ['P02', 'P25'], ['P02', 'P14'], ['P02', 'P16'], ['P02', 'P19'], ['P02', 'P21'], ['P02', 'P22'], ['P02', 'P23'], ['P02', 'P26'], ['P02', 'P24'], ['P04', 'P05'], ['P04', 'P07'], ['P04', 'P10'], ['P04', 'P13'], ['P04', 'P17'], ['P04', 'P18'], ['P04', 'P20'], ['P04', 'P09'], ['P04', 'P19'], ['P04', 'P12'], ['P05', 'P10'], ['P05', 'P17'], ['P05', 'P18'], ['P05', 'P19'], ['P05', 'P08'], ['P05', 'P26'], ['P05', 'P03'], ['P05', 'P12'], ['P06', 'P13'], ['P06', 'P17'], ['P06', 'P18'], ['P06', 'P20'], ['P06', 'P25'], ['P06', 'P21'], ['P07', 'P13'], ['P07', 'P17'], ['P07', 'P18'], ['P07', 'P20'], ['P07', 'P09'], ['P07', 'P19'], ['P07', 'P08'], ['P07', 'P

In [239]:
print(d_sup_vals)

{('P01',): 7, ('P02',): 11, ('P04',): 7, ('P05',): 6, ('P06',): 6, ('P07',): 9, ('P10',): 9, ('P13',): 7, ('P17',): 12, ('P18',): 11, ('P20',): 9, ('P25',): 6, ('P09',): 9, ('P14',): 8, ('P16',): 7, ('P19',): 10, ('P08',): 8, ('P21',): 11, ('P22',): 8, ('P23',): 10, ('P26',): 10, ('P11',): 7, ('P15',): 4, ('P24',): 8, ('P03',): 9, ('P12',): 8, ('P02', 'P04'): 3, ('P02', 'P05'): 1, ('P02', 'P06'): 2, ('P02', 'P07'): 4, ('P02', 'P10'): 3, ('P02', 'P13'): 3, ('P02', 'P17'): 5, ('P02', 'P18'): 5, ('P02', 'P20'): 5, ('P02', 'P25'): 3, ('P02', 'P09'): 2, ('P02', 'P14'): 3, ('P02', 'P16'): 4, ('P02', 'P19'): 3, ('P02', 'P08'): 2, ('P02', 'P21'): 5, ('P02', 'P22'): 3, ('P02', 'P23'): 4, ('P02', 'P26'): 3, ('P02', 'P11'): 1, ('P02', 'P15'): 2, ('P02', 'P24'): 3, ('P02', 'P03'): 1, ('P02', 'P12'): 1, ('P04', 'P05'): 3, ('P04', 'P06'): 2, ('P04', 'P07'): 5, ('P04', 'P10'): 4, ('P04', 'P13'): 3, ('P04', 'P17'): 5, ('P04', 'P18'): 5, ('P04', 'P20'): 5, ('P04', 'P25'): 2, ('P04', 'P09'): 3, ('P04', 

In [240]:
#Put the results into the output file
for k in range(len(freq_itemsets)):
    for itemset in freq_itemsets[k]:
        out_set = ""
        for item in itemset:
            out_set += item
            out_set += " "
        out_set += str(d_sup_vals[tuple(itemset)])
        print(out_set)     

P01 7
P02 11
P04 7
P05 6
P06 6
P07 9
P10 9
P13 7
P17 12
P18 11
P20 9
P25 6
P09 9
P14 8
P16 7
P19 10
P08 8
P21 11
P22 8
P23 10
P26 10
P11 7
P15 4
P24 8
P03 9
P12 8
P02 P04 3
P02 P07 4
P02 P10 3
P02 P13 3
P02 P17 5
P02 P18 5
P02 P20 5
P02 P25 3
P02 P14 3
P02 P16 4
P02 P19 3
P02 P21 5
P02 P22 3
P02 P23 4
P02 P26 3
P02 P24 3
P04 P05 3
P04 P07 5
P04 P10 4
P04 P13 3
P04 P17 5
P04 P18 5
P04 P20 5
P04 P09 3
P04 P19 4
P04 P12 3
P05 P10 3
P05 P17 3
P05 P18 4
P05 P19 3
P05 P08 3
P05 P26 3
P05 P03 3
P05 P12 3
P06 P13 3
P06 P17 3
P06 P18 3
P06 P20 4
P06 P25 3
P06 P21 4
P07 P13 3
P07 P17 4
P07 P18 3
P07 P20 4
P07 P09 3
P07 P19 3
P07 P08 3
P07 P21 4
P07 P22 3
P07 P23 3
P07 P12 3
P10 P13 3
P10 P17 5
P10 P18 4
P10 P20 5
P10 P25 4
P10 P09 3
P10 P14 4
P10 P26 3
P10 P24 3
P10 P03 4
P10 P12 3
P13 P17 3
P13 P18 4
P13 P20 4
P13 P25 3
P13 P09 4
P13 P21 4
P13 P24 3
P17 P18 4
P17 P20 6
P17 P25 4
P17 P09 3
P17 P14 3
P17 P16 3
P17 P19 3
P17 P21 3
P17 P23 4
P17 P26 3
P17 P24 4
P17 P03 4
P17 P12 3
P18 P20 6
P18 P25