In [4]:
import numpy as np
import pandas as pd
import math
import itertools

In [2]:
# One-off preprocessing of the raw export: missing cells become 0 and the 't' marker
# becomes 1, then the cleaned table is written back to disk.
# Raw string literals keep the Windows backslashes literal; in a normal string,
# sequences such as "\_", "\A" or "\m" are invalid escapes that newer Python versions
# warn about.
# NOTE(review): absolute, machine-specific paths - consider a configurable data directory.
df = pd.read_csv(r"D:\_MASTER\AN1\Q2\IS\Assignments\Assignment3\myDataFile.csv", low_memory=False)
df.fillna(0, inplace=True)
df.replace('t', 1, inplace=True)
df.to_csv(r"D:\_MASTER\AN1\Q2\IS\Assignments\Assignment3\data.csv")

In [6]:
# Load the reduced dataset; the first CSV column is used as the row index, which is
# then labelled 'Transaction'.
# Raw string literal: "\_", "\A", "\s" ... are not valid escape sequences in a normal string.
df = pd.read_csv(r"D:\_MASTER\AN1\Q2\IS\Assignments\Assignment3\smaller_data.csv", index_col=0)
df.index.names = ['Transaction']

In [7]:
# Display the loaded table (rows are indexed by 'Transaction', columns are product names).
df

Unnamed: 0_level_0,Instant_food_products,UHT_milk,abrasive_cleaner,artif__sweetener,baby_cosmetics,baby_food,bags,baking_powder,bathroom_cleaner,beef,...,brown_bread,butter,butter_milk,cake_bar,whisky,white_bread,white_wine,whole_milk,yogurt,zwieback
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Support computations

The function below returns the support metric for a given itemset. The itemset data is passed into the function either as a pandas.DataFrame object (one column per item) or as a pandas.Series object (when we have data for a single feature, i.e. the transaction history of a single product).

In [8]:
def support(itemset_data):
    """Return the support of an itemset: occurrences divided by number of transactions.

    Parameters
    ----------
    itemset_data : pandas.DataFrame or pandas.Series
        DataFrame - one column per item of the itemset, one row per transaction.
        A transaction counts as containing the itemset when none of its cells
        equals 0; the denominator is the number of rows.
        Series - the column of a single item. Its sum is taken as the number of
        occurrences and its non-null count as the number of transactions.

    Returns
    -------
    float or int
        The support ratio. 0 is returned when there are no transactions to
        divide by, and for any argument that is neither a DataFrame nor a Series.
    """
    if isinstance(itemset_data, pd.DataFrame):
        number_of_transactions = itemset_data.shape[0]
        if number_of_transactions == 0:
            # Nothing to count; avoid dividing by zero.
            return 0
        # Row-wise mask: True where every item column of the transaction is non-zero.
        contains_itemset = (itemset_data != 0).all(axis=1)
        return int(contains_itemset.sum()) / number_of_transactions

    if isinstance(itemset_data, pd.Series):
        number_of_transactions = itemset_data.count()
        if number_of_transactions == 0:
            return 0
        return itemset_data.sum() / number_of_transactions

    return 0

However, the Apriori algorithm only checks whether the support of a given itemset reaches a certain threshold; it does not require the exact value of the metric. Hence, the following function stops iterating over the transactions as soon as the minimum number of occurrences is attained. This reduces the computational load while still meeting the requirements.

In [9]:
def exceedsMinSup(itemset_data, minSup):
    """Tell whether an itemset occurs in at least ``ceil(transactions * minSup)`` transactions.

    Parameters
    ----------
    itemset_data : pandas.DataFrame or pandas.Series
        DataFrame - one column per item, one row per transaction; a transaction
        contains the itemset when none of its cells equals 0. Rows are scanned in
        order and the scan stops as soon as the required count is reached.
        Series - the column of a single item; its sum is compared against the
        threshold computed from its non-null count.
    minSup : float
        Minimum support expressed as a fraction of the transactions.

    Returns
    -------
    bool
        True when the occurrence count reaches the threshold, False otherwise
        (also False for an argument that is neither a DataFrame nor a Series).
    """
    if isinstance(itemset_data, pd.DataFrame):
        minSupAbsoluteValue = math.ceil(itemset_data.shape[0] * minSup)
        support_counter = 0
        # Iterating the underlying array avoids building one Series per row,
        # which is what makes DataFrame.iterrows slow.
        for row in itemset_data.to_numpy():
            if (row != 0).all():
                support_counter += 1
            # Checked on every row, so a threshold of 0 is met on the first one.
            if support_counter >= minSupAbsoluteValue:
                return True
        return False

    if isinstance(itemset_data, pd.Series):
        minSupAbsoluteValue = math.ceil(itemset_data.count() * minSup)
        return bool(itemset_data.sum() >= minSupAbsoluteValue)

    return False

As we ultimately need to view the frequent itemsets on each layer, we chose to structure the data as follows: we created a dictionary with the keys representing the layers (the number of elements/products in a single itemset) and the values being lists of sets. Thus, we have added a helper function which converts a set of items (in our case, strings) to a list of sets. This is particularly useful when creating the first layer out of the column names.

In [10]:
def listOfSets(setOfItems):
    """Wrap every element of a set in its own single-element set.

    Returns a list holding one singleton set per element, in the iteration order
    of ``setOfItems``. Returns None when the argument is not a ``set`` instance.
    """
    if not isinstance(setOfItems, set):
        return None
    return [{element} for element in setOfItems]

The following function prunes the candidate itemsets after evaluating their support. If an itemset's support reaches the minimum value passed as a parameter, the itemset goes into L (the final collection of frequent itemsets).

In [11]:
def CtoL(C, L, df, layer, minSup):
    """Move the candidates of one layer that are frequent enough from C into L.

    Every itemset in ``C[layer]`` is checked with ``exceedsMinSup`` on the matching
    columns of ``df``. Itemsets that pass are appended to ``L[layer]`` (which must
    already exist) and printed. Returns ``L[layer]``.
    """
    for candidate in C[layer]:
        candidate_columns = df.loc[:, list(candidate)]
        if not exceedsMinSup(candidate_columns, minSup):
            continue
        L[layer].append(candidate)
        print(candidate)
    return L[layer]

This is the function which returns the collection of frequent itemsets L, together with the candidate itemsets C. Both are dictionaries keyed by layer.

In [12]:
def frequent_itemsets(df, minSup):
    """Apriori: build the candidate (C) and frequent (L) itemsets layer by layer.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per item, one row per transaction.
    minSup : float
        Minimum support, passed through to ``CtoL`` / ``exceedsMinSup``.

    Returns
    -------
    (L, C) : tuple of dict
        Both map a layer number (itemset size, from 1 up to the number of columns)
        to a list of sets. ``C[k]`` holds the candidates of size k, ``L[k]`` the
        candidates that passed the support check. Layers with nothing left are
        present as empty lists.
    """
    C = {1: listOfSets(set(df.columns))}
    L = {1: list()}
    L[1] = CtoL(C, L, df, 1, minSup)

    for layer in range(2, df.shape[1] + 1):
        C[layer] = list()
        L[layer] = list()
        previous_layer = L[layer - 1]
        # Hashable copies give constant-time membership tests instead of list scans.
        previous_lookup = {frozenset(itemset) for itemset in previous_layer}
        # Every union generated so far, so each candidate is evaluated only once
        # (e.g. {3,4,5,6} can be obtained from several pairs of 3-item subsets).
        generated = set()

        # each unordered pair of frequent (layer - 1)-itemsets, visited exactly once
        for p_set, q_set in itertools.combinations(previous_layer, 2):
            # the pair must share layer - 2 items, i.e. differ by exactly one element
            if len(p_set.intersection(q_set)) != layer - 2:
                continue
            next_p = p_set.union(q_set)
            key = frozenset(next_p)
            if key in generated:
                continue
            generated.add(key)

            # Apriori pruning: keep the candidate only when all of its
            # (layer - 1)-item subsets are frequent.
            if all(frozenset(subset) in previous_lookup
                   for subset in itertools.combinations(next_p, layer - 1)):
                C[layer].append(next_p)

        # we only add the items which exceed the minimum support to the final collection
        L[layer] = CtoL(C, L, df, layer, minSup)
    return L, C

In [13]:
# takes 3 input parameters, the reference to the dataframe and 2 itemsets
def confidence(df, antecedent, consequent):
    """Confidence of the rule ``antecedent -> consequent``.

    Computed as support(antecedent ∪ consequent) / support(antecedent), using
    ``support`` on the matching columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per item, one row per transaction.
    antecedent, consequent : set
        Column names of the two sides of the rule.

    Returns
    -------
    float
        The confidence; 0.0 when the antecedent has zero support, because the
        ratio is undefined in that case and would otherwise divide by zero.
    """
    antecedent_support = support(df.loc[:, list(antecedent)])
    if antecedent_support == 0:
        return 0.0
    union_column_list = list(set.union(antecedent, consequent))
    return support(df.loc[:, union_column_list]) / antecedent_support

# Tests

In [14]:
# Show the table the checks below are run against.
df

Unnamed: 0_level_0,Instant_food_products,UHT_milk,abrasive_cleaner,artif__sweetener,baby_cosmetics,baby_food,bags,baking_powder,bathroom_cleaner,beef,...,brown_bread,butter,butter_milk,cake_bar,whisky,white_bread,white_wine,whole_milk,yogurt,zwieback
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Support of the antecedent item alone (a single column, so support() receives a Series).
sup_ant = support(df.loc[:, 'yogurt'])

In [16]:
# Support of the consequent item alone (a single column, so support() receives a Series).
sup_cons = support(df.loc[:, 'whole_milk'])

In [17]:
# Support of both items together (two columns, so support() receives a DataFrame).
sup_union = support(df.loc[:, ['yogurt', 'whole_milk']])

In [18]:
# Confidence of the rule yogurt -> whole_milk. The second set repeats 'yogurt';
# confidence() takes the union of both sets, so the repetition has no effect.
confidence(df, {'yogurt'}, {'yogurt', 'whole_milk'})

0.375

In [19]:
# Display the antecedent support computed earlier.
sup_ant

0.11222444889779559

In [20]:
# Display the joint support computed earlier.
sup_union

0.04208416833667335

In [21]:
# Manual cross-check of the confidence() result: joint support divided by antecedent support.
sup_union / sup_ant

0.375

In [22]:
def frequent_itemsets_less_efficient(df, minSup):
    """Naive Apriori variant kept as a reference for ``frequent_itemsets``.

    Takes the same inputs and returns the same ``(L, C)`` pair of dictionaries
    (layer number -> list of sets). The difference is that every ordered pair
    (p, q) of frequent itemsets from the previous layer is visited, so each
    unordered pair is examined twice.
    """
    C = {1: listOfSets(set(df.columns))}
    L = {1: list()}
    L[1] = CtoL(C, L, df, 1, minSup)

    for layer in range(2, df.shape[1] + 1):
        candidates = C[layer] = list()
        L[layer] = list()
        previous_layer = L[layer - 1]

        # all ordered pairs of frequent (layer - 1)-itemsets, including (q, p) after (p, q)
        for p, q in itertools.product(previous_layer, repeat=2):
            # skip identical itemsets and pairs that do not differ by exactly one element
            if p == q or len(p & q) != layer - 2:
                continue
            next_p = p | q

            # the same union may already have been produced from another pair
            if next_p in candidates:
                continue

            # keep the candidate only when all of its (layer - 1)-item subsets are in L[layer - 1]
            if all(set(subset_tuple) in previous_layer
                   for subset_tuple in itertools.combinations(next_p, layer - 1)):
                candidates.append(next_p)

        L[layer] = CtoL(C, L, df, layer, minSup)
    return L, C

In [23]:
# Run the pairwise-once implementation with a minimum support of 0.005.
# CtoL prints every frequent itemset as it is accepted, which produces the output of this cell.
L1, C1 = frequent_itemsets(df, 0.005)

{'butter_milk'}
{'brown_bread'}
{'yogurt'}
{'beverages'}
{'butter'}
{'white_bread'}
{'UHT_milk'}
{'Instant_food_products'}
{'zwieback'}
{'cake_bar'}
{'bathroom_cleaner'}
{'whole_milk'}
{'bottled_beer'}
{'beef'}
{'bottled_water'}
{'berries'}
{'baking_powder'}
{'abrasive_cleaner'}
{'white_wine'}
{'yogurt', 'butter_milk'}
{'whole_milk', 'butter_milk'}
{'bottled_water', 'butter_milk'}
{'brown_bread', 'yogurt'}
{'brown_bread', 'whole_milk'}
{'brown_bread', 'bottled_water'}
{'beverages', 'yogurt'}
{'yogurt', 'butter'}
{'yogurt', 'whole_milk'}
{'yogurt', 'bottled_beer'}
{'yogurt', 'bottled_water'}
{'beverages', 'whole_milk'}
{'white_bread', 'butter'}
{'whole_milk', 'butter'}
{'bottled_beer', 'butter'}
{'bottled_water', 'butter'}
{'whole_milk', 'white_bread'}
{'whole_milk', 'bottled_beer'}
{'whole_milk', 'beef'}
{'whole_milk', 'bottled_water'}
{'whole_milk', 'berries'}
{'whole_milk', 'abrasive_cleaner'}
{'bottled_beer', 'bottled_water'}
{'berries', 'beef'}
{'brown_bread', 'whole_milk', 'yogurt

In [24]:
# Run the reference implementation with the same minimum support, for comparison.
L2, C2 = frequent_itemsets_less_efficient(df, 0.005)

{'butter_milk'}
{'brown_bread'}
{'yogurt'}
{'beverages'}
{'butter'}
{'white_bread'}
{'UHT_milk'}
{'Instant_food_products'}
{'zwieback'}
{'cake_bar'}
{'bathroom_cleaner'}
{'whole_milk'}
{'bottled_beer'}
{'beef'}
{'bottled_water'}
{'berries'}
{'baking_powder'}
{'abrasive_cleaner'}
{'white_wine'}
{'yogurt', 'butter_milk'}
{'whole_milk', 'butter_milk'}
{'bottled_water', 'butter_milk'}
{'brown_bread', 'yogurt'}
{'brown_bread', 'whole_milk'}
{'brown_bread', 'bottled_water'}
{'beverages', 'yogurt'}
{'yogurt', 'butter'}
{'yogurt', 'whole_milk'}
{'yogurt', 'bottled_beer'}
{'yogurt', 'bottled_water'}
{'beverages', 'whole_milk'}
{'white_bread', 'butter'}
{'whole_milk', 'butter'}
{'bottled_beer', 'butter'}
{'bottled_water', 'butter'}
{'whole_milk', 'white_bread'}
{'whole_milk', 'bottled_beer'}
{'whole_milk', 'beef'}
{'whole_milk', 'bottled_water'}
{'whole_milk', 'berries'}
{'whole_milk', 'abrasive_cleaner'}
{'bottled_beer', 'bottled_water'}
{'berries', 'beef'}
{'brown_bread', 'whole_milk', 'yogurt

In [25]:
# Both implementations should produce the same frequent itemsets, in the same order.
L1 == L2

True

In [26]:
# Both implementations should produce the same candidate itemsets, in the same order.
C1 == C2

True

In [27]:
# Frequent itemsets per layer (layer number -> list of sets).
L1

{1: [{'butter_milk'},
  {'brown_bread'},
  {'yogurt'},
  {'beverages'},
  {'butter'},
  {'white_bread'},
  {'UHT_milk'},
  {'Instant_food_products'},
  {'zwieback'},
  {'cake_bar'},
  {'bathroom_cleaner'},
  {'whole_milk'},
  {'bottled_beer'},
  {'beef'},
  {'bottled_water'},
  {'berries'},
  {'baking_powder'},
  {'abrasive_cleaner'},
  {'white_wine'}],
 2: [{'butter_milk', 'yogurt'},
  {'butter_milk', 'whole_milk'},
  {'bottled_water', 'butter_milk'},
  {'brown_bread', 'yogurt'},
  {'brown_bread', 'whole_milk'},
  {'bottled_water', 'brown_bread'},
  {'beverages', 'yogurt'},
  {'butter', 'yogurt'},
  {'whole_milk', 'yogurt'},
  {'bottled_beer', 'yogurt'},
  {'bottled_water', 'yogurt'},
  {'beverages', 'whole_milk'},
  {'butter', 'white_bread'},
  {'butter', 'whole_milk'},
  {'bottled_beer', 'butter'},
  {'bottled_water', 'butter'},
  {'white_bread', 'whole_milk'},
  {'bottled_beer', 'whole_milk'},
  {'beef', 'whole_milk'},
  {'bottled_water', 'whole_milk'},
  {'berries', 'whole_milk'},

In [28]:
# Candidate itemsets per layer, before the support check (layer number -> list of sets).
C1

{1: [{'butter_milk'},
  {'brown_bread'},
  {'yogurt'},
  {'beverages'},
  {'butter'},
  {'white_bread'},
  {'UHT_milk'},
  {'brandy'},
  {'Instant_food_products'},
  {'baby_food'},
  {'zwieback'},
  {'bags'},
  {'artif__sweetener'},
  {'cake_bar'},
  {'bathroom_cleaner'},
  {'whole_milk'},
  {'bottled_beer'},
  {'beef'},
  {'bottled_water'},
  {'berries'},
  {'baby_cosmetics'},
  {'whisky'},
  {'baking_powder'},
  {'abrasive_cleaner'},
  {'white_wine'}],
 2: [{'brown_bread', 'butter_milk'},
  {'butter_milk', 'yogurt'},
  {'beverages', 'butter_milk'},
  {'butter', 'butter_milk'},
  {'butter_milk', 'white_bread'},
  {'UHT_milk', 'butter_milk'},
  {'Instant_food_products', 'butter_milk'},
  {'butter_milk', 'zwieback'},
  {'butter_milk', 'cake_bar'},
  {'bathroom_cleaner', 'butter_milk'},
  {'butter_milk', 'whole_milk'},
  {'bottled_beer', 'butter_milk'},
  {'beef', 'butter_milk'},
  {'bottled_water', 'butter_milk'},
  {'berries', 'butter_milk'},
  {'baking_powder', 'butter_milk'},
  {'abr

In [16]:
# Scratch cell: builds the first layer by hand, with the same steps CtoL performs
# for layer 1 but without printing.
# NOTE(review): the execution counter restarts at 16 here, so this cell and the ones
# after it look like leftovers from an earlier session - confirm they are still needed.
C = {}
# candidates of size 1: one singleton set per column name
C[1] = listOfSets(set(df.columns))
L = {}
L[1] = list()
for itemset in C[1]:
    itemset_list = list(itemset)
    # keep only the single items that reach the 0.005 minimum support
    if(exceedsMinSup(df.loc[:, itemset_list], 0.005)):
        L[1].append(itemset)
# empty containers for the second layer, filled by the following cells
C[2] = list()
L[2] = list()

In [17]:
# Scratch cell: builds the size-2 candidates C[2] from every ordered pair of frequent single items.
for p in L[1]:
    for q in L[1]:
        if p != q:
            # counter = number of positions, excluding the last one, at which the
            # list forms of p and q hold the same element
            counter = 0
            p_list = list(p)
            q_list = list(q)
            # NOTE(review): list(set) has no guaranteed order, so this positional
            # comparison would not be reliable for itemsets with more than one element.
            for i in range(len(p_list)):
                if (i != len(p_list) - 1): # here we can modify the index of the different element
                    if p_list[i] == q_list[i]:
                        counter += 1
            if counter == 0: # we are now computing C[2] from L[1], so the counter must be 0
                next_p = set.union(p, q)
                
                # we check if the itemset is already in the list of itemsets, written in another form
                exists = False
                for i in C[2]:
                    if next_p == i:
                        exists = True
                if(exists == False):
                    C[2].append(next_p)

In [18]:
# Scratch cell: keep the size-2 candidates that reach the 0.005 minimum support
# (the same filtering CtoL performs, without printing).
for itemset in C[2]:
    itemset_list = list(itemset)
    if(exceedsMinSup(df.loc[:, itemset_list], 0.005)):
        L[2].append(itemset)

In [19]:
# Number of frequent 2-item itemsets found by the scratch cells.
len(L[2])

24