In [1]:
import numpy as np
import pandas as pd
import math

In [3]:
# One-off preprocessing of the raw export: missing cells become 0 and the
# marker 't' becomes 1, then the cleaned table is written back to disk.
# Raw strings (r"...") keep the Windows backslashes literal instead of relying
# on unrecognised escape sequences, which newer Python versions warn about.
df = pd.read_csv(r"D:\_MASTER\AN1\Q2\IS\Assignments\Assignment3\myDataFile.csv", low_memory=False)
df = df.fillna(0).replace('t', 1)
df.to_csv(r"D:\_MASTER\AN1\Q2\IS\Assignments\Assignment3\data.csv")

In [2]:
# Load the transaction table; the first CSV column is used as the row index.
# Raw string: keeps the backslashes of the Windows path literal.
df = pd.read_csv(r"D:\_MASTER\AN1\Q2\IS\Assignments\Assignment3\smaller_data.csv", index_col=0)
df.index.names = ['Transaction']

In [3]:
# Display the loaded table (rows are transactions, columns are products).
df

Unnamed: 0_level_0,Instant_food_products,UHT_milk,abrasive_cleaner,artif__sweetener,baby_cosmetics,baby_food,bags,baking_powder,bathroom_cleaner,beef,...,brown_bread,butter,butter_milk,cake_bar,whisky,white_bread,white_wine,whole_milk,yogurt,zwieback
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Support computations

The function below returns the support of a given itemset, i.e. the fraction of transactions that contain every item in it. The itemset's data is passed in either as a pandas.DataFrame (one column per item) or as a pandas.Series (when we have data for a single feature - the transaction history of a single product).

In [4]:
def support(itemset_data):
    """Return the support of an itemset: the fraction of transactions containing it.

    Parameters
    ----------
    itemset_data : pandas.DataFrame or pandas.Series
        DataFrame: one row per transaction and one column per item of the
        itemset. A transaction contains the itemset when none of its
        columns equals 0.
        Series: the column of a single item. The support is the sum of the
        values divided by the number of non-missing entries.

    Returns
    -------
    float or int
        The support value. 0 is returned when there are no transactions or
        when the input is neither a DataFrame nor a Series.
    """
    if isinstance(itemset_data, pd.DataFrame):
        number_of_transactions = itemset_data.shape[0]
        if number_of_transactions == 0:
            # No transactions at all: avoid dividing by zero.
            return 0
        # One boolean per transaction: True when every item column is non-zero.
        contains_all_items = (itemset_data != 0).all(axis=1)
        return int(contains_all_items.sum()) / number_of_transactions

    if isinstance(itemset_data, pd.Series):
        # count() ignores missing entries, as in the original computation.
        number_of_transactions = itemset_data.count()
        if number_of_transactions == 0:
            return 0
        return itemset_data.sum() / number_of_transactions

    return 0

However, the Apriori algorithm only checks whether the support of a given itemset reaches a certain threshold; it does not require the exact value of the metric. Hence, the following function stops iterating over the transactions as soon as the minimum number of occurrences is attained. This reduces the computational load while still meeting the algorithm's requirements.

In [24]:
def exceedsMinSup(itemset_data, minSup):
    """Tell whether the support of an itemset reaches ``minSup``.

    Parameters
    ----------
    itemset_data : pandas.DataFrame or pandas.Series
        DataFrame: one row per transaction and one column per item of the
        itemset. A transaction contains the itemset when none of its
        columns equals 0.
        Series: the column of a single item. Its support is the sum of the
        values divided by the number of non-missing entries.
    minSup : float
        Minimum support, expressed as a fraction of the transactions.

    Returns
    -------
    bool
        True when the support is greater than or equal to ``minSup``.
        False when it is not, when there are no transactions, or when the
        input is neither a DataFrame nor a Series.
    """
    if isinstance(itemset_data, pd.DataFrame):
        number_of_transactions = itemset_data.shape[0]
        support_counter = 0
        for row in itemset_data.to_numpy():
            if all(value != 0 for value in row):
                support_counter += 1
            # Compare the ratio itself instead of pre-computing
            # ceil(n * minSup): the product is subject to float rounding
            # (100 * 0.07 evaluates to 7.000000000000001, whose ceiling is 8),
            # which would reject itemsets sitting exactly on the threshold.
            if support_counter / number_of_transactions >= minSup:
                # Threshold reached: the remaining transactions are irrelevant.
                return True
        return False

    if isinstance(itemset_data, pd.Series):
        # count() ignores missing entries, as in the original computation.
        number_of_transactions = itemset_data.count()
        if number_of_transactions == 0:
            # Same answer as the DataFrame branch gives for an empty table.
            return False
        return bool(itemset_data.sum() / number_of_transactions >= minSup)

    return False

As we ultimately need to view the frequent itemsets on each layer, we chose to structure the data as follows: we created a dictionary with the keys representing the layers (the number of elements/products in a single itemset) and the values being lists of sets. Thus, we have added a helper function which converts a set of items (in our case, strings) to a list of sets. This is particularly useful when creating the first layer out of the column names.

In [6]:
def listOfSets(setOfItems):
    """Wrap every element of a set in its own single-element set.

    Returns a list holding one singleton set per element of ``setOfItems``,
    or None when ``setOfItems`` is not a ``set`` instance.
    """
    if not isinstance(setOfItems, set):
        return None
    return [{element} for element in setOfItems]

The following function prunes the candidate itemsets after evaluating their support. If a candidate reaches the minimum support passed as a parameter, the itemset goes into L (the final collection of frequent itemsets).

In [7]:
def CtoL(C, L, df, layer, minSup):
    """Move the candidates of one layer that reach ``minSup`` from C into L.

    Every itemset in ``C[layer]`` is checked with ``exceedsMinSup`` on the
    matching columns of ``df``; those that pass are appended to ``L[layer]``
    in place. The (mutated) list ``L[layer]`` is returned.
    """
    frequent = L[layer]
    frequent.extend(
        candidate
        for candidate in C[layer]
        if exceedsMinSup(df.loc[:, list(candidate)], minSup)
    )
    return frequent

This is the function which returns the collection of frequent itemsets, represented by a dictionary.

In [25]:
def frequent_itemsets(df, minSup):
    """Run Apriori on ``df`` and return the frequent itemsets of every layer.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per transaction and one column per item; 0 means the item is
        absent from the transaction.
    minSup : float
        Minimum support, expressed as a fraction of the transactions.

    Returns
    -------
    dict
        Maps each layer k (from 1 up to the number of columns) to the list of
        frequent itemsets of size k, each itemset being a ``set`` of column
        names. Layers without any frequent itemset map to an empty list.
    """
    # Layer 1: every column on its own is a candidate.
    C = {1: listOfSets(set(df.columns))}
    L = {1: list()}
    L[1] = CtoL(C, L, df, 1, minSup)

    for layer in range(2, df.shape[1] + 1):
        C[layer] = list()
        L[layer] = list()
        previous = L[layer - 1]
        previous_lookup = {frozenset(itemset) for itemset in previous}
        already_generated = set()

        for position, p in enumerate(previous):
            for q in previous[position + 1:]:
                candidate = p | q
                # Two (layer-1)-itemsets are joinable exactly when they share
                # layer-2 items, i.e. when their union has `layer` items.
                # Python sets have no defined element order, so the test must
                # not compare list(p) and list(q) position by position.
                if len(candidate) != layer:
                    continue

                # The same union can be reached from several (p, q) pairs.
                key = frozenset(candidate)
                if key in already_generated:
                    continue
                already_generated.add(key)

                # Apriori property: every subset of a frequent itemset is
                # frequent, so a candidate with an infrequent (layer-1)-subset
                # can be dropped without counting its support.
                if all(key - {item} in previous_lookup for item in key):
                    C[layer].append(candidate)

        L[layer] = CtoL(C, L, df, layer, minSup)
    return L

In [26]:
# Mine the frequent itemsets of every layer with a minimum support of 0.005
# (0.5% of the transactions).
L = frequent_itemsets(df, 0.005)

# Tests

In [35]:
# Manual step-by-step rebuild of the first layer, mirroring frequent_itemsets:
# C holds the candidate itemsets per layer, L the frequent ones.
# Note that this rebinds the global L.
C = {}
C[1] = listOfSets(set(df.columns))
L = {}
L[1] = list()
# Keep the single-item candidates whose support reaches 0.005.
for itemset in C[1]:
    itemset_list = list(itemset)
    if(exceedsMinSup(df.loc[:, itemset_list], 0.005)):
        L[1].append(itemset)
# Empty containers for the second layer, filled by the next cells.
C[2] = list()
L[2] = list()

In [36]:
# Build the 2-item candidates C[2] by joining every pair of distinct frequent
# single-item sets from L[1].
for p in L[1]:
    for q in L[1]:
        if p != q:
            counter = 0
            p_list = list(p)
            q_list = list(q)
            # Count matching leading positions. For single-item sets the only
            # index is the last one, so this loop never increments counter.
            for i in range(len(p_list)):
                if (i != len(p_list) - 1): # the last position is skipped; only the leading positions are compared
                    if p_list[i] == q_list[i]:
                        counter += 1
            if counter == 0: # we now compute C[2], so we look at L[1], and the counter must be = 0
                next_p = set.union(p, q)
                
                # we check if the itemset is already in the list of itemsets, written in another form
                exists = False
                for i in C[2]:
                    if next_p == i:
                        exists = True
                if(exists == False):
                    C[2].append(next_p)

In [37]:
# Keep the 2-item candidates whose support reaches 0.005; they form L[2].
for itemset in C[2]:
    itemset_list = list(itemset)
    if(exceedsMinSup(df.loc[:, itemset_list], 0.005)):
        L[2].append(itemset)

In [38]:
# Number of frequent 2-item itemsets found by the manual run.
len(L[2])

24