In [227]:
import pandas as pd
import itertools

In [228]:
# Load the raw transaction table from CSV.
df = pd.read_csv("../myDataFile.csv", low_memory=False)
# Replace every missing value with 0 (mutates df in place).
df.fillna(0, inplace=True)
# Replace every cell equal to the string 't' with 1 (mutates df in place).
# presumably 't' marks "item present in this transaction" in the raw file - TODO confirm
df.replace('t', 1, inplace=True)
# Persist the recoded table. to_csv writes the row index as an extra leading column by default.
# NOTE(review): the rendered frame below appears to carry an 'Unnamed: 0' column, which is what a
# CSV saved with its index looks like when read back - confirm whether index=False is wanted here.
df.to_csv("../data.csv")

In [230]:
df

Unnamed: 0,Instant_food_products,UHT_milk,abrasive_cleaner,artif__sweetener,baby_cosmetics,baby_food,bags,baking_powder,bathroom_cleaner,beef,...,turkey,vinegar,waffles,whipped_sour_cream,whisky,white_bread,white_wine,whole_milk,yogurt,zwieback
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9830,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
9831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9832,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9833,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Support computations

The function below returns the support of a given itemset, i.e. the fraction of transactions (rows of the DataFrame) that contain every item of the itemset. It receives the complete pandas.DataFrame together with the itemset, given as a set of column names (a single-element set when we are interested in the transaction history of a single product).

In [231]:
def compute_support_for(df, item_set):
    """Return the support of ``item_set`` in ``df``.

    The support is the fraction of rows of ``df`` in which every column named in
    ``item_set`` equals 1.

    Parameters
    ----------
    df : pandas.DataFrame
        The complete transaction table (one row per transaction, one column per item).
    item_set : iterable of column labels
        Columns of ``df`` that must all equal 1 for a row to count.

    Returns
    -------
    float
        Number of matching rows divided by the total number of rows. An empty
        ``df`` yields 0.0; an empty ``item_set`` yields 1.0, since every
        transaction trivially contains the empty item set.

    Raises
    ------
    KeyError
        If a label in ``item_set`` is not a column of ``df``.
    """
    n_transactions = df.shape[0]
    if n_transactions == 0:
        return 0.0

    columns = list(item_set)
    if not columns:
        return 1.0

    # Select by label instead of building a query string, so that column names
    # which are not valid Python identifiers (spaces, colons, keywords, ...) work too.
    contains_all = (df[columns] == 1).all(axis=1)
    return int(contains_all.sum()) / n_transactions

However, the Apriori algorithm only checks whether the support of a given itemset exceeds a certain threshold; it does not require the exact value of the metric. Hence, the following function stops iterating over the transactions as soon as the minimum number of occurrences is reached. This reduces the computational load while still meeting the algorithm's requirements.

As we ultimately need to view the frequent itemsets on each layer, we chose to structure the data as follows: we created a dictionary with the keys representing the layers (the number of elements/products in a single itemset) and the values being lists of sets. Thus, we have added a helper function which converts a set of items (in our case, strings) to a list of sets. This is particularly useful when creating the first layer out of the column names.

The following function prunes the candidate itemsets after evaluating their support: an itemset whose support exceeds the minimum value passed as a parameter goes into L (the final collection of frequent itemsets).

In [232]:
def pick_candidates_for_layer(candidates, df, minSup):
    """Filter ``candidates`` down to the item sets that are frequent in ``df``.

    An item set is kept when ``compute_support_for(df, item_set)`` is strictly
    greater than ``minSup``. The relative order of the candidates is preserved.

    Parameters
    ----------
    candidates : iterable of item sets
        Candidate item sets to evaluate.
    df : pandas.DataFrame
        Transaction table handed to ``compute_support_for``.
    minSup : float
        Support threshold a candidate has to exceed.

    Returns
    -------
    list
        The surviving item sets.
    """
    frequent = []
    for candidate in candidates:
        support = compute_support_for(df, candidate)
        if support > minSup:
            frequent.append(candidate)
    return frequent

This is the function which returns the collection of frequent itemsets, represented by a dictionary.

In [233]:
def compute_frequent_item_sets(df, min_support):
    """Mine the frequent item sets of ``df`` layer by layer (Apriori).

    Layer ``k`` holds item sets of ``k`` column names. Candidates for layer ``k``
    are unions of two frequent item sets of layer ``k - 1`` that share ``k - 2``
    items; a candidate is kept only if all of its ``(k - 1)``-item subsets are
    frequent. The surviving candidates are then filtered by support through
    ``pick_candidates_for_layer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction table; every column is treated as an item.
    min_support : float
        Support threshold forwarded to ``pick_candidates_for_layer``.

    Returns
    -------
    (dict, dict)
        ``layers`` maps the item-set size to the list of frequent item sets of that
        size; ``candidates`` maps the item-set size to the list of candidates that
        were evaluated. The search stops at the first size with no frequent item
        set; that size still has an entry in ``candidates`` but none in ``layers``.
    """
    # Layer 1: every column is a single-item candidate.
    candidates = {
        1: [{item} for item in df.columns]
    }
    layers = {
        1: pick_candidates_for_layer(candidates[1], df, min_support)
    }

    n_columns = df.shape[1]
    for layer in range(2, n_columns + 1):

        print("Computing for layer " + str(layer))

        previous = layers[layer - 1]
        # Hashed view of the previous layer: makes the subset test below a constant-time
        # lookup instead of a scan over the whole list.
        previous_lookup = {frozenset(item_set) for item_set in previous}
        # Every union already examined (accepted or rejected). The pruning verdict only
        # depends on the union itself, so a repeated union never needs a second look.
        examined = set()

        candidates[layer] = list()

        # combinations() yields the pairs (p, q) with p < q in the same order as two
        # nested index loops, so the order of the candidates is unchanged.
        for p_set, q_set in itertools.combinations(previous, 2):

            # The two sets must differ by exactly one element to form a candidate.
            if len(p_set.intersection(q_set)) != layer - 2:
                continue

            new_candidate = p_set.union(q_set)
            candidate_key = frozenset(new_candidate)
            if candidate_key in examined:
                continue
            examined.add(candidate_key)

            # Prune: every (layer - 1)-item subset has to be frequent itself.
            subsets = itertools.combinations(new_candidate, layer - 1)
            if all(frozenset(subset) in previous_lookup for subset in subsets):
                candidates[layer].append(new_candidate)

        # Keep the candidates that exceed the minimum support.
        picked = pick_candidates_for_layer(candidates[layer], df, min_support)

        # Stop as soon as a layer yields no frequent item set.
        if not picked:
            break

        layers[layer] = picked

    return layers, candidates

In [234]:
# takes 3 input parameters, the reference to the dataframe and 2 item sets
def compute_confidence(df, antecedent, consequent):
    """Return the confidence of the rule ``antecedent -> consequent``.

    Confidence is support(antecedent union consequent) / support(antecedent),
    both computed with ``compute_support_for``.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction table handed to ``compute_support_for``.
    antecedent, consequent : iterable of column labels
        The two sides of the rule. Any iterable is accepted (set, frozenset, list, ...).

    Returns
    -------
    float
        The confidence of the rule, or 0.0 when the antecedent has zero support
        (the ratio is undefined in that case and would otherwise divide by zero).
    """
    antecedent_support = compute_support_for(df, antecedent)
    if antecedent_support == 0:
        return 0.0

    # Build the union from a fresh set so a frozenset or list antecedent works as well.
    rule_items = set(antecedent).union(consequent)
    return compute_support_for(df, rule_items) / antecedent_support

In [None]:
def generate_rules(df, layers):
    """Print a header line for every item-set size in ``layers`` other than 1.

    Only the headers are printed so far: the item sets themselves and ``df`` are
    not used yet, and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction table (currently unused).
    layers : dict
        Maps the item-set size to the list of frequent item sets of that size.
    """
    for size, _item_sets in layers.items():
        # Item sets of length 1 cannot be split into a rule, so they get no header.
        if size != 1:
            print("Rules for item sets of length " + str(size) + ":")



# Tests

In [235]:
df

Unnamed: 0,Instant_food_products,UHT_milk,abrasive_cleaner,artif__sweetener,baby_cosmetics,baby_food,bags,baking_powder,bathroom_cleaner,beef,...,turkey,vinegar,waffles,whipped_sour_cream,whisky,white_bread,white_wine,whole_milk,yogurt,zwieback
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9830,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
9831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9832,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9833,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [236]:
# Mine the frequent item sets with a minimum support of 0.005.
# L maps item-set size -> frequent item sets; C maps item-set size -> candidates that were evaluated.
L, C = compute_frequent_item_sets(df, 0.005)

Computing for layer 2
Computing for layer 3
Computing for layer 4
Computing for layer 5


In [237]:
L

{1: [{'Instant_food_products'},
  {'UHT_milk'},
  {'baking_powder'},
  {'beef'},
  {'berries'},
  {'beverages'},
  {'bottled_beer'},
  {'bottled_water'},
  {'brown_bread'},
  {'butter'},
  {'butter_milk'},
  {'cake_bar'},
  {'candles'},
  {'candy'},
  {'canned_beer'},
  {'canned_fish'},
  {'canned_vegetables'},
  {'cat_food'},
  {'cereals'},
  {'chewing_gum'},
  {'chicken'},
  {'chocolate'},
  {'chocolate_marshmallow'},
  {'citrus_fruit'},
  {'cleaner'},
  {'cling_film_bags'},
  {'coffee'},
  {'condensed_milk'},
  {'cream_cheese_'},
  {'curd'},
  {'curd_cheese'},
  {'dental_care'},
  {'dessert'},
  {'detergent'},
  {'dish_cleaner'},
  {'dishes'},
  {'dog_food'},
  {'domestic_eggs'},
  {'female_sanitary_products'},
  {'finished_products'},
  {'flour'},
  {'flower__seeds_'},
  {'frankfurter'},
  {'frozen_dessert'},
  {'frozen_fish'},
  {'frozen_meals'},
  {'frozen_potato_products'},
  {'frozen_vegetables'},
  {'fruit_vegetable_juice'},
  {'grapes'},
  {'ham'},
  {'hamburger_meat'},
  {'h

In [238]:
C

{1: [{'Instant_food_products'},
  {'UHT_milk'},
  {'abrasive_cleaner'},
  {'artif__sweetener'},
  {'baby_cosmetics'},
  {'baby_food'},
  {'bags'},
  {'baking_powder'},
  {'bathroom_cleaner'},
  {'beef'},
  {'berries'},
  {'beverages'},
  {'bottled_beer'},
  {'bottled_water'},
  {'brandy'},
  {'brown_bread'},
  {'butter'},
  {'butter_milk'},
  {'cake_bar'},
  {'candles'},
  {'candy'},
  {'canned_beer'},
  {'canned_fish'},
  {'canned_fruit'},
  {'canned_vegetables'},
  {'cat_food'},
  {'cereals'},
  {'chewing_gum'},
  {'chicken'},
  {'chocolate'},
  {'chocolate_marshmallow'},
  {'citrus_fruit'},
  {'cleaner'},
  {'cling_film_bags'},
  {'cocoa_drinks'},
  {'coffee'},
  {'condensed_milk'},
  {'cooking_chocolate'},
  {'cookware'},
  {'cream'},
  {'cream_cheese_'},
  {'curd'},
  {'curd_cheese'},
  {'decalcifier'},
  {'dental_care'},
  {'dessert'},
  {'detergent'},
  {'dish_cleaner'},
  {'dishes'},
  {'dog_food'},
  {'domestic_eggs'},
  {'female_sanitary_products'},
  {'finished_products'},
 

In [240]:
# Report how many frequent item sets were found for each item-set size.
print("Lengths for L:")
for layer_size in L:
    n_item_sets = len(L[layer_size])
    print(str(layer_size) + ": " + str(n_item_sets))

Lengths for L:
1: 120
2: 605
3: 264
4: 12
5: 0
