In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
from collections import Counter
import random

In [2]:
# Load the sample file as a list of raw lines. The context manager closes the
# file handle as soon as the text has been read. Splitting on "\n" (rather than
# using splitlines) keeps any empty trailing entry, which the mappers skip.
with open("../resource/asnlib/publicdata/small.txt") as example_file:
    example_data = example_file.read().split("\n")

In [3]:
# Implement the mapper in the candidate generation stage

# Hint: candidate itemsets can be enumerated with sorted(combinations(all_items, item_set_size)).


def map_step_1(data, num_maps, support_thresh, item_set_size):
    """Mapper for the candidate generation stage.

    Counts every itemset of size ``item_set_size`` found in the baskets of
    ``data`` and emits those whose count reaches the scaled-down threshold
    ``support_thresh / num_maps``.

    Args:
        data: Iterable of text lines. Empty strings are skipped. For every other
            line the second ':'-separated field is split on ',' to obtain the
            items (presumably lines look like ``basket_id:item1,item2,...`` --
            verify against the data file).
        num_maps: Divisor applied to ``support_thresh``.
        support_thresh: Support threshold before scaling.
        item_set_size: Number of items in each generated itemset.

    Returns:
        A list of ``(item_set, 1)`` pairs, where ``item_set`` is a tuple of
        items in sorted order, listed in the order the itemsets were first seen.

    Raises:
        IndexError: If a non-empty line contains no ':'.
        ZeroDivisionError: If ``num_maps`` is 0.
    """
    occurrences = Counter()

    for line in data:
        if line == '':
            continue

        # De-duplicate and sort so each itemset has one canonical tuple form.
        unique_items = sorted(set(line.split(':')[1].split(',')))

        # Baskets smaller than the requested itemset size contribute nothing.
        if len(unique_items) < item_set_size:
            continue

        occurrences.update(combinations(unique_items, item_set_size))

    # Threshold scaled to this mapper's share of the data.
    local_thresh = support_thresh / num_maps

    return [
        (item_set, 1)
        for item_set, tally in occurrences.items()
        if tally >= local_thresh
    ]

In [5]:
# Implement the reducer in the candidate generation stage

def reduce_step_1(data):
    """Reducer for the candidate generation stage.

    Merges the ``(item_set, count)`` pairs emitted by the first-stage mappers
    into a list of distinct candidate itemsets.

    Args:
        data: Iterable of ``(item_set, count)`` pairs. ``item_set`` must be
            hashable (e.g. a tuple of items).

    Returns:
        A list containing each itemset whose summed count is at least 1,
        exactly once, in the order the itemsets were first seen.
    """
    totals = Counter()

    # Aggregate per itemset. Iterating a Counter built from the raw pairs would
    # yield (item_set, count) keys rather than occurrence counts, and would list
    # an itemset twice if it arrived with two different count values.
    for item_set, count in data:
        totals[item_set] += count

    return [item_set for item_set, total in totals.items() if total >= 1]

In [7]:
# Implement the mapper in the true frequent itemset stage

def map_step_2(data, candidates):
    """Mapper for the true frequent itemset stage.

    Counts, for each candidate itemset, how many baskets in ``data`` contain
    all of its items.

    Args:
        data: Iterable of text lines. Empty strings are skipped. For every other
            line the second ':'-separated field is split on ',' to obtain the
            basket's items (presumably lines look like
            ``basket_id:item1,item2,...`` -- verify against the data file).
        candidates: Iterable of candidate itemsets, each an iterable of items.
            It is consumed exactly once, so a generator is accepted.

    Returns:
        A list of ``(item_set, count)`` pairs, where ``item_set`` is the
        candidate as a tuple. Candidates contained in no basket are omitted.
        Pairs are listed in the order the candidates were first matched.

    Raises:
        IndexError: If a non-empty line contains no ':'.
    """
    # Build each candidate's tuple key and lookup set once, not once per basket.
    # Materialising the list also means a one-shot iterable of candidates is
    # still checked against every basket, not only the first.
    prepared = []
    for candidate in candidates:
        key = tuple(candidate)
        prepared.append((key, frozenset(key)))

    counts = Counter()

    for line in data:
        if line == '':
            continue

        basket_items = set(line.split(':')[1].split(','))

        for key, required_items in prepared:
            if required_items.issubset(basket_items):
                counts[key] += 1

    return list(counts.items())

In [1]:

def reduce_step_2(data, support_thresh):
    """Reducer for the true frequent itemset stage.

    Sums the partial counts reported for each itemset and keeps the itemsets
    whose total reaches ``support_thresh``.

    Args:
        data: Iterable of ``(item_set, count)`` pairs. ``item_set`` must be
            hashable.
        support_thresh: Minimum total count an itemset needs to be kept.

    Returns:
        A list of the itemsets whose summed count is ``>= support_thresh``, in
        the order the itemsets were first seen.
    """
    totals = {}

    # Fold every partial count into a running total per itemset.
    for key, partial in data:
        totals[key] = totals.get(key, 0) + partial

    return [key for key, total in totals.items() if total >= support_thresh]