In [1]:
from itertools import combinations
from collections import deque
import numpy as np

In [16]:
class Frequent_items:
    """Implementation of the Apriori algorithm for frequent itemsets detection proposed
    in the paper 'Fast Algorithms for Mining Association Rules' by R. Agrawal and R. Srikant.

    The transactions are kept as an inverted index: ``self.transactions`` maps
    every item to the set of indices (0-based line numbers) of the transactions
    that contain it. The support of an itemset is then the size of the
    intersection of the index sets of its items.
    """

    def __init__(self, filepath):
        """
        Initialization method that receives a filepath from which read information.

        filepath:   Path to the file containing the transactions (one per line),
                    each represented as a set of items separated by whitespace.
                    Tokens that parse as integers are stored as ``int``; any
                    other token is kept as the original string.
        """
        self.transactions = {}
        with open(filepath, "r") as f:
            # Iterate the file lazily instead of loading every line at once.
            for i, line in enumerate(f):
                for token in line.split():
                    try:
                        item = int(token)
                    except ValueError:
                        # Deliberate fallback: non-numeric items stay strings.
                        item = token
                    self.transactions.setdefault(item, set()).add(i)
        # Candidate 1-itemsets together with their supports.
        self.c1 = [({item}, len(indices))
                   for item, indices in self.transactions.items()]

    def _get_support(self, itemsets):
        """
        This method computes the support of a collection of itemsets based on the
        transactions.

        itemsets: An iterable of non-empty Python sets representing itemsets.
        returns:  A deque of 2-tuples where the first element is
                  a set (representing an itemset) and the second one
                  is an integer (representing the support).
        """
        supports = deque()
        for iset in itemsets:
            # Transactions containing every item of the itemset.
            common_indices = set.intersection(
                *[self.transactions[item] for item in iset])
            supports.append((iset, len(common_indices)))
        return supports

    def _next_candidates(self, lprev):
        """
        Find the set of candidates based on the previous frequent
        (k-1)-itemsets.

        lprev:      The collection of previous large (k-1)-itemsets. Iterating
                    it must yield the itemsets (e.g. the dict keyed by tuples
                    returned by ``_filter_candidates``).
        returns:    A deque of 2-tuples where the first element is
                    a set (representing a candidate k-itemset) and the second
                    one is an integer (representing the support). Empty when
                    ``lprev`` is empty.
        """
        prev_sets = [set(itemset) for itemset in lprev]
        if not prev_sets:
            return deque()
        k = len(prev_sets[0]) + 1

        # Hashable copies for O(1) membership tests in the prune step.
        prev_lookup = {frozenset(itemset) for itemset in prev_sets}

        seen = set()
        candidates = deque()
        # Join step: each unordered pair is visited once. Self-pairs can never
        # reach size k and mirrored pairs only produce duplicates.
        for s1, s2 in combinations(prev_sets, 2):
            union = s1 | s2
            if len(union) != k:
                continue
            key = frozenset(union)
            if key in seen:
                continue
            seen.add(key)
            # Prune step: every (k-1)-subset of a candidate must itself be a
            # large (k-1)-itemset.
            if all(frozenset(subset) in prev_lookup
                   for subset in combinations(union, k - 1)):
                candidates.append(union)

        # Return candidates with their corresponding support
        return self._get_support(candidates)

    def _filter_candidates(self, candidates):
        """
        This methods select only those candidates such that their
        support is greater than or equal the support threshold
        stored in ``self.minsup``.

        candidates: An iterable of 2-tuples where the first element is
                    a set (representing an itemset) and the second one
                    is an integer (representing the support).
        returns:    A dict mapping each retained itemset (as a tuple of its
                    items, in the set's iteration order) to its support.
        """
        # Filter out itemsets with low support
        return {tuple(itemset): sup for itemset, sup in candidates
                if sup >= self.minsup}

    def get_frequent_items(self, minsup):
        """
        Get the frequent items of the loaded transactions based on the
        provided support threshold.

        minsup:    Support threshold for the itemsets filtering. It is also
                   stored in ``self.minsup``.
        returns:   A dict mapping every identified frequent itemset (as a
                   tuple) to its support.
        """
        self.minsup = minsup
        # Always work on this instance, never on a module-level object.
        frequent = self._filter_candidates(self.c1)
        # Copy so the accumulated answer does not alias the first level.
        answer = dict(frequent)

        # Grow the itemsets level by level until none survives the threshold.
        while frequent:
            frequent = self._filter_candidates(self._next_candidates(frequent))
            answer.update(frequent)

        return answer

In [17]:
# Load data/small_test.dat and list the itemsets whose support is >= 2.
# The bare expression on the last line is what the notebook cell displays.
fi = Frequent_items("data/small_test.dat")
fi.get_frequent_items(2)

{('A',): 2,
 ('A', 'C'): 2,
 ('B',): 3,
 ('B', 'C'): 2,
 ('C',): 3,
 ('E',): 3,
 ('E', 'B'): 3,
 ('E', 'B', 'C'): 2,
 ('E', 'C'): 2}

In [18]:
# Load data/T10I4D100K.dat and list the itemsets whose support is >= 7000.
# The bare expression on the last line is what the notebook cell displays.
fi = Frequent_items('data/T10I4D100K.dat')
fi.get_frequent_items(7000)

{(368,): 7828, (529,): 7057}

In [21]:
# Profile loading data/T10I4D100K.dat and mining it with a support threshold
# of 2000, then print the statistics sorted by the 'time' key.
import cProfile
import io
import pstats

pr = cProfile.Profile()
s = io.StringIO()  # buffer that receives the formatted statistics
sortby = 'time'

# Only the load-and-mine section runs under the profiler.
pr.enable()
fi = Frequent_items('data/T10I4D100K.dat')
print(fi.get_frequent_items(2000))
pr.disable()

ps = pstats.Stats(pr, stream=s)
ps.sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

{(960,): 2732, (661,): 2693, (32,): 4248, (598,): 3219, (279,): 3014, (674,): 2527, (829,): 6810, (780,): 2306, (510,): 3281, (548,): 2843, (229,): 2281, (112,): 2680, (523,): 2244, (561,): 2783, (204,): 2174, (871,): 2810, (242,): 2325, (653,): 2634, (280,): 2108, (947,): 3690, (78,): 2471, (217,): 5375, (116,): 2193, (782,): 2767, (192,): 2004, (758,): 2860, (874,): 2237, (183,): 3883, (477,): 2462, (862,): 3649, (809,): 2163, (196,): 2096, (675,): 2976, (663,): 2354, (694,): 2847, (70,): 2411, (620,): 2100, (775,): 3771, (914,): 4037, (494,): 5102, (276,): 2479, (175,): 2791, (825,): 3085, (918,): 3012, (956,): 3626, (738,): 2129, (381,): 2959, (8,): 3090, (419,): 5057, (460,): 4438, (766,): 6265, (797,): 2684, (944,): 2794, (75,): 3151, (998,): 2713, (12,): 3415, (27,): 2165, (205,): 3605, (676,): 2717, (793,): 3063, (692,): 4993, (798,): 3103, (373,): 2007, (411,): 2047, (54,): 2595, (285,): 2600, (579,): 2164, (617,): 2614, (885,): 3043, (368,): 7828, (390,): 2685, (529,): 7057, 

In [5]:
# Profile loading data/T10I4D100K.dat and mining it with a support threshold
# of 1000, then print the statistics sorted by the 'time' key.
import cProfile
import io
import pstats

pr = cProfile.Profile()
s = io.StringIO()  # buffer that receives the formatted statistics
sortby = 'time'

# Only the load-and-mine section runs under the profiler.
pr.enable()
fi = Frequent_items('data/T10I4D100K.dat')
print(fi.get_frequent_items(1000))
pr.disable()

ps = pstats.Stats(pr, stream=s)
ps.sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

[{1}, {4}, {5}, {6}, {8}, {10}, {12}, {17}, {21}, {25}, {27}, {28}, {31}, {32}, {33}, {35}, {37}, {38}, {39}, {41}, {43}, {45}, {48}, {51}, {52}, {54}, {55}, {57}, {58}, {68}, {69}, {70}, {71}, {72}, {73}, {75}, {78}, {85}, {90}, {93}, {94}, {97}, {100}, {104}, {105}, {110}, {111}, {112}, {115}, {116}, {120}, {122}, {125}, {126}, {129}, {130}, {132}, {140}, {143}, {145}, {147}, {151}, {154}, {157}, {161}, {162}, {163}, {168}, {170}, {171}, {173}, {175}, {177}, {181}, {183}, {185}, {192}, {196}, {197}, {198}, {201}, {204}, {205}, {207}, {208}, {210}, {214}, {217}, {227}, {229}, {234}, {236}, {239}, {240}, {242}, {258}, {259}, {265}, {266}, {274}, {275}, {276}, {279}, {280}, {283}, {285}, {290}, {294}, {296}, {308}, {309}, {310}, {319}, {322}, {325}, {326}, {332}, {334}, {335}, {336}, {343}, {346}, {348}, {349}, {350}, {351}, {354}, {357}, {361}, {362}, {366}, {368}, {373}, {377}, {378}, {381}, {385}, {387}, {390}, {392}, {394}, {401}, {403}, {405}, {411}, {413}, {414}, {419}, {422}, {42