In [19]:
from itertools import combinations
from collections import deque
import numpy as np

In [20]:
class Frequent_items:
    """Implementation of the Apriori algorithm for frequent itemsets detection proposed
    in the paper 'Fast Algorithms for Mining Association Rules' by R. Agrawal and R. Srikant.

    Transactions are stored "vertically": every item maps to the set of
    transaction (line) indices in which it occurs, so the support of an itemset
    is the size of the intersection of its items' index sets.

    Attributes:
        transactions: dict mapping each item to the set of indices of the
                      transactions that contain it.
        c1:           list of ``({item}, support)`` tuples, one per distinct item
                      (the candidate 1-itemsets).
        minsup:       support threshold of the most recent call to
                      ``get_frequent_items`` (not defined before the first call).
    """
    def __init__(self, filepath):
        """
        Initialization method that reads the transactions from a file.

        filepath:   Path to the file containing the transactions (one per line),
                    each represented as a set of items separated by whitespace.
                    Tokens that parse as integers are stored as ``int``; any
                    other token is kept as the original string.
        """
        self.transactions = {}
        with open(filepath, "r") as f:
            # Iterate the file lazily instead of loading every line at once.
            for i, line in enumerate(f):
                for elem in line.split():
                    try:
                        elem = int(elem)
                    except ValueError:
                        # Deliberate: non-numeric items (e.g. 'A', 'B') are
                        # valid and are kept as strings.
                        pass
                    self.transactions.setdefault(elem, set()).add(i)
        self.c1 = [({elem}, len(indices)) for elem, indices in self.transactions.items()]

    def _get_support(self, itemsets):
        """
        This method computes the support of a collection of itemsets based on the
        transactions.

        itemsets: An iterable of non-empty Python sets representing itemsets.
                  Every item must be a key of ``self.transactions``.
        returns:  A deque of 2-tuples where the first element is
                  a set (representing an itemset) and the second one
                  is an integer (representing the support).
        """
        supports = deque()
        for iset in itemsets:
            # Transactions containing every item of the itemset.
            common_indices = set.intersection(*[self.transactions[item] for item in iset])
            supports.append((iset, len(common_indices)))
        return supports

    def _next_candidates(self, lprev):
        """
        Find the set of candidates based on the previous frequent
        (k-1)-itemsets.

        lprev:      The collection of previous large (k-1)-itemsets as
                    an iterable of sets (all of the same size).
        returns:    A deque of 2-tuples where the first element is
                    a set (representing a candidate k-itemset) and the
                    second one is an integer (representing the support).
                    Empty when ``lprev`` is empty.
        """
        lprev = list(lprev)
        if not lprev:
            return deque()

        k = len(lprev[0]) + 1

        # Hashable view of the large (k-1)-itemsets for O(1) membership tests.
        known = {frozenset(iset) for iset in lprev}

        seen = set()          # unions already examined (kept or pruned)
        candidates = deque()

        # Join step: unordered pairs are enough because s1 | s2 == s2 | s1;
        # candidates come out in order of first occurrence.
        for s1, s2 in combinations(lprev, 2):
            union = s1 | s2
            if len(union) != k:
                continue
            key = frozenset(union)
            if key in seen:
                continue
            seen.add(key)

            # Prune step: keep the candidate only if each of its k subsets of
            # size k-1 was identified as a large (k-1)-itemset.
            if all(frozenset(subset) in known
                   for subset in combinations(union, k - 1)):
                candidates.append(union)

        # Return candidates with their corresponding support
        return self._get_support(candidates)

    def _filter_candidates(self, candidates):
        """
        This method selects only those candidates whose support is
        greater than or equal to the support threshold ``self.minsup``
        (set by ``get_frequent_items``).

        candidates: An iterable of 2-tuples where the first element is
                    a set (representing an itemset) and the second one
                    is an integer (representing the support).
        returns:    A list of itemsets as sets.
        """
        # Filter out itemsets with low support
        return [itemset for itemset, sup in candidates
                        if sup >= self.minsup]

    def get_frequent_items(self, minsup):
        """
        Get the frequent itemsets of the loaded transactions based on the
        provided support threshold.

        minsup:    Support threshold (minimum number of transactions that
                   must contain an itemset) for the itemsets filtering.
        returns:   The identified frequent itemsets as a list of sets,
                   ordered by increasing itemset size.
        """
        self.minsup = minsup

        # Large 1-itemsets. Use ``self`` rather than a module-level variable so
        # the method works for any instance, whatever name it is bound to.
        large = self._filter_candidates(self.c1)
        answer = list(large)

        # Grow itemsets level by level until no large itemsets remain.
        while large:
            ck = self._next_candidates(large)
            large = self._filter_candidates(ck)
            answer.extend(large)

        return answer

In [21]:
# Sanity check on the small test dataset: load the transactions and list
# every itemset contained in at least 2 transactions.
fi = Frequent_items("data/small_test.dat")
fi.get_frequent_items(2)

[{'E'},
 {'A'},
 {'B'},
 {'C'},
 {'B', 'E'},
 {'C', 'E'},
 {'A', 'C'},
 {'B', 'C'},
 {'B', 'C', 'E'}]

In [22]:
# Mine the T10I4D100K dataset, keeping only the itemsets that appear in at
# least 7000 transactions.
fi = Frequent_items('data/T10I4D100K.dat')
fi.get_frequent_items(7000)

[{368}, {529}]

In [23]:
# Profile a complete run (file loading + mining with a support threshold of
# 2000) with cProfile and print the collected statistics.
# NOTE(review): notebook convention is to keep imports in the first cell.
import cProfile, pstats, io
pr = cProfile.Profile()
pr.enable()
# Everything between enable() and disable() is measured.
fi = Frequent_items('data/T10I4D100K.dat')
print(fi.get_frequent_items(2000))
pr.disable()
# Render the report into an in-memory text buffer, sorted by the 'time' key.
s = io.StringIO()
sortby = 'time'
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

[{6}, {8}, {12}, {21}, {27}, {32}, {38}, {39}, {48}, {54}, {57}, {69}, {70}, {71}, {72}, {73}, {75}, {78}, {93}, {112}, {116}, {120}, {132}, {140}, {145}, {151}, {161}, {175}, {177}, {183}, {192}, {196}, {204}, {205}, {210}, {217}, {229}, {236}, {239}, {242}, {274}, {276}, {279}, {280}, {283}, {285}, {296}, {334}, {346}, {349}, {350}, {354}, {362}, {368}, {373}, {381}, {387}, {390}, {392}, {401}, {411}, {413}, {419}, {438}, {450}, {460}, {470}, {471}, {472}, {477}, {480}, {487}, {489}, {494}, {509}, {510}, {522}, {523}, {526}, {529}, {538}, {541}, {548}, {561}, {569}, {571}, {579}, {581}, {593}, {597}, {598}, {606}, {614}, {617}, {620}, {631}, {634}, {638}, {653}, {661}, {663}, {674}, {675}, {676}, {682}, {684}, {692}, {694}, {720}, {722}, {738}, {744}, {752}, {758}, {766}, {774}, {775}, {778}, {780}, {782}, {788}, {789}, {793}, {795}, {797}, {798}, {803}, {809}, {825}, {826}, {829}, {832}, {844}, {854}, {862}, {871}, {874}, {878}, {883}, {885}, {886}, {888}, {895}, {914}, {918}, {919}