diff --git a/pymining/assocrules.py b/pymining/assocrules.py index 8aa7d02..ed49eb7 100644 --- a/pymining/assocrules.py +++ b/pymining/assocrules.py @@ -9,13 +9,15 @@ def mine_assoc_rules(isets, min_support=2, min_confidence=0.5): for item in key: left = key.difference([item]) right = frozenset([item]) - _mine_assoc_rules(left, right, support, visited, isets, - min_support, min_confidence, rules) + _mine_assoc_rules( + left, right, support, visited, isets, + min_support, min_confidence, rules) return rules -def _mine_assoc_rules(left, right, rule_support, visited, isets, min_support, +def _mine_assoc_rules( + left, right, rule_support, visited, isets, min_support, min_confidence, rules): if (left, right) in visited or len(left) < 1: return @@ -30,5 +32,6 @@ def _mine_assoc_rules(left, right, rule_support, visited, isets, min_support, for item in left: new_left = left.difference([item]) new_right = right.union([item]) - _mine_assoc_rules(new_left, new_right, rule_support, visited, isets, - min_support, min_confidence, rules) + _mine_assoc_rules( + new_left, new_right, rule_support, visited, isets, + min_support, min_confidence, rules) diff --git a/pymining/compat.py b/pymining/compat.py index ea620e1..db40042 100644 --- a/pymining/compat.py +++ b/pymining/compat.py @@ -1,6 +1,6 @@ import sys if sys.version_info[0] < 3: - range = xrange + range = xrange # noqa else: range = range diff --git a/pymining/itemmining.py b/pymining/itemmining.py index fc8c680..c6bbaa6 100644 --- a/pymining/itemmining.py +++ b/pymining/itemmining.py @@ -1,7 +1,8 @@ from collections import defaultdict, deque, OrderedDict -def _sort_transactions_by_freq(transactions, key_func, reverse_int=False, +def _sort_transactions_by_freq( + transactions, key_func, reverse_int=False, reverse_ext=False, sort_ext=True): key_seqs = [{key_func(i) for i in sequence} for sequence in transactions] frequencies = get_frequencies(key_seqs) @@ -45,7 +46,8 @@ def get_sam_input(transactions, key_func=None): ''' if key_func is None: - key_func = lambda e: e + def key_func(e): + return e (asorted_seqs, _) = _sort_transactions_by_freq(transactions, key_func) @@ -113,7 +115,6 @@ def _sam(sam_input, fis, report, min_support): if s >= min_support: fis.add(i[1]) report[frozenset(fis)] = s - #print('{0} with support {1}'.format(fis, s)) n = n + 1 + _sam(c, fis, report, min_support) fis.remove(i[1]) return n @@ -160,10 +161,11 @@ def get_relim_input(transactions, key_func=None): # relim_input[x][1][x][1] = rest of transaction prefixed by key_freq if key_func is None: - key_func = lambda e: e + def key_func(e): + return e - (asorted_seqs, frequencies) = _sort_transactions_by_freq(transactions, - key_func) + (asorted_seqs, frequencies) = _sort_transactions_by_freq( + transactions, key_func) key_map = _get_key_map(frequencies) relim_input = _new_relim_input(len(key_map), key_map) @@ -207,15 +209,12 @@ def relim(rinput, min_support=2): def _relim(rinput, fis, report, min_support): (relim_input, key_map) = rinput n = 0 - # Maybe this one isn't necessary - #a = deque(relim_input) a = relim_input while len(a) > 0: item = a[-1][0][1] s = a[-1][0][0] if s >= min_support: fis.add(item[1]) - #print('Report {0} with support {1}'.format(fis, s)) report[frozenset(fis)] = s b = _new_relim_input(len(a) - 1, key_map) rest_lists = a[-1][1] @@ -288,7 +287,8 @@ def _create_child(self, child_key, heads, last_insert): return child - def get_cond_tree(self, child, count, visited, heads, last_insert, + def get_cond_tree( + self, child, count, visited, heads, last_insert, dont_create=False): key = self.key @@ -300,13 +300,13 @@ def get_cond_tree(self, child, count, visited, heads, last_insert, try: cond_node = visited[self] except Exception: - cond_node = self._create_cond_child(visited, heads, - last_insert) + cond_node = self._create_cond_child( + visited, heads, last_insert) if self.parent is not None: # Recursion - parent_node = self.parent.get_cond_tree(cond_node, count, visited, - heads, last_insert, False) + parent_node = self.parent.get_cond_tree( + cond_node, count, visited, heads, last_insert, False) if cond_node is not None: cond_node.count += count heads[key][1] += count @@ -339,7 +339,8 @@ def _find_ancestor(self, heads, min_support): ancestor = ancestor.parent return ancestor - def prune_me(self, from_head_list, visited_parents, merged_before, + def prune_me( + self, from_head_list, visited_parents, merged_before, merged_now, heads, min_support): try: # Parent was merged @@ -367,8 +368,9 @@ def prune_me(self, from_head_list, visited_parents, merged_before, def __str__(self): child_str = ','.join([str(key) for key in self.children]) - return '{0} ({1}) [{2}] {3}'.format(self.key, self.count, child_str, - self.next_node is not None) + return '{0} ({1}) [{2}] {3}'.format( + self.key, self.count, child_str, + self.next_node is not None) def __repr__(self): return self.__str__() @@ -385,12 +387,14 @@ def get_fptree(transactions, key_func=None, min_support=2): ''' if key_func is None: - key_func = lambda e: e + def key_func(e): + return e - asorted_seqs, frequencies = _sort_transactions_by_freq(transactions, - key_func, True, False, False) - transactions = [[item[1] for item in aseq if item[0] >= min_support] for - aseq in asorted_seqs] + asorted_seqs, frequencies = _sort_transactions_by_freq( + transactions, key_func, True, False, False) + transactions = [ + [item[1] for item in aseq if item[0] >= min_support] for + aseq in asorted_seqs] root = FPNode(FPNode.root_key, None) heads = {} @@ -403,7 +407,6 @@ def get_fptree(transactions, key_func=None, min_support=2): new_heads = OrderedDict() for (head, head_support) in sorted_heads: new_heads[head.key] = (head, head_support) - #new_heads = tuple(heads.values()) return (root, new_heads) @@ -419,8 +422,9 @@ def _create_cond_tree(head_node, new_heads, pruning): visited = {} last_insert = {} while head_node is not None: - head_node.get_cond_tree(None, head_node.count, visited, new_heads, - last_insert, True) + head_node.get_cond_tree( + None, head_node.count, visited, new_heads, + last_insert, True) head_node = head_node.next_node return new_heads @@ -436,8 +440,9 @@ def _prune_cond_tree(heads, min_support): while node is not None: # If the node is merged, we lose the next_node next_node = node.next_node - node.prune_me(previous_node, visited_parents, merged_before, - merged_now, heads, min_support) + node.prune_me( + previous_node, visited_parents, merged_before, + merged_now, heads, min_support) if node.next_node is not None: # Only change the previous node if it wasn't merged. previous_node = node @@ -470,13 +475,12 @@ def _fpgrowth(fptree, fis, report, min_support=2, pruning=True): continue fis.add(head_node.key) - #print('Report {0} with support {1}'.format(fis, head_support)) report[frozenset(fis)] = head_support new_heads = _init_heads(heads) _create_cond_tree(head_node, new_heads, pruning) if pruning: _prune_cond_tree(new_heads, min_support) - n = n + 1 + _fpgrowth((None, new_heads), fis, report, min_support, - pruning) + n = n + 1 + _fpgrowth( + (None, new_heads), fis, report, min_support, pruning) fis.remove(head_node.key) return n diff --git a/pymining/perftesting.py b/pymining/perftesting.py index 836f680..ffd300f 100644 --- a/pymining/perftesting.py +++ b/pymining/perftesting.py @@ -40,10 +40,11 @@ def get_default_transactions_alt(): def get_default_sequences(): '''Returns a small list of sequences. For testing purpose.''' - return ( 'caabc', 'abcb', 'cabc', 'abbca' ) + return ('caabc', 'abcb', 'cabc', 'abbca') -def get_random_transactions(transaction_number=500, +def get_random_transactions( + transaction_number=500, max_item_per_transaction=100, max_key_length=50, key_alphabet=string.ascii_letters, universe_size=1000): '''Generates a random list of `transaction_number` transactions containing @@ -62,14 +63,17 @@ def get_random_transactions(transaction_number=500, words = [] for _ in range(universe_size): - word = ''.join((random.choice(key_alphabet) for x in + word = ''.join(( + random.choice(key_alphabet) for x in range(random.randint(1, max_key_length)))) words.append(word) transactions = [] for _ in range(transaction_number): - transaction = {word for word in random.sample(words, random.randint(0, - max_item_per_transaction))} + transaction = { + word for word in + random.sample(words, random.randint(0, max_item_per_transaction)) + } transactions.append(transaction) return transactions @@ -146,8 +150,8 @@ def test_itemset_perf(perf_round=10, sparse=True, seed=None): start = time() for i in range(perf_round): - (n, report) = test_fpgrowth(False, transactions, support, - pruning=True) + (n, report) = test_fpgrowth( + False, transactions, support, pruning=True) print('Done round {0}'.format(i)) end = time() print('FP-Growth (pruning on) took: {0}'.format(end - start)) @@ -155,8 +159,8 @@ def test_itemset_perf(perf_round=10, sparse=True, seed=None): start = time() for i in range(perf_round): - (n, report) = test_fpgrowth(False, transactions, support, - pruning=False) + (n, report) = test_fpgrowth( + False, transactions, support, pruning=False) print('Done round {0}'.format(i)) end = time() print('FP-Growth (pruning off) took: {0}'.format(end - start)) diff --git a/setup.py b/setup.py index e1af12f..f5db790 100644 --- a/setup.py +++ b/setup.py @@ -2,31 +2,30 @@ from distutils.core import setup -setup(name='pymining', - version='0.1', - description='Small collection of data mining algorithms', - long_description= - ''' +setup( + name='pymining', + version='0.1', + description='Small collection of data mining algorithms', + long_description=''' pymining is a small collection of data mining algorithms implemented in Python (no C extension). This is mainly useful for environments without support for C. All algorithms come from the scientific literature. ''', - author='Barthelemy Dagenais', - author_email='barthe@users.sourceforge.net', - license='BSD License', - url='https://github.com/bartdag/pymining', - packages=['pymining'], - classifiers=[ - 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.2', - 'Topic :: Software Development :: Libraries', - ], - ) - + author='Barthelemy Dagenais', + author_email='barthe@users.sourceforge.net', + license='BSD License', + url='https://github.com/bartdag/pymining', + packages=['pymining'], + classifiers=[ + 'Intended Audience :: Developers', + 'License :: OSI Approved :: BSD License', + 'Operating System :: OS Independent', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.2', + 'Topic :: Software Development :: Libraries', + ], +) diff --git a/tests/assocrules_tests.py b/tests/assocrules_tests.py index 2b78060..1f8e195 100644 --- a/tests/assocrules_tests.py +++ b/tests/assocrules_tests.py @@ -1,6 +1,7 @@ import unittest from pymining import itemmining, perftesting, assocrules + class TestAssocRule(unittest.TestCase): def testDefaultSupportConf(self): @@ -26,8 +27,8 @@ def testConfidence075(self): ts1 = perftesting.get_default_transactions() relim_input = itemmining.get_relim_input(ts1) report = itemmining.relim(relim_input, 2) - rules = assocrules.mine_assoc_rules(report, min_support=2, - min_confidence=0.75) + rules = assocrules.mine_assoc_rules( + report, min_support=2, min_confidence=0.75) self.assertEqual(8, len(rules)) a_rule = (frozenset(['b']), frozenset(['d']), 6, 0.75)