Skip to content

Commit

Permalink
Implemented search_all (including overlaps)
Browse files Browse the repository at this point in the history
  • Loading branch information
FrederikP committed Mar 8, 2018
1 parent c5dd590 commit fb68813
Show file tree
Hide file tree
Showing 6 changed files with 23 additions and 37 deletions.
6 changes: 2 additions & 4 deletions README.md
Expand Up @@ -25,10 +25,7 @@ suitable for really large sets of keywords') which really was the case the last
### Differences

- Compared to [pyahocorasick](https://github.com/WojciechMula/pyahocorasick/) our library supports unicode in python 2.7 just like [py-aho-corasick](https://github.com/JanFan/py-aho-corasick).
We don't use any C-Extension so the library is not platform dependant.

- We don't look for multiple results, we are okay with finding only one. We might extend this in the future.
For our use cases that was sufficient and helps with performance.
We don't use any C extension, so the library is not platform-dependent.

- On top of the standard Aho-Corasick longest suffix search, we also perform a shortcutting routine at the end, so
that our lookup is fast, while the setup takes longer. During setup we go through the states and directly add transitions that are
Expand Down Expand Up @@ -108,6 +105,7 @@ for result in results:
Prints :
```python
('mallorca', 11)
('orca', 15)
('mallorca bella', 11)
('lacrosse', 23)
```
Expand Down
Binary file modified img/readme_example.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
39 changes: 17 additions & 22 deletions src/ahocorapy/keywordtree.py
Expand Up @@ -18,15 +18,15 @@

class State(object):
__slots__ = ['identifier', 'symbol', 'success', 'transitions', 'parent',
'matched_keyword', 'longest_strict_suffix']
'matched_keywords', 'longest_strict_suffix']

def __init__(self, identifier, symbol=None, parent=None, success=False):
self.symbol = symbol
self.identifier = identifier
self.transitions = {}
self.parent = parent
self.success = success
self.matched_keyword = None
self.matched_keywords = []
self.longest_strict_suffix = None


Expand Down Expand Up @@ -74,7 +74,7 @@ def add(self, keyword):
current_state.transitions[char] = next_state
current_state = next_state
current_state.success = True
current_state.matched_keyword = original_keyword
current_state.matched_keywords.append(original_keyword)

def search(self, text):
'''
Expand All @@ -84,30 +84,22 @@ def search(self, text):

def search_one(self, text):
'''
Search a text for any occurence of a keyword.
Search a text for any occurrence of any added keyword.
Returns when one keyword has been found.
Can only be called after finalize() has been called.
O(n) with n = len(text)
@return: 2-Tuple with keyword and startindex in text.
Or None if no keyword was found in the text.
'''
if not self._finalized:
raise ValueError('KeywordTree has not been finalized.' +
' No search allowed. Call finalize() first.')
if self._case_insensitive:
text = text.lower()
current_state = self._zero_state
for idx, symbol in enumerate(text):
current_state = current_state.transitions.get(
symbol, self._zero_state.transitions.get(symbol,
self._zero_state))
if current_state.success:
keyword = current_state.matched_keyword
return (keyword, idx + 1 - len(keyword))
result_gen = self.search_all(text)
try:
return next(result_gen)
except StopIteration:
return None

def search_all(self, text):
'''
Search a text for any occurence of a keyword.
Search a text for all occurrences of the added keywords.
Can only be called after finalize() has been called.
O(n) with n = len(text)
@return: Generator used to iterate over the results.
Expand All @@ -124,8 +116,8 @@ def search_all(self, text):
symbol, self._zero_state.transitions.get(symbol,
self._zero_state))
if current_state.success:
keyword = current_state.matched_keyword
yield (keyword, idx + 1 - len(keyword))
for keyword in current_state.matched_keywords:
yield (keyword, idx + 1 - len(keyword))

def finalize(self):
'''
Expand Down Expand Up @@ -172,10 +164,13 @@ def search_lss(self, state):
if traversed.longest_strict_suffix is None:
self.search_lss(traversed)
traversed = traversed.longest_strict_suffix

if state.longest_strict_suffix.success:
state.success = True
state.matched_keywords.extend(state.longest_strict_suffix.matched_keywords)

for symbol, next_state in\
state.longest_strict_suffix.transitions.items():
if (symbol not in state.transitions and
state.longest_strict_suffix != self._zero_state)\
or next_state.success:
state.longest_strict_suffix != self._zero_state):
state.transitions[symbol] = next_state
2 changes: 1 addition & 1 deletion src/ahocorapy_visualizer/visualizer.py
Expand Up @@ -17,7 +17,7 @@ def _add_state_and_children(self, graph, state, added_state_ids):
state.identifier,
color='green',
label=str(state.identifier) + ' [' +
state.matched_keyword + ']')
','.join(state.matched_keywords) + ']')
else:
graph.add_node(state.identifier)
added_state_ids.add(state.identifier)
Expand Down
2 changes: 2 additions & 0 deletions tests/ahocorapy_test.py
Expand Up @@ -43,6 +43,7 @@ def test_readme_example(self):
kwtree.add('lacrosse')
kwtree.add('mallorca')
kwtree.add('mallorca bella')
kwtree.add('orca')
kwtree.finalize()

result = kwtree.search('My favorite islands are malaga and sylt.')
Expand All @@ -55,6 +56,7 @@ def test_readme_example(self):
results = kwtree.search_all('malheur on mallorca bellacrosse')
self.assertIsNotNone(results)
self.assertEqual(('mallorca', 11), next(results))
self.assertEqual(('orca', 15), next(results))
self.assertEqual(('mallorca bella', 11), next(results))
self.assertEqual(('lacrosse', 23), next(results))
with self.assertRaises(StopIteration):
Expand Down
11 changes: 1 addition & 10 deletions tests/visualizer_test.py
Expand Up @@ -14,18 +14,9 @@ def test_visualizer(self):
kwtree.add('lacrosse')
kwtree.add('mallorca')
kwtree.add('mallorca bella')
kwtree.add('orca')
kwtree.finalize()

result = kwtree.search('My favorite islands are malaga and sylt.')
self.assertEqual(('malaga', 24), result)

result = kwtree.search(
'idontlikewhitespaceswhereismalacrossequestionmark')
self.assertEqual(('lacrosse', 29), result)

result = kwtree.search('crossing on mallorca bella')
self.assertEqual(('mallorca', 12), result)

visualizer = Visualizer()
visualizer.draw('readme_example.png', kwtree)

Expand Down

0 comments on commit fb68813

Please sign in to comment.