Skip to content

Commit

Permalink
Merge d071b71 into 7eef60d
Browse files Browse the repository at this point in the history
  • Loading branch information
Tagl committed Mar 3, 2024
2 parents 7eef60d + d071b71 commit 9d327a7
Show file tree
Hide file tree
Showing 2 changed files with 148 additions and 0 deletions.
116 changes: 116 additions & 0 deletions automata/fa/dfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -1824,6 +1824,122 @@ def from_substring(
final_states=final_states if contains else states - final_states,
)

@classmethod
def from_substrings(
    cls: Type[Self],
    input_symbols: AbstractSet[str],
    substrings: AbstractSet[str],
    *,
    contains: bool = True,
    must_be_suffix: bool = False,
) -> Self:
    """
    Directly computes a DFA recognizing strings containing at least one of
    the given substrings.

    The implementation is based on the Aho-Corasick string-searching
    algorithm: the substrings are inserted into a trie, failure links are
    computed breadth-first, and every trie node becomes one DFA state.

    If `contains` is set to `False`, then the complement is constructed
    instead. If `must_be_suffix` is set to `True`, then at least one of the
    substrings must be a suffix of the input instead.

    Parameters
    ----------
    input_symbols : AbstractSet[str]
        The set of input symbols to construct the DFA over.
    substrings : AbstractSet[str]
        The set of strings to be recognized by this DFA. A substring that
        uses a symbol outside of `input_symbols` can never be matched and
        contributes no reachable state.
    contains : bool, default: True
        Whether to recognize the described language (`True`) or its
        complement (`False`).
    must_be_suffix : bool, default: False
        Whether or not the target substrings must be a suffix.

    Returns
    -------
    Self
        The DFA accepting the desired language.
    """

    class Node:
        """Trie node; `label` is the DFA state this node is turned into."""

        __slots__ = ("label", "is_match", "fail", "successors")

        def __init__(self, label: int) -> None:
            self.label: int = label
            # True when some substring is a suffix of this node's path.
            self.is_match: bool = False
            # Failure link: the longest proper suffix of this node's path
            # that is also a path in the trie (None only for the root).
            self.fail: Optional["Node"] = None
            self.successors: Dict[str, "Node"] = {}

    # Build the keyword trie, labelling nodes in creation order.
    root = Node(0)
    node_count = 1
    for substring in substrings:
        node = root
        for symbol in substring:
            child = node.successors.get(symbol)
            if child is None:
                child = Node(node_count)
                node_count += 1
                node.successors[symbol] = child
            node = child
        node.is_match = True

    # Compute failure links breadth-first. A node's failure target is always
    # shallower than the node itself, so its `is_match` flag is already
    # final by the time it is inherited here.
    queue = deque()
    for child in root.successors.values():
        child.fail = root
        # Only relevant when the empty string is one of the substrings.
        child.is_match = child.is_match or root.is_match
        queue.append(child)

    while queue:
        node = queue.popleft()
        for symbol, child in node.successors.items():
            queue.append(child)

            fallback = node.fail
            while fallback is not None and symbol not in fallback.successors:
                fallback = fallback.fail

            child.fail = root if fallback is None else fallback.successors[symbol]
            if child.fail.is_match:
                child.is_match = True

    # Turn the trie plus failure links into a total transition function,
    # visiting only the nodes reachable through symbols of the alphabet.
    transitions: Dict[DFAStateT, Dict[str, DFAStateT]] = {}
    final_states = set()

    queue = deque([root])
    while queue:
        node = queue.popleft()
        if node.is_match:
            final_states.add(node.label)

        row = {}
        for symbol in input_symbols:
            child = node.successors.get(symbol)
            if child is not None:
                queue.append(child)

            # Follow failure links until some node can consume the symbol;
            # falling off the root means no keyword prefix matches.
            fallback = node
            while fallback is not None and symbol not in fallback.successors:
                fallback = fallback.fail
            target = root if fallback is None else fallback.successors[symbol]
            row[symbol] = target.label

        transitions[node.label] = row

    if not must_be_suffix:
        # Once any substring has been seen the input is accepted for good, so
        # every final state is redirected into an absorbing accepting state.
        # `node_count` is larger than every trie label, so it cannot collide
        # with an existing state even if parts of the trie are unreachable.
        end_state = node_count
        transitions[end_state] = {symbol: end_state for symbol in input_symbols}
        for state in final_states:
            transitions[state] = {symbol: end_state for symbol in input_symbols}
        final_states.add(end_state)

    states = frozenset(transitions.keys())
    return cls(
        states=states,
        input_symbols=input_symbols,
        transitions=transitions,
        initial_state=0,
        final_states=final_states if contains else states - final_states,
    )

@classmethod
def from_subsequence(
cls: Type[Self],
Expand Down
32 changes: 32 additions & 0 deletions tests/test_dfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -2133,6 +2133,38 @@ def test_contains_substring(self, as_partial: bool) -> None:
break
self.assertIn("nano", word)

def test_contains_substrings(self) -> None:
    """Should agree with the union of the single-substring DFAs built by
    `DFA.from_substring`, and with the complement when `contains=False`."""
    alphabet = {"a", "n", "o", "b"}

    # A one-element set must give the same DFA as the single-string builder.
    single_dfa = DFA.from_substring(alphabet, "nano")
    multi_dfa = DFA.from_substrings(alphabet, {"nano"})
    self.assertEqual(single_dfa, multi_dfa)

    # Two keywords must match the union of the two single-keyword DFAs.
    single_dfa = single_dfa | DFA.from_substring(alphabet, "banana")
    multi_dfa = DFA.from_substrings(alphabet, {"banana", "nano"})
    self.assertEqual(single_dfa, multi_dfa)

    # `contains=False` must yield the complement of the `contains=True` DFA.
    complement_dfa = ~multi_dfa
    self.assertEqual(
        complement_dfa,
        DFA.from_substrings(alphabet, {"banana", "nano"}, contains=False),
    )

    # Larger keyword set: every word of the form a^i b^j with i, j < 50.
    # NOTE(review): i = j = 0 puts the empty string into the set, which
    # presumably makes both DFAs accept every input -- confirm this still
    # exercises what the test intends.
    max_bs = 50
    max_as = 50
    alphabet = {"a", "b"}
    words = {
        "a" * a_count + "b" * b_count
        for a_count, b_count in product(range(max_as), range(max_bs))
    }

    combined_dfa = DFA.from_substrings(alphabet, words)

    union_dfa = DFA.empty_language(alphabet)
    for word in words:
        union_dfa |= DFA.from_substring(alphabet, word)

    self.assertEqual(combined_dfa, union_dfa)

@params(True, False)
def test_contains_subsequence(self, as_partial: bool) -> None:
"""Should compute the minimal DFA accepting strings with the given
Expand Down

0 comments on commit 9d327a7

Please sign in to comment.