Merge d071b71 into 7eef60d

caleb531 · Mar 3, 2024 · 9d327a7 · 9d327a7
2 parents 7eef60d + d071b71
commit 9d327a7
Show file tree

Hide file tree

Showing 2 changed files with 148 additions and 0 deletions.
diff --git a/automata/fa/dfa.py b/automata/fa/dfa.py
@@ -1824,6 +1824,122 @@ def from_substring(
             final_states=final_states if contains else states - final_states,
         )
 
+    @classmethod
+    def from_substrings(
+        cls: Type[Self],
+        input_symbols: AbstractSet[str],
+        substrings: AbstractSet[str],
+        *,
+        contains: bool = True,
+        must_be_suffix: bool = False,
+    ) -> Self:
+        """
+        Directly computes a DFA recognizing strings containing at least one of the given substrings.
+        The implementation is based on the Aho-Corasick string-searching algorithm.
+        If `contains` is set to `False`, then the complement is constructed instead.
+        If `must_be_suffix` is set to `True`, then the string must end with one of the substrings instead.
+
+        Parameters
+        ----------
+        input_symbols : AbstractSet[str]
+            The set of input symbols to construct the DFA over.
+        substrings : AbstractSet[str]
+            The set of substrings to be recognized by this DFA.
+        contains : bool, default: True
+            If set to False, the complement DFA is constructed instead.
+        must_be_suffix : bool, default: False
+            Whether or not a target substring must be a suffix.
+
+        Returns
+        -------
+        Self
+            The DFA accepting the desired language.
+        """
+
+        class OutNode:  # linked-list cell holding one keyword that ends at a trie node
+            def __init__(self, keyword: str, next_node: Optional[OutNode]) -> None:
+                self.keyword: str = keyword
+                self.successor: Optional[OutNode] = next_node
+
+        class Node:  # trie node: output chain, failure link and child edges by symbol
+            def __init__(self) -> None:
+                self.out: Optional[OutNode] = None
+                self.fail: Optional[Node] = None
+                self.successors: Dict[str, Node] = {}
+
+        root = Node()
+        labels = {id(root): 0}  # state number per trie node, keyed by identity; root is 0
+        final_states = set()
+        for substring in substrings:  # phase 1: insert every substring into the trie
+            current_node = root
+            for symbol in substring:
+                current_node.successors.setdefault(symbol, Node())  # add the edge if missing
+                current_node = current_node.successors[symbol]
+                labels.setdefault(id(current_node), len(labels))  # number nodes on first visit
+            current_node.out = OutNode(substring, None)  # NOTE(review): "" marks only the root
+
+        queue = deque(root.successors.values())  # phase 2: BFS; depth-1 nodes keep fail=None
+        while queue:
+            current_node = queue.popleft()
+            for symbol, successor in current_node.successors.items():
+                queue.append(successor)
+
+                st = current_node.fail
+                while st is not None and symbol not in st.successors:  # walk the failure chain
+                    st = st.fail
+
+                if st is None:  # a fail link of None stands for the root
+                    st = root
+
+                successor.fail = st.successors.get(symbol, None)  # None if root lacks the edge
+
+                if successor.fail is not None:  # inherit the outputs of the failure target
+                    if successor.out is None:
+                        successor.out = successor.fail.out
+                    else:
+                        out = successor.out
+                        while out.successor is not None:  # find the tail of the own chain
+                            out = out.successor
+                        out.successor = successor.fail.out
+
+        transitions: Dict[DFAStateT, Dict[str, DFAStateT]] = {}
+
+        queue = deque([root])  # phase 3: BFS over the trie to emit one DFA row per node
+        while queue:
+            current_node = queue.popleft()
+            state = labels[id(current_node)]
+            if current_node.out is not None:  # some substring ends here
+                final_states.add(state)
+            current_transitions = {}
+            for symbol in input_symbols:
+                if symbol in current_node.successors:  # only edges on input symbols are followed
+                    queue.append(current_node.successors[symbol])
+                parent_node = current_node
+                while parent_node is not None and symbol not in parent_node.successors:
+                    parent_node = parent_node.fail
+                if parent_node is None:
+                    parent_node = root
+                successor = parent_node.successors.get(symbol, root)  # no edge anywhere: restart
+                current_transitions[symbol] = labels[id(successor)]
+
+            transitions[state] = current_transitions
+
+        if not must_be_suffix:  # once a match is seen, the rest of the input is irrelevant
+            end_state = len(transitions)  # NOTE(review): fresh only if all labels were visited
+            transitions[end_state] = {symbol: end_state for symbol in input_symbols}
+            for state in final_states:  # matching states now lead to the absorbing state
+                transitions[state] = {symbol: end_state for symbol in input_symbols}
+            final_states.add(end_state)
+
+        states = frozenset(transitions.keys())
+        return cls(
+            states=states,
+            input_symbols=input_symbols,
+            transitions=transitions,
+            initial_state=0,
+            final_states=final_states if contains else states - final_states,  # complement
+        )
+
     @classmethod
     def from_subsequence(
         cls: Type[Self],

diff --git a/tests/test_dfa.py b/tests/test_dfa.py
@@ -2133,6 +2133,38 @@ def test_contains_substring(self, as_partial: bool) -> None:
                 break
             self.assertIn("nano", word)
 
+    def test_contains_substrings(self) -> None:  # from_substrings vs. unions of from_substring
+        input_symbols = {"a", "n", "o", "b"}
+        substring_dfa = DFA.from_substring(input_symbols, "nano")
+        substrings_dfa = DFA.from_substrings(input_symbols, {"nano"})
+
+        self.assertEqual(substring_dfa, substrings_dfa)  # single-substring case
+
+        substring_dfa = substring_dfa | DFA.from_substring(input_symbols, "banana")
+        substrings_dfa = DFA.from_substrings(input_symbols, {"banana", "nano"})
+
+        self.assertEqual(substring_dfa, substrings_dfa)  # two substrings equal the union
+
+        self.assertEqual(  # contains=False is compared against the complemented DFA
+            ~substrings_dfa, DFA.from_substrings(input_symbols, {"banana", "nano"},
+                                                 contains=False)
+        )
+
+        m = 50
+        n = 50
+        input_symbols = {"a", "b"}
+        language = {("a" * i + "b" * j) for i, j in product(range(n), range(m))}  # has "" too
+
+        equiv_dfa = DFA.from_substrings(
+            input_symbols, language,
+        )
+
+        res_dfa = DFA.empty_language(input_symbols)
+        for string in language:  # reference result: union of the per-substring DFAs
+            res_dfa |= DFA.from_substring(input_symbols, string)
+
+        self.assertEqual(equiv_dfa, res_dfa)  # NOTE(review): "" may make this trivially true
+
+
     @params(True, False)
     def test_contains_subsequence(self, as_partial: bool) -> None:
         """Should compute the minimal DFA accepting strings with the given