In [1]:
# Load the noun list: one entry per line, surrounding whitespace removed,
# lines that are empty after stripping are skipped.
with open("../lab2/brown_nouns.txt", "r") as f:
    words = [token for token in map(str.strip, f) if token]

In [2]:
class TrieNode:
    """A single node of a character trie.

    Every instance starts out with:
      * children -- an empty dict (used by PrefixTrie as char -> TrieNode),
      * is_end   -- False (set to True by PrefixTrie.insert on a word's last node),
      * freq     -- 0 (incremented by PrefixTrie.insert each time an inserted
                    word's path steps onto this node).
    """

    def __init__(self):
        # Each node gets its own fresh dict; nothing is shared between nodes.
        self.children, self.is_end, self.freq = {}, False, 0

class PrefixTrie:
    """Character trie supporting exact lookup and prefix enumeration."""

    def __init__(self):
        """Create an empty trie consisting of a single root node."""
        self.root = TrieNode()

    def insert(self, word):
        """Add word to the trie.

        Missing nodes along the path are created. The ``freq`` counter of
        every node on the path (the root excluded) is incremented, and the
        final node is flagged with ``is_end``. Inserting the same word twice
        therefore bumps the counters twice.
        """
        node = self.root
        for ch in word:
            if ch not in node.children:
                node.children[ch] = TrieNode()
            node = node.children[ch]
            node.freq += 1
        node.is_end = True

    def _find(self, text):
        """Return the node reached by following text from the root, or None."""
        node = self.root
        for ch in text:
            node = node.children.get(ch)
            if node is None:
                return None
        return node

    def search(self, word):
        """Return True only if exactly word was inserted (not merely a prefix of one)."""
        node = self._find(word)
        return False if node is None else node.is_end

    def starts_with(self, prefix):
        """Return every stored word that begins with prefix.

        The result is a list in depth-first pre-order, children visited in
        the order they were first inserted. An unknown prefix yields [].
        """
        node = self._find(prefix)
        if node is None:
            return []
        results = []
        self._dfs(node, prefix, results)
        return results

    def _dfs(self, node, prefix, results):
        """Append to results every word stored at or below node.

        prefix is the string spelled by the path from the root to node.
        The traversal uses an explicit stack rather than recursion so that
        very long words cannot exhaust the interpreter's recursion limit.
        """
        stack = [(node, prefix)]
        while stack:
            current, text = stack.pop()
            if current.is_end:
                results.append(text)
            # Push children in reverse so they are popped (visited) in
            # insertion order, matching a recursive pre-order walk.
            for ch, child in reversed(list(current.children.items())):
                stack.append((child, text + ch))

class SuffixTrie(PrefixTrie):
    """Trie that stores each word back-to-front.

    Only ``insert`` is overridden. The inherited ``search`` and
    ``starts_with`` do not reverse their argument, so they operate on the
    reversed strings: pass queries already reversed, and expect
    ``starts_with`` to return reversed words.
    """

    def insert(self, word):
        """Insert the reversal of word into the underlying trie."""
        reversed_word = word[::-1]
        super().insert(reversed_word)

In [3]:
def split_word_prefix(word, trie):
    """Split word just after the most-branching node on its trie path.

    The word is followed character by character from ``trie.root``. The
    split point is the position right after the node that has the largest
    number of children; on ties the earliest such node wins. The root's own
    branching is not considered.

    Args:
        word: the string to split.
        trie: object exposing ``root``; every node exposes a ``children``
            dict mapping a character to the next node.

    Returns:
        A ``(head, tail)`` tuple with ``head + tail == word``. ``head`` is
        empty when no node on the path has any children (or word is empty).
        If word leaves the trie part-way, the walk stops there and the best
        split found so far is used instead of raising ``KeyError``.
    """
    node = trie.root
    best_branching = 0
    split_index = 0
    for position, ch in enumerate(word, start=1):
        node = node.children.get(ch)
        if node is None:
            # The rest of the word is not in the trie; nothing more to score.
            break
        branching = len(node.children)
        if branching > best_branching:
            best_branching = branching
            split_index = position
    return word[:split_index], word[split_index:]

def split_word_suffix(word, trie):
    """Split word just before the most-branching node on its reversed path.

    The reversed word is followed character by character from ``trie.root``
    (the trie is expected to hold reversed words, as SuffixTrie.insert
    stores them). The suffix length is the depth of the node with the
    largest number of children; on ties the shallowest such node wins.

    Args:
        word: the string to split (in normal, un-reversed order).
        trie: object exposing ``root``; every node exposes a ``children``
            dict mapping a character to the next node.

    Returns:
        A ``(stem, suffix)`` tuple with ``stem + suffix == word``. When no
        node on the path has any children the suffix is empty and the stem
        is the whole word. If the reversed word leaves the trie part-way,
        the walk stops there and the best split found so far is used
        instead of raising ``KeyError``.
    """
    node = trie.root
    best_branching = 0
    suffix_len = 0
    for depth, ch in enumerate(reversed(word), start=1):
        node = node.children.get(ch)
        if node is None:
            # The rest of the word is not in the trie; nothing more to score.
            break
        branching = len(node.children)
        if branching > best_branching:
            best_branching = branching
            suffix_len = depth
    # Slice from a non-negative index: word[:-0] would be "" rather than
    # the whole word, which mislabels the entire word as the suffix.
    cut = len(word) - suffix_len
    return word[:cut], word[cut:]


In [4]:
# Build one trie over the words as written and one over the reversed words,
# inserting every entry of `words` into both.
prefix_trie, suffix_trie = PrefixTrie(), SuffixTrie()

for w in words:
    prefix_trie.insert(w)
    suffix_trie.insert(w)

# Print both splits for the first 20 entries of the word list.
for w in words[:20]:
    stem_p, suf_p = split_word_prefix(w, prefix_trie)
    stem_s, suf_s = split_word_suffix(w, suffix_trie)
    print(
        f"{w} (prefix) = {stem_p}+{suf_p}",
        f"{w} (suffix) = {stem_s}+{suf_s}",
        sep="\n",
    )


investigation (prefix) = in+vestigation
investigation (suffix) = investigati+on
primary (prefix) = p+rimary
primary (suffix) = primar+y
election (prefix) = e+lection
election (suffix) = electi+on
evidence (prefix) = e+vidence
evidence (suffix) = evidenc+e
irregularities (prefix) = i+rregularities
irregularities (suffix) = irregularitie+s
place (prefix) = p+lace
place (suffix) = plac+e
jury (prefix) = ju+ry
jury (suffix) = jur+y
presentments (prefix) = p+resentments
presentments (suffix) = presentment+s
charge (prefix) = c+harge
charge (suffix) = charg+e
election (prefix) = e+lection
election (suffix) = electi+on
praise (prefix) = p+raise
praise (suffix) = prais+e
thanks (prefix) = t+hanks
thanks (suffix) = thank+s
manner (prefix) = ma+nner
manner (suffix) = mann+er
election (prefix) = e+lection
election (suffix) = electi+on
term (prefix) = te+rm
term (suffix) = ter+m
jury (prefix) = ju+ry
jury (suffix) = jur+y
reports (prefix) = re+ports
reports (suffix) = report+s
irregularities (pref