In [1]:
import time
# Sentinel character appended to every indexed text by the `_validate_and_add` helpers below.
# It is taken from a Unicode Private Use Area (https://en.wikipedia.org/wiki/Private_Use_Areas),
# so it is assumed not to occur anywhere else in the input texts.
terminal = chr(0xF0000)

# Trie

In [2]:
class Trie:
    """Uncompressed suffix trie: one node per character of every suffix.

    The text is terminated with the module-level ``terminal`` sentinel and all
    of its suffixes are inserted one after another.  Links between nodes
    (the node spelling ``c + w`` points to the node spelling ``w``) are used to
    find, for every suffix, the part of it that is already present in the trie.
    """

    class _Node:
        """A trie node reached from its parent by a single character."""

        def __init__(self, letter, parent=None, children=None, link=None, pref_len=0):
            self.letter = letter                        # character on the edge parent -> self (None for root)
            self.parent = parent
            self.children = children if children else {}  # letter -> child node
            self.link = link                            # node spelling this node's string minus its first char
            self.len = pref_len                         # number of characters on the path root -> self

        def add(self, child):
            """Attach ``child`` under its letter; refuse to overwrite an existing child."""
            if child.letter in self.children:
                raise RuntimeError('Undefined behavior')
            self.children[child.letter] = child

        def __getitem__(self, key):
            return self.children[key]

        def __contains__(self, key):
            return key in self.children

        def __len__(self):
            # NOTE: this makes a zero-length node (the root) falsy, so nodes
            # must be compared with ``is None`` rather than tested for truth.
            return self.len

    def __init__(self, text=None):
        """Create an empty trie and, if ``text`` is non-empty, index it."""
        self.root = self._Node(letter=None, pref_len=0)
        self.root.link = self.root

        if text:
            self.build(text)

    def _validate_and_add(self, text):
        """Return ``text`` guaranteed to end with the ``terminal`` sentinel.

        ``str.endswith`` is used instead of ``text[-1]`` so that an empty
        string is handled instead of raising ``IndexError``.
        """
        return text if text.endswith(terminal) else text + terminal

    def build(self, text):
        """Insert every suffix of ``text`` (sentinel appended if missing)."""
        text = self._validate_and_add(text)
        head = self.root

        for suffix_start in range(len(text)):
            # `head` spells the prefix of this suffix that is already in the trie.
            leaf = self.add(text, suffix_start, head)
            head = self.up_link_down(leaf)

    def add(self, text, suffix_start, head):
        """Append the not-yet-present tail of suffix ``suffix_start`` below ``head``.

        The first ``len(head)`` characters of the suffix are skipped because
        ``head`` already spells them.  Returns the last node created.
        """
        leaf = head
        for pos in range(suffix_start + len(head), len(text)):
            new_node = self._Node(parent=leaf, letter=text[pos], pref_len=len(leaf) + 1)
            leaf.add(new_node)
            leaf = new_node

        return leaf

    def up_link_down(self, leaf):
        """Find the head for the next suffix, setting missing links on the way.

        Climbs from ``leaf`` to the nearest ancestor that has a link while
        recording the letters passed, jumps through that link, then walks back
        down along the recorded letters for as long as the linked side has a
        matching child, assigning the link of every node passed.

        Returns the deepest node reached on the linked side.
        """
        letters = []  # used as a stack; a plain list is much faster than LifoQueue here
        prev_branch = leaf
        while prev_branch.link is None:
            letters.append(prev_branch.letter)
            prev_branch = prev_branch.parent

        new_head = prev_branch.link
        if prev_branch is self.root:
            # The first letter below the root maps onto the empty string, i.e. the root.
            first = letters.pop()
            self.root[first].link = self.root
            prev_branch = self.root[first]

        while letters and letters[-1] in new_head:
            step = letters.pop()
            prev_branch = prev_branch[step]
            new_head = new_head[step]
            prev_branch.link = new_head
        return new_head

    def find_prefix(self, text, suffix_start, start_node=None):  # used earlier
        """Walk down from ``start_node`` (default: root) along ``text[suffix_start:]``.

        Returns ``(node, index)`` where ``node`` is the last node matched and
        ``index`` is the position of the first character of ``text`` that could
        not be matched (``len(text)`` when everything matched).
        """
        # Explicit None check: nodes define __len__, so a truthiness test
        # would silently discard a zero-length start node.
        node = self.root if start_node is None else start_node
        for pos in range(suffix_start, len(text)):
            if text[pos] not in node:
                return node, pos
            node = node[text[pos]]
        return node, len(text)

    def __contains__(self, suffix):
        """True if ``suffix`` can be spelled starting from the root."""
        return self.find_prefix(suffix, 0)[1] == len(suffix)

In [5]:
import random
# Small hand-written inputs used by test() below.
TEST1 = 'bbb$'
TEST2 = 'aabbabd'
TEST3 = 'ababcd'
TEST4 = 'abcbccd'
# Upper bound on how many suffixes test() checks for a single text.
max_test_num = 1000

# Large input: the whole content of '1997_714.txt', resolved relative to the
# current working directory (the file must exist for this cell to run).
with open('1997_714.txt') as f:
    TEST5 = f.read()

def test(test_class, num_tests=None):
    num_tests = num_tests if num_tests else 5
    for test_num, s in enumerate([TEST1, TEST2, TEST3, TEST4, TEST5][:num_tests]):
        print(f'Building tree for TEST{test_num+1}')
        %timeit index = test_class(s)
        index = test_class(s)
        n = min(max_test_num, len(s))
        if max_test_num < len(s):
            for i in range(0, n):
                if not s[random.randint(0, len(s)):] in index:
                    return False
        else:
            for i in range(0, n):
                if not s[i:] in index:
                    return False
            
        if s is TEST5:
            if TEST1 in index:
                return False
        else:
            if TEST5 in index:
                return False
        print(f'Test num {test_num+1} passed')
    return True

In [6]:
# Only the four small texts are used here: test(Trie, 5) also works in principle,
# but building the trie for TEST5 takes forever and needs more RAM than is available.
print('Tests passed' if test(Trie, 4) else 'Tests failed')

Building tree for TEST1
28.1 µs ± 863 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Test num 1 passed
Building tree for TEST2
59.4 µs ± 2.2 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Test num 2 passed
Building tree for TEST3
48.2 µs ± 207 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Test num 3 passed
Building tree for TEST4
63 µs ± 3.68 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Test num 4 passed
Tests passed


# McCreight's algorithm

In [7]:
class SuffixTree:
    """Compressed suffix tree built suffix by suffix with suffix links.

    The indexed text is terminated with the module-level ``terminal``
    sentinel.  Nodes do not store edge labels; an edge label is recovered from
    the text through the node's ``start`` and ``len`` fields.
    """

    class _Node:
        """Tree node; ``len`` is the length of the string spelled from the root."""

        def __init__(self, letter=None, suffix_start=-1, pref_len=-1, parent=None, children=None):
            self.letter = letter if letter else ''        # first character of the edge parent -> self
            self.parent = parent
            self.start = suffix_start                     # start of a suffix whose path runs through this node
            self.len = pref_len                           # length of the string from the root to the end of this node
            self.children = children if children else {}  # first edge character -> child node
            self.suffix_link = None

        def __len__(self):
            # String depth of the node, not the number of children.
            return self.len

        def __contains__(self, key):
            return key in self.children

        def __getitem__(self, key):
            return self.children[key]

        def add(self, child):
            """Attach ``child`` under its first edge character, replacing any previous child."""
            self.children[child.letter] = child

    def __init__(self, text=None):
        """Create the root and index ``text`` unless it is None."""
        self.root = self._Node(suffix_start=0, pref_len=0)
        self.root.suffix_link = self.root
        self.root.parent = self.root
        self.text = ''  # replaced by the terminated text once build() finishes

        if text is not None:
            self.build(text)

    def _validate_and_add(self, text):
        """Return ``text`` guaranteed to end with the ``terminal`` sentinel.

        ``str.endswith`` is used instead of ``text[-1]`` so that an empty
        string is accepted instead of raising ``IndexError``.
        """
        return text if text.endswith(terminal) else text + terminal

    def build(self, text):
        """Insert all suffixes of ``text`` (sentinel appended if missing)."""
        text = self._validate_and_add(text)

        head = self.root
        pref_len = 0  # length of the already matched prefix of the i-th suffix

        for i in range(len(text)):
            # Scan further down only when we stand exactly on a node and the
            # next character of the suffix continues along one of its edges.
            if pref_len == len(head) and text[i + pref_len] in head:
                head, pref_len = self.slow_find(text, i, head, pref_len)

            # The match ended inside an edge: materialise a node at that point.
            if len(head) > pref_len:
                head = self.split_node(text, head, pref_len)
            self.create_leaf(text, i, head, pref_len)

            if head.suffix_link is None:
                self.fast_find(text, head, pref_len)
            head = head.suffix_link
            pref_len = len(head)  # generally max(0, pref_len - 1), by Lemma 1 of the original paper

        self.text = text

    def slow_find(self, text, cur_start, head, pref_len):
        """Extend the match of suffix ``cur_start`` character by character.

        Starts at ``head`` with ``pref_len`` characters already matched and
        returns ``(node, matched_length)``; the match ends inside the edge
        leading to ``node`` when ``matched_length < len(node)``.
        """
        # Jump to the next node while the match ends exactly on a node.
        while pref_len == len(head) and text[cur_start + pref_len] in head:
            head = head[text[cur_start + pref_len]]
            pref_len += 1
            # Compare along the edge until a mismatch or the end of the node.
            while pref_len < len(head) and text[cur_start + pref_len] == text[head.start + pref_len]:
                pref_len += 1
        return head, pref_len

    def split_node(self, text, node, pref_len):
        """Insert a new node at string depth ``pref_len`` on the edge above ``node``.

        The new node takes ``node``'s place under its parent and ``node``
        becomes the new node's child.  Returns the new node.
        """
        parent = node.parent
        middle = self._Node(suffix_start=node.start, pref_len=pref_len,
                            letter=text[node.start + len(parent)], parent=parent)
        parent.add(middle)  # same first character, so this replaces `node` in parent
        node.parent = middle
        node.letter = text[node.start + pref_len]
        middle.add(node)

        return middle

    def create_leaf(self, text, suffix_start, head, pref_len):
        """Hang the leaf for suffix ``suffix_start`` under ``head`` and return it."""
        leaf = self._Node(suffix_start=suffix_start, pref_len=len(text) - suffix_start,
                          letter=text[suffix_start + pref_len])
        leaf.parent = head
        head.add(leaf)
        return leaf

    def fast_find(self, text, head, pref_len):
        """Compute and store ``head.suffix_link``.

        Starts from the suffix link of ``head``'s parent and descends by
        looking only at the first character of each edge until string depth
        ``pref_len - 1`` is reached; if that depth falls inside an edge, the
        edge is split there.
        """
        target_depth = pref_len - 1
        next_head = head.parent.suffix_link

        while len(next_head) < target_depth:
            next_head = next_head[text[head.start + len(next_head) + 1]]
        if len(next_head) > target_depth:
            next_head = self.split_node(text, next_head, target_depth)
        head.suffix_link = next_head

    def __contains__(self, key):
        """True if ``key`` is a prefix of some indexed suffix, i.e. a substring."""
        node = self.root

        for depth, char in enumerate(key):
            if depth == len(node):
                # Standing on a node: follow the edge starting with `char`.
                if char not in node:
                    return False
                node = node[char]
            elif char != self.text[node.start + depth]:
                # Inside an edge: compare against the text the edge refers to.
                return False
        return True

In [8]:
# Run all five test texts against the McCreight suffix tree and report the verdict.
print('Tests passed' if test(SuffixTree, num_tests=5) else 'Tests failed')

Building tree for TEST1
19.8 µs ± 559 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Test num 1 passed
Building tree for TEST2
33.3 µs ± 1.58 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Test num 2 passed
Building tree for TEST3
27.6 µs ± 999 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Test num 3 passed
Building tree for TEST4
29.4 µs ± 1.23 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Test num 4 passed
Building tree for TEST5
1.31 s ± 132 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Test num 5 passed
Tests passed


# Degraded McCreight (no suffix links)

In [9]:
class DegradedSuffixTree:
    """Compressed suffix tree built naively, without using suffix links.

    Every suffix is inserted by matching it from the root character by
    character.  The indexed text is terminated with the module-level
    ``terminal`` sentinel, and edge labels are recovered from the text through
    each node's ``start`` and ``len`` fields.
    """

    class _Node:
        """Tree node; ``len`` is the length of the string spelled from the root."""

        def __init__(self, letter=None, suffix_start=-1, pref_len=-1, parent=None, children=None):
            self.letter = letter if letter else ''        # first character of the edge parent -> self
            self.parent = parent
            self.start = suffix_start                     # start of a suffix whose path runs through this node
            self.len = pref_len                           # length of the string from the root to the end of this node
            self.children = children if children else {}  # first edge character -> child node
            self.suffix_link = None                       # only ever set on the root in this variant

        def __len__(self):
            # String depth of the node, not the number of children.
            return self.len

        def __contains__(self, key):
            return key in self.children

        def __getitem__(self, key):
            return self.children[key]

        def add(self, child):
            """Attach ``child`` under its first edge character, replacing any previous child."""
            self.children[child.letter] = child

    def __init__(self, text=None):
        """Create the root and index ``text`` unless it is None."""
        self.root = self._Node(suffix_start=0, pref_len=0)
        self.root.suffix_link = self.root
        self.root.parent = self.root
        self.text = ''  # replaced by the terminated text once build() finishes

        if text is not None:
            self.build(text)

    def _validate_and_add(self, text):
        """Return ``text`` guaranteed to end with the ``terminal`` sentinel.

        ``str.endswith`` is used instead of ``text[-1]`` so that an empty
        string is accepted instead of raising ``IndexError``.
        """
        return text if text.endswith(terminal) else text + terminal

    def build(self, text):
        """Insert all suffixes of ``text`` (sentinel appended if missing)."""
        text = self._validate_and_add(text)

        for i in range(len(text)):
            # Always restart from the root with nothing matched; slow_find
            # returns its arguments unchanged when no edge can be followed.
            head, pref_len = self.slow_find(text, i, self.root, 0)

            # The match ended inside an edge: materialise a node at that point.
            if len(head) > pref_len:
                head = self.split_node(text, head, pref_len)
            self.create_leaf(text, i, head, pref_len)

        self.text = text

    def slow_find(self, text, cur_start, head, pref_len):
        """Extend the match of suffix ``cur_start`` character by character.

        Starts at ``head`` with ``pref_len`` characters already matched and
        returns ``(node, matched_length)``; the match ends inside the edge
        leading to ``node`` when ``matched_length < len(node)``.
        """
        # Jump to the next node while the match ends exactly on a node.
        while pref_len == len(head) and text[cur_start + pref_len] in head:
            head = head[text[cur_start + pref_len]]
            pref_len += 1
            # Compare along the edge until a mismatch or the end of the node.
            while pref_len < len(head) and text[cur_start + pref_len] == text[head.start + pref_len]:
                pref_len += 1
        return head, pref_len

    def split_node(self, text, node, pref_len):
        """Insert a new node at string depth ``pref_len`` on the edge above ``node``.

        The new node takes ``node``'s place under its parent and ``node``
        becomes the new node's child.  Returns the new node.
        """
        parent = node.parent
        middle = self._Node(suffix_start=node.start, pref_len=pref_len,
                            letter=text[node.start + len(parent)], parent=parent)
        parent.add(middle)  # same first character, so this replaces `node` in parent
        node.parent = middle
        node.letter = text[node.start + pref_len]
        middle.add(node)

        return middle

    def create_leaf(self, text, suffix_start, head, pref_len):
        """Hang the leaf for suffix ``suffix_start`` under ``head`` and return it."""
        leaf = self._Node(suffix_start=suffix_start, pref_len=len(text) - suffix_start,
                          letter=text[suffix_start + pref_len])
        leaf.parent = head
        head.add(leaf)
        return leaf

    def __contains__(self, key):
        """True if ``key`` is a prefix of some indexed suffix, i.e. a substring."""
        node = self.root

        for depth, char in enumerate(key):
            if depth == len(node):
                # Standing on a node: follow the edge starting with `char`.
                if char not in node:
                    return False
                node = node[char]
            elif char != self.text[node.start + depth]:
                # Inside an edge: compare against the text the edge refers to.
                return False
        return True

In [None]:
# Run all five test texts against the suffix tree built without suffix links.
print('Tests passed' if test(DegradedSuffixTree, num_tests=5) else 'Tests failed')

Building tree for TEST1
18.2 µs ± 596 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Test num 1 passed
Building tree for TEST2
27.9 µs ± 171 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Test num 2 passed
Building tree for TEST3
22.7 µs ± 1.31 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Test num 3 passed
Building tree for TEST4
27.4 µs ± 1.54 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Test num 4 passed
Building tree for TEST5
4.29 s ± 262 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
