Agnieszka Dutka

# Text algorithms, lab 2
### *Trie vs Tree structure*
Skip to Contents:
[Suffix trie](#trie)
[Suffix Tree](#tree) 
[Comparison](#tests)

<a id='trie'></a>
##  --Suffix Trie--

#### Useful structures

In [2]:
class Node:
    """A single node of a character trie.

    Attributes:
        c: label of this node (the root is created with "").
        parent: parent node, or None for a root.
        depth: depth passed by the creator; add_child passes parent depth + 1.
        children: mapping from a label to the child node carrying that label.
        link: reference to another node, None until assigned from outside.
        idx: sequential number handed out by next_idx() at creation time.
    """

    # Shared creation counter; next_idx() resets it whenever a parentless
    # node (a new root) is created, so it only counts the most recent trie.
    idx = -1

    def __init__(self, c, parent=None, depth=0):
        self.c = c
        self.parent = parent
        self.depth = depth
        self.children = {}
        self.link = None
        self.idx = self.next_idx()

    def child(self, c):
        """Return the child labelled c, or None when there is no such child."""
        return self.children.get(c, None)

    def add_child(self, c):
        """Create and return a child labelled c.

        If such a child already exists, a notice is printed and the existing
        child is returned unchanged.
        """
        if c in self.children:
            print("child already exists!")
            return self.children[c]
        self.children[c] = Node(c, self, self.depth+1)
        return self.children[c]

    def find(self, word):
        """Return True when word can be spelled by walking down from self.

        word is any iterable of labels (typically a str); an empty word is
        always found. The walk is iterative so that long words neither hit
        the interpreter recursion limit nor get re-sliced at every step.
        """
        node = self
        for symbol in word:
            node = node.child(symbol)
            if node is None:
                return False
        return True

    def next_idx(self):
        """Return the next sequential index (index of the last node = size of trie - 1).

        The counter is class-wide, so the numbering is only meaningful while a
        single trie is being built at a time.
        """
        if not self.parent:
            # A parentless node starts a new trie: restart the numbering.
            Node.idx = -1
        Node.idx += 1
        return Node.idx

    def __repr__(self):
        return f"{self.c}-{self.idx}"

#### Suffix Trie implementation

In [3]:
from queue import LifoQueue as queue 

def up_link_down(sibling):
    """Climb to the nearest linked ancestor, follow its link, then walk back down.

    Starting at `sibling`, parents are followed until a node with a link is
    reached; the labels passed on the way up are remembered. From the link
    target, those labels are replayed downwards for as long as matching
    children exist. Every node re-visited on the original path gets its
    `link` set to the node reached in parallel on the linked path.

    Returns:
        (node, sibling): the deepest node reached below the link target and
        the node on the original path that corresponds to it, or
        (None, None) when no ancestor (including the start) has a link.
    """
    # Labels collected bottom-up; replaying them top-down means reading the
    # list in reverse. A plain list is used as the stack because a blocking
    # queue would wait forever once it is empty.
    climbed = []
    while sibling and not sibling.link:
        climbed.append(sibling.c)
        sibling = sibling.parent
    if not sibling:
        return (None, None)
    node = sibling.link
    for letter in reversed(climbed):
        below = node.child(letter)
        if not below:
            # The linked path ends here; the caller extends it from `node`.
            break
        node = below
        sibling = sibling.child(letter)
        sibling.link = node
    return (node, sibling)

def graft(node, fragment, sibling=None):
    """Hang the symbols of `fragment` below `node` as a chain of children.

    Each symbol is attached with add_child, one level deeper than the
    previous one. When `sibling` is given it is walked down along the same
    symbols, and the `link` of every sibling node visited is pointed at the
    chain node created for the same symbol.

    Returns the last node of the chain (`node` itself for an empty fragment).
    """
    tip = node
    twin = sibling
    for symbol in fragment:
        tip = tip.add_child(symbol)
        if twin:
            twin = twin.child(symbol)
            twin.link = tip
    return tip

""" building trie """
def left_to_right(text):
    """Build a suffix trie of `text`, inserting suffixes from longest to shortest.

    The whole text is grafted below a fresh root first. For every following
    start position i, up_link_down locates (from the end of the previous
    suffix) the node from which the suffix text[i:] has to be extended, and
    graft appends the part of the suffix that lies below that node.

    Returns the root node; for an empty text this is a root without children.
    """
    root = Node("")
    if not text:
        # Nothing to insert, and text[0] below would raise IndexError.
        return root
    leaf = graft(root, text)
    root.child(text[0]).link = root
    for i in range(1, len(text)):
        head, sibling = up_link_down(leaf)
        # head.depth symbols of text[i:] are already on the path to head.
        leaf = graft(head, text[i + head.depth:], sibling)
        # The new leaf lies below root.child(text[i]); linking that node to
        # the root gives the next up_link_down a linked ancestor to stop at.
        root.child(text[i]).link = root
    return root


#### Suffix Trie checker

In [4]:
def check(trie, text, w_print=False):
    """Verify that every suffix of `text` can be found in `trie`.

    With w_print false the verdict is returned as a bool. With w_print true
    the verdict is printed instead (the first missing suffix, or a success
    message with the node count) and None is returned.
    """
    suffixes = (text[start:] for start in range(len(text)))
    # Suffixes of a non-empty range are never empty, so None marks "all found".
    missing = next((s for s in suffixes if not trie.find(s)), None)
    if missing is not None:
        if not w_print:
            return False
        print(f"suffix {missing} not found")
        return None
    if not w_print:
        return True
    print(f"trie correct\n{Node.idx+1} nodes created")
    return None

#### Tests

In [16]:
# Build a trie for each sample text and print the verification result.
for text in ("bbbd", "aabbabd", "ababcd"):
    trie = left_to_right(text)
    check(trie, text, True)

trie correct
8 nodes created
trie correct
24 nodes created
trie correct
19 nodes created


<a id='tree'></a>
## --Suffix Tree--

<a id='tests'></a>
## --Comparison--

#### Running time evaluation

In [28]:
from time import perf_counter
def time_eval(func, args, w_print=False, name=None, count=5):
    """Measure the average wall-clock time of func(*args) over `count` calls.

    Returns the average in seconds as reported by perf_counter, or, when
    w_print is true, prints it prefixed with `name` and returns None.
    """
    began = perf_counter()
    for _ in range(count):
        func(*args)
    finished = perf_counter()
    avg = (finished - began) / count
    if not w_print:
        return avg
    print(f"{name} average time: {avg}")

In [27]:
# Verify the trie for a short text, then time its construction
# (average over time_eval's default number of runs).
text = "bbbd"
trie = left_to_right(text)
check(trie, text, True)
time_eval(left_to_right, [text], True, "Trie")

trie correct
8 nodes created
Trie average time: 0.00014986000001044887


In [6]:
# Build the trie for "aabbabd" and print the verification result.
text = "aabbabd"
trie = left_to_right(text)
check(trie, text, True)

trie correct
24 nodes created


In [7]:
# Build the trie for "ababcd"; check() returns the verdict as a bool here
# because w_print is left at its default.
text = "ababcd"
trie = left_to_right(text)
check(trie, text)

True

In [8]:
# Build the trie for "abcbccd"; check() returns the verdict as a bool here
# because w_print is left at its default.
text = "abcbccd"
trie = left_to_right(text)
check(trie, text)


True

In [10]:
# Build and verify the trie for the first 1000 characters of a text file.
# The context manager closes the file; only the characters that are used
# are read.
with open("1997_714.txt", "r", encoding='utf-8') as f:
    text = f.read(1000)
# ESC is appended as the final character.
# NOTE(review): presumably meant as a terminator that does not occur in the
# file itself -- confirm.
text += chr(27)
trie = left_to_right(text)
check(trie, text, True)

trie correct
474636 nodes created
