# Algorytmy tekstowe - laboratorium 2 
## Implementacja trie i suffix tree
Aneta Porębska gr. wtorek 17.50A

### Trie

In [1]:
class TrieNode:
    
    def __init__(self, letter):
        self.children = dict()
        self.letter = letter


class Trie:

    def __init__(self, root):
        self.root = root

    def find(self, suffix, root):
        if suffix[0] in root.children.keys():
            return self.find(suffix[1:], root.children[suffix[0]])
        else:
            return (root, suffix)

    def add(self, suffix, node):
        last = node
        curr = None

        for l in suffix:
            curr = TrieNode(l)
            last.children[l] = curr
            last = curr
        return True


def compute_initial_trie(text):
    root = TrieNode(None)
    last = root
    curr = None
    for i in range(len(text)):
        curr = TrieNode(text[i])
        last.children[text[i]] = curr
        last = curr
    return root


def build_trie(text):
    root = compute_initial_trie(text)
    trie = Trie(root)
    suffix = text[1:]
    while suffix:
        node, suf = trie.find(suffix, trie.root)
        trie.add(suf, node)
        suffix = suffix[1:]
    return trie


def traverse_trie(root, depth):
    print(root.letter, depth, root.children.keys())
    if root.children:
        for key in root.children.keys():
            traverse_trie(root.children[key], depth+1)


def find_pattern_trie_(pattern, root):
    if pattern == "":
        return True

    if not root.children:
        return False

    if pattern[0] in root.children.keys():
        return find_pattern_trie_(pattern[1:], root.children[pattern[0]])
    else:
        return False

    
def find_pattern_trie(text, pattern):
    trie = build_trie(text)
    root = trie.root
    return find_pattern_trie_(pattern, root)
    
    

### Suffix tree (slow_find)

In [2]:
class Node:
    def __init__(self, start, end, depth, parent):
        self.start = start
        self.end = end
        self.children = dict()
        self.parent = parent
        self.depth = depth

    # st - indeks startowy w text, w którym label zaczyna się w text
    # idx - indeks w label w którym należy rozbić Node
    def break_path(self, label, st, idx, text):
        child = self.children[label[0]]
        prev_end = child.end
        child.end = child.start + idx
        node_prev = Node(child.end, prev_end, child.depth, child)
        child.depth = st - child.start
        
        if child.children:
            node_prev.children = child.children
            child.children = dict()
        
        child.children[text[node_prev.start]] = node_prev
        return child

    # label = to co wstawiam; text to cały tekst
    def slow_find(self, text, label):
        
        if len(label)==0 or label[0] not in self.children.keys():
            return self

        child = self.children[label[0]]
        j = 0
        for i in range(child.start, child.end):
            if text[i]==label[j]:
                j +=1
            else:
                return self.break_path(label, i, j, text)

        return child.slow_find(text, label[j:])


def graft(node, text, idx):

    start = idx + node.depth
    end = start + len(text[idx:]) - node.depth
    depth = node.depth + (end-start)
    child = Node(start, end, depth, node)
    node.children[text[start]] = child
    return child


def traverse_suffix_tree(root, depth):
    print(" Level: ", depth, "    Children: ", root.children.keys())
    for key in root.children.keys():
        print(key, end="")
        traverse_suffix_tree(root.children[key], depth+1)

        
def build_suffix_tree(text):
    root = Node(0,0,0, None)
    graft(root, text, 0)

    for i in range(1, len(text)):
        head = root.slow_find(text, text[i:])
        graft(head, text, i)

    return root


def find_pattern(text, pattern):
    length = len(pattern)
    root = build_suffix_tree(text)

    node = root
    i = 0
    while i<length and pattern[i] in node.children.keys():
        node = node.children[pattern[i]]
        for j in range(node.start, node.end):
            if pattern[i] == text[j]:
                i += 1
            else:
                return False

            if i == length:
                return True

    return False
        
    
    

### Sprawdzenie poprawności działania trie

In [3]:
print(find_pattern_trie("aaabc", "ab"))
print(find_pattern_trie("aaabc", "abc"))
print(find_pattern_trie("aaabc", "ab"))
print(find_pattern_trie("aaabc", "aab"))
print(find_pattern_trie("aaabc", "aaab"))
print(find_pattern_trie("aaabc", "aabc"))
print(find_pattern_trie("aaabc", "aaabc"))
print(find_pattern_trie("aaabc", "kaabc"))
print(find_pattern_trie("aaabc", "aabbc"))
print(find_pattern_trie("aaabc", "aabcc"))

True
True
True
True
True
True
True
False
False
False


In [4]:
f = open('1997_714_head.txt', 'r')
text5 = f.read()
text5 +="$"
print(find_pattern_trie(text5, "Art"))
print(find_pattern_trie(text5, "Nr"))
print(find_pattern_trie(text5, "numer"))

True
True
False


### Sprawdzenie poprawności działania suffix tree

In [5]:
print(find_pattern("aaabc", "ab"))
print(find_pattern("aaabc", "abc"))
print(find_pattern("aaabc", "ab"))
print(find_pattern("aaabc", "aab"))
print(find_pattern("aaabc", "aaab"))
print(find_pattern("aaabc", "aabc"))
print(find_pattern("aaabc", "aaabc"))
print(find_pattern("aaabc", "kaabc"))
print(find_pattern("aaabc", "aabbc"))
print(find_pattern("aaabc", "aabcc"))

True
True
True
True
True
True
True
False
False
False


In [6]:
print(find_pattern(text5, "Art"))
print(find_pattern(text5, "Nr"))
print(find_pattern(text5, "numer"))

True
True
False


# Testy - porównanie czasu tworzenia struktur

In [7]:
import time

In [8]:
text1 = "bbbd"

start = time.time()
build_trie(text1)
end = time.time()
print("Czas tworzenia trie: ", end - start)

start = time.time()
build_suffix_tree(text1)
end = time.time()
print("Czas tworzenia suffix tree: ", end - start)

Czas tworzenia trie:  5.1975250244140625e-05
Czas tworzenia suffix tree:  5.1975250244140625e-05


In [9]:
text2 = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"*10+"d"

start = time.time()
build_trie(text2)
end = time.time()
print("Czas tworzenia trie: ", end - start)

start = time.time()
build_suffix_tree(text2)
end = time.time()
print("Czas tworzenia suffix tree: ", end - start)

Czas tworzenia trie:  0.0351557731628418
Czas tworzenia suffix tree:  0.009258270263671875


In [10]:
text3 = "bbbdcccccccccaaaaaacacccccccccccccccababababababababababab"*100 + "k"

start = time.time()
#build_trie(text3)
end = time.time()
print("Czas tworzenia trie: - (przekroczenie rozmiaru stosu)", )

start = time.time()
build_suffix_tree(text3)
end = time.time()
print("Czas tworzenia suffix tree: ", end - start)

Czas tworzenia trie: - (przekroczenie rozmiaru stosu)
Czas tworzenia suffix tree:  0.5113461017608643


In [11]:
f = open('1997_714_head.txt', 'r')
text5 = f.read()
text5 +="$"

start = time.time()
build_trie(text5)
end = time.time()
print("Czas tworzenia trie: ", end - start)

start = time.time()
build_suffix_tree(text5)
end = time.time()
print("Czas tworzenia suffix tree: ", end - start)

Czas tworzenia trie:  5.163111209869385
Czas tworzenia suffix tree:  0.01050877571105957
