# Pronalaženje šablona

In [1]:
'''
Klasa koja reprezentuje
čvor prefiksnog stabla
'''
class TrieNode:
    def __init__(self, character):
        self.character = character
        self.children = {}
        self.is_leaf = True
        
    '''Dodavanje dete-čvora zadatim karakterom'''
    def add_child(self, character):
        self.is_leaf = False
        new_node = TrieNode(character)
        self.children[character] = new_node
        return new_node
    
    '''Metod vraća dete-čvor identifikovan zadatim karakterom'''
    def get_child(self, character):
        if character not in self.children:
            return None
        return self.children[character]
        
    def __str__(self):
        return f'{self.character}: {list(self.children.keys())}, is_leaf: {self.is_leaf}'

In [2]:
'''
Klasa koja reprezentuje
prefiksno stablo
'''
class Trie:
    def __init__(self, patterns):
        self.trie_construction(patterns)
        
    '''Konstruisanje stabla na osnovu zadate liste šablona'''
    def trie_construction(self, patterns):
        self.root = TrieNode(None)
        
        for pattern in patterns:
            current_node = self.root
            for current_symbol in pattern:
                if current_symbol not in current_node.children:
                    current_node.add_child(current_symbol)
                    
                current_node = current_node.get_child(current_symbol)
                
        return self.root
    
    def __str__(self):
        return f'{self.root}'
    
    '''
    DFS obilazak prefiksnog stabla,
    očekivan izlaz je lista svih unetih šablona    
    '''
    def dfs(self, node, path=''):
        for character, child in node.children.items():
            if child.is_leaf:
                print(f'{path}{character}')
            else:
                self.dfs(node=child, path=f'{path}{character}')
                
    '''
    Pronalaženje šablona 
    koji odgovara prefiksu zadatog teksta
    '''
    def prefix_trie_matching(self, text):
        current_node = self.root
        path = ''
        for character in text:
            if current_node.is_leaf and current_node.character != None:
                return path
            
            if character in current_node.children:
                path += character
                current_node = current_node.get_child(character)
                
            else:
                return None
            
    '''
    Pronalaženje svih šablona stabla
    u zadatom tekstu
    '''
    def trie_matching(self, text):
        found_patterns = []
        
        n = len(text)
        for i in range(n):
            result = self.prefix_trie_matching(text[i:])
            
            if result != None:
                found_patterns.append((result, i))
                
        return found_patterns

In [3]:
'''
Klasa koja reprezentuje sufiksni niz
'''
class SuffixArray:
    def __init__(self, text):
        self.arr = self.generate_suffix_array(text)
        
    '''
    Konstruisanje sufiksnog niza
    na osnovu zadatog teksta
    '''
    def generate_suffix_array(self, text):
        terminated_text = f'{text}$'
        arr = [(terminated_text[i:], i) for i in range(len(terminated_text))]
            
        return sorted(arr)
    
    '''Poređenje prefiksa šablona i prefiksa sufiksa'''
    def compare(self, pattern, suffix):
        n = len(pattern)
        m = len(suffix)
        
        if n > m:
            return 1
        
        suffix_pref = suffix[:n]
        
        if suffix_pref == pattern:
            return 0
        
        if suffix_pref < pattern:
            return 1
        else:
            return -1
        
    '''
    Pronalaženje svih pozicija u tekstu
    (na osnovu koga je konstruisan sufiksni niz)
    na kojima se nalazi zadati šablon
    '''
    def pattern_matching_with_suffix_array(self, pattern):
        n = len(self.arr)
        min_index = 0
        max_index = n
        
        while min_index <= max_index:
            mid_index = (min_index + max_index) // 2
            current_suffix = self.arr[mid_index][0]
           
            compare_res = self.compare(pattern, current_suffix)
            
            if compare_res == 0:
                i = mid_index
                while i >= 0 and self.compare(pattern, self.arr[i][0]) == 0:
                    i -= 1
                i += 1
                
                j = mid_index
                while j < n and self.compare(pattern, self.arr[j][0]) == 0:
                    j += 1
                
                return [self.arr[k][1] for k in range(i, j)]
            
            elif compare_res < 0:
                max_index = mid_index
            else:
                min_index = mid_index
                
        return []
        

In [4]:
'''
BWT transformacija teksta
'''
class BWT:
    def __init__(self, text):
        self.bwt_text = self.construct_bwt(text + '$')
        
    '''Konstrukcija transformisane vrednosti'''
    def construct_bwt(self, text):
        n = len(text)
        permutations = [text[i:n] + text[:i] for i in range(n)]
        return [x[-1] for x in sorted(permutations)]
    
    '''
    Rekonstrukcija originalne vrednosti teksta
    '''
    def inverse_bwt(self):
        last_column = self.bwt_text
        columns = sorted(last_column[:])
        
        original_row = last_column.index('$')
        
        n = len(self.bwt_text)
        for _ in range(n - 1):
            for j in range(n):
                columns[j] = last_column[j] + columns[j]
            columns.sort()

        return columns[original_row]
    
    '''
    Mapiranje indeksa karaktera iz poslednje kolone
    na odgovarajuće indekse iz prve kolone tabele
    '''
    def last_to_first(self, last_col_index):
        last_column = self.bwt_text
        first_column = sorted(last_column[:])
        
        last_col_char = last_column[last_col_index]
        
        n = len(last_column)
        rank = 0
        
        for i in range(last_col_index + 1):
            if last_column[i] == last_col_char:
                rank += 1
            
        count = 0
        for i in range(n):
            if first_column[i] == last_col_char:
                count += 1
                
            if count == rank:
                return i
            
    '''
    Pronalaženje broja pojavljivanja zadatog šablona
    u tekstu pomoću transformisane vrednosti
    '''
    def bw_matching(self, pattern):
        last_column = self.bwt_text
        
        n = len(self.bwt_text)
        top = 0
        bottom = n - 1
        
        j = len(pattern) - 1
        while top <= bottom:
            if j < 0:
                return bottom - top + 1
            
            symbol = pattern[j]
            j -= 1
            
            if symbol in last_column[top : bottom + 1]:
                first_index = None
                last_index = None
                
                for i in range(top, bottom + 1):
                    if symbol == last_column[i]:
                        if first_index == None:
                            first_index = i
                        last_index = i
                        
                top = self.last_to_first(first_index)
                bottom = self.last_to_first(last_index)
                    
            else:
                return 0

## Primeri

In [5]:
# Trie
patterns = ["ananas", "and", "antenna", "banana", "bandana", "nab", "nana", "pan"]
trie = Trie(patterns)
trie.dfs(trie.root)

ananas
and
antenna
banana
bandana
nab
nana
pan


In [6]:
trie.prefix_trie_matching('panamabananas')

'pan'

In [7]:
trie.trie_matching('panamabananas')

[('pan', 0), ('banana', 6), ('nana', 8)]

In [8]:
# Suffix array
suff_arr = SuffixArray('panamabananas')

for i in range(len(suff_arr.arr)):
    print(suff_arr.arr[i])

('$', 13)
('abananas$', 5)
('amabananas$', 3)
('anamabananas$', 1)
('ananas$', 7)
('anas$', 9)
('as$', 11)
('bananas$', 6)
('mabananas$', 4)
('namabananas$', 2)
('nanas$', 8)
('nas$', 10)
('panamabananas$', 0)
('s$', 12)


In [9]:
suff_arr.pattern_matching_with_suffix_array('banana')

[6]

In [10]:
# BWT
bwt = BWT('panamabananas')
print(''.join(bwt.bwt_text))
print(''.join(bwt.inverse_bwt()))
bwt.last_to_first(6)
bwt.bw_matching('ana')

smnpbnnaaaaa$a
panamabananas$


3