## prosty przykład

In [86]:
# Pattern to look for: every word is one column of the pattern, read top to bottom.
words1 = ["aaa", "aam", "kot"]

# Character matrix that is searched, one string per row.
example1 = ["kaataak", "oaataao", "tamaamt"]
def create_alphabet(words):
    """Collect every distinct symbol that occurs in ``words``.

    Args:
        words: iterable of words; each word is itself an iterable of symbols
            (for example a string).

    Returns:
        set: the union of the symbols of all words (empty when ``words`` is empty).
    """
    return {symbol for word in words for symbol in word}

# Alphabet of the example: every character that occurs in the pattern words.
alphabet = create_alphabet(words1)

## budowanie automatu

In [63]:
from collections import deque

class node:
    """A single state of the trie / string-matching automaton."""

    def __init__(self, index, terminal=""):
        # position of this state in the list that stores the whole structure
        self.index = index

        # outgoing edges: letter -> index of the target state
        self.nodes = dict()

        # words recognised when this state is reached;
        # the state is not terminal when this set is empty
        self.terminals = set([terminal] if terminal else [])

    def __repr__(self):
        return f"node({self.index}, {self.nodes}, {self.terminals})"


def build_Trie(Pi, A):
    """Build the trie of the words in ``Pi``.

    Args:
        Pi: iterable of words (each word is an iterable of letters).
        A: alphabet; accepted but not used by this function.

    Returns:
        list: tuples ``(trie node, index of its parent)``. The root is stored
        at position 0 with parent ``None`` and every node sits at the position
        given by its ``index``.
    """
    trie = [(node(0), None)]
    for word in Pi:
        walker = trie[0][0]
        for letter in word:
            if letter not in walker.nodes:
                # a new node always receives the next free position in the list
                child = len(trie)
                trie.append((node(child), walker.index))
                walker.nodes[letter] = child
            walker = trie[walker.nodes[letter]][0]
        walker.terminals.add(word)
    return trie


def build_SMA(Pi, A):
    """Build the string-matching automaton recognising all words of ``Pi``.

    The trie of ``Pi`` is traversed breadth-first and turned into a complete
    automaton: every state has a transition for every letter of ``A``.

    Args:
        Pi: iterable of words.
        A: iterable of letters forming the alphabet. It has to contain every
            letter used in ``Pi``.

    Returns:
        tuple: ``(sma, T)`` where ``sma`` is the list of states (``sma[i].index == i``)
        and ``T`` is the set of indices of terminal states. ``sma[i].terminals``
        holds every word of ``Pi`` that is a suffix of the text read when
        state ``i`` is reached.

    Raises:
        KeyError: if a word of ``Pi`` contains a letter that is missing from ``A``.
    """
    trie = build_Trie(Pi, A)
    T = set()

    sma = [None] * len(trie)
    root = sma[0] = node(0)
    # Until the trie edges are installed, every letter leads back to the root.
    for letter in A:
        root.nodes[letter] = 0
    if trie[0][0].terminals:
        # copy the words instead of sharing the trie's set object
        root.terminals.update(trie[0][0].terminals)
        T.add(0)

    q = deque([0])
    while q:
        v = q.popleft()
        for letter, x in trie[v][0].nodes.items():
            q.append(x)
            sma[x] = node(x)

            # Before the trie edge v --letter--> x is installed, sma[v] still
            # holds the fallback transition; its target is the state whose
            # transitions and outputs x inherits.
            temp = sma[v].nodes[letter]
            sma[v].nodes[letter] = x

            # Outputs of x: its own words plus everything already recognised in
            # the fallback state. sma[temp] (not trie[temp]) is required here,
            # otherwise words reachable only through a longer suffix chain are lost.
            sma[x].terminals.update(trie[x][0].terminals)
            sma[x].terminals.update(sma[temp].terminals)
            if sma[x].terminals:
                T.add(x)

            for char_al in A:
                if char_al in trie[temp][0].nodes:
                    # temp may not have been dequeued yet, so its trie edges are
                    # read from the trie rather than from sma[temp]
                    sma[x].nodes[char_al] = trie[temp][0].nodes[char_al]
                else:
                    sma[x].nodes[char_al] = sma[temp].nodes[char_al]
    return sma, T


# Build the automaton for the example words; the cell displays the returned
# (list of states, set of terminal state indices) pair.
build_SMA(words1, alphabet)

([node(0, {'k': 5, 't': 0, 'o': 0, 'a': 1, 'm': 0}, set()),
  node(1, {'k': 5, 't': 0, 'o': 0, 'a': 2, 'm': 0}, set()),
  node(2, {'k': 5, 't': 0, 'o': 0, 'a': 3, 'm': 4}, set()),
  node(3, {'k': 5, 't': 0, 'o': 0, 'a': 3, 'm': 4}, {'aaa'}),
  node(4, {'k': 5, 't': 0, 'o': 0, 'a': 1, 'm': 0}, {'aam'}),
  node(5, {'k': 5, 't': 0, 'o': 6, 'a': 1, 'm': 0}, set()),
  node(6, {'k': 5, 't': 7, 'o': 0, 'a': 1, 'm': 0}, set()),
  node(7, {'k': 5, 't': 0, 'o': 0, 'a': 1, 'm': 0}, {'kot'})],
 {3, 4, 7})

## szukanie dwuwymiarowego wzorca w macierzy znaków

In [87]:
def prefixFunction(P):
    """Compute the KMP prefix function of the pattern ``P``.

    Args:
        P: indexable sequence with ``len`` (string, list, numpy array, ...).

    Returns:
        list: ``res`` with ``len(res) == len(P)``; ``res[j]`` is the length of
        the longest proper prefix of ``P[:j + 1]`` that is also its suffix.
        An empty pattern yields an empty list.
    """
    # len() instead of truthiness so that numpy arrays are accepted as well
    if len(P) == 0:
        return []
    res = [0]
    k = 0
    for j in range(1, len(P)):
        # fall back through shorter borders until P[j] extends one of them
        while k > 0 and P[k] != P[j]:
            k = res[k-1]
        if P[k] == P[j]:
            k += 1
        res.append(k)
    return res


def _findPatternKMP(T, P, pi):
    S = []
    m = len(P)
    q = 0
    for i in range(len(T)):
        while q > 0 and T[i] != P[q]:
            q = pi[q-1]
        if T[i] == P[q]:
            q += 1 
        if q == m:
            q = pi[q-1]
            S.append(i - m + 1)
    return S

# Knuth-Morris-Pratt algorithm
def findPattern(T, P):
    """Return the start indices of the occurrences of ``P`` in ``T``.

    The prefix function of ``P`` is computed with ``prefixFunction`` and the
    scan itself is delegated to ``_findPatternKMP``.
    """
    return _findPatternKMP(T, P, prefixFunction(P))

# Quick check: the pattern [1, 2, 3] starts at offsets 0 and 4 of the text.
print(findPattern([1,2,3,4,1,2,3], [1,2,3]))


[0, 4]


In [103]:
# Matrix - rectangular array of characters in which the pattern is searched; A - alphabet; Pi - list of the columns of the pattern we look for (all columns must have the same length)
import numpy as np

def find_pattern(Matrix, A, Pi, verbose=True):
    """Find every occurrence of a two-dimensional pattern in a character matrix.

    Every column of the matrix is run through the automaton built from the
    pattern columns, which gives a matrix of automaton states. A pattern
    occurrence ends in a row where the states reached after whole pattern
    columns appear side by side, so each row of states is searched with KMP.

    Args:
        Matrix: rectangular sequence of rows (for example a list of equally
            long strings).
        A: alphabet used to build the automaton; it has to contain every
            letter of ``Pi``.
        Pi: the pattern given as a list of its columns, each read top to
            bottom. All columns must have the same, non-zero length.
        verbose: when true (the default) every row of the state matrix is
            printed while it is searched.

    Returns:
        list: ``(row, column)`` tuples with the top-left corner of every
        occurrence. An empty matrix yields an empty list.

    Raises:
        ValueError: if ``Pi`` is empty, contains an empty column, or its
            columns differ in length.
        KeyError: raised by ``build_SMA`` when a letter of ``Pi`` is not in ``A``.
    """
    if len(Pi) == 0 or len(Pi[0]) == 0:
        raise ValueError("Pi must contain at least one non-empty column")
    height = len(Pi[0])
    if any(len(column) != height for column in Pi):
        raise ValueError("all columns of Pi must have the same length")
    if len(Matrix) == 0 or len(Matrix[0]) == 0:
        return []

    # build the automaton
    SMA, _ = build_SMA(Pi, A)
    n = len(Matrix[0])
    m = len(Matrix)

    # state reached at every cell when its column is read top to bottom
    Terminals = np.zeros((m, n), dtype=np.int64)
    for i in range(n):
        state = 0
        for j in range(m):
            # a character outside the alphabet occurs in no pattern column,
            # so the automaton restarts from the root
            state = SMA[state].nodes.get(Matrix[j][i], 0)
            Terminals[j][i] = state

    # state reached after reading each complete pattern column, left to right
    pattern_states = []
    for column in Pi:
        state = 0
        for letter in column:
            state = SMA[state].nodes[letter]
        pattern_states.append(state)

    # look for the sequence of column end states in every row of states
    res = []
    for i, line in enumerate(Terminals):
        if verbose:
            print(line)
        for j in findPattern(line, pattern_states):
            # i is the bottom row of the occurrence; report its top row
            res.append((i - height + 1, j))
    return res

# Show the example matrix, then search it for the pattern whose columns are words1.
for line in example1:
    print(line)
find_pattern(example1, alphabet, words1)

kaataak
oaataao
tamaamt
[5 1 1 0 1 1 5]
[6 2 2 0 2 2 6]
[7 3 4 1 3 4 7]


[(0, 4)]