## prosty przykład

In [1]:
words1 = ["aaa", "aam", "kot"]
example1 = [
    "kaataak",
    "oaataao",
    "tamaamt"
]
def create_alphabet(words):
    alphabet = set()
    for word in words:
        alphabet.update(word)
    return alphabet

alphabet = create_alphabet(words1)

## budowanie automatu

In [2]:
from collections import deque

class node:
    def __init__(self, index, terminal=""):
        self.index = index
        
        # indeksy wierzchołków grafu połączonego z self
        self.nodes = dict()


        # w polu ter_str zapisane jest za jakie słowo terminalne odpowiada węzeł
        # węzeł nie jest terminalny jeśli ter_str == ""
        self.terminals = [terminal] if terminal else []
    def __repr__(self):
        return f"node({self.index}, {self.nodes}, {self.terminals})"


# zwraca liste tupli (węzeł drezwa Trie, indeks ojca)
def build_Trie(Pi, A):
    trie = [(node(0), None)]
    last_i = 0
    for word in Pi:
        walker = trie[0][0]
        for letter in word:
            if letter not in walker.nodes:
                last_i += 1
                trie.append((node(last_i), walker.index))
                walker.nodes[letter] = last_i
            walker = trie[walker.nodes[letter]][0]
        walker.terminals.append(word)
    return trie

# zwraca liste węzłów automatu, oraz zbiór indeksów terminalnych
def build_SMA(Pi, A):
    T = set()
    trie = build_Trie(Pi, A)
        
    sma = [None for _ in trie]
    sma[0] = node(0)
    if trie[0][0].terminals:
        sma[0].terminals.append(trie[0][0].terminals)
        T.add(0)
    for letter in A:
        sma[0].nodes[letter] = 0
    if trie[0][0].terminals:
        sma[0].terminals = trie[0][0].terminals
    
    q = deque()
    q.append(0)
    while q:
        v = q.popleft()
        for letter, x in trie[v][0].nodes.items():
            if x == 0: continue
            t = trie[x][1]
            q.append(x)
            if sma[x] == None:
                sma[x] = node(x)
            temp = sma[t].nodes[letter]
            sma[t].nodes[letter] = x
            if trie[x][0].terminals:
                T.add(x)
                sma[x].terminals.append(trie[x][0].terminals)
            if trie[temp][0].terminals:
                T.add(x)
                sma[x].terminals.append(trie[temp][0].terminals)
            for char_al in A:
                if char_al in trie[temp][0].nodes:
                    sma[x].nodes[char_al] = trie[temp][0].nodes[char_al]
                else:
                    sma[x].nodes[char_al] = sma[temp].nodes[char_al]
    return sma, T


build_SMA(words1, alphabet)

([node(0, {'m': 0, 'k': 5, 't': 0, 'a': 1, 'o': 0}, []),
  node(1, {'m': 0, 'k': 5, 't': 0, 'a': 2, 'o': 0}, []),
  node(2, {'m': 4, 'k': 5, 't': 0, 'a': 3, 'o': 0}, []),
  node(3, {'m': 4, 'k': 5, 't': 0, 'a': 3, 'o': 0}, [['aaa']]),
  node(4, {'m': 0, 'k': 5, 't': 0, 'a': 1, 'o': 0}, [['aam']]),
  node(5, {'m': 0, 'k': 5, 't': 0, 'a': 1, 'o': 6}, []),
  node(6, {'m': 0, 'k': 5, 't': 7, 'a': 1, 'o': 0}, []),
  node(7, {'m': 0, 'k': 5, 't': 0, 'a': 1, 'o': 0}, [['kot']])],
 {3, 4, 7})

## szukanie dwówymiarowego wzorca w macierzy znaków

In [3]:
def prefixFunction(P):
    res = [0]
    k = 0
    for j in range(1, len(P)):
        while k > 0 and P[k] != P[j]:
            k = res[k-1]
        if P[k] == P[j]:
            k += 1
        res.append(k)
    return res


def _findPatternKMP(T, P, pi):
    S = []
    m = len(P)
    q = 0
    for i in range(len(T)):
        while q > 0 and T[i] != P[q]:
            q = pi[q-1]
        if T[i] == P[q]:
            q += 1 
        if q == m:
            q = pi[q-1]
            S.append(i - m + 1)
    return S

# algorytm kmp
def findPattern(T, P):
    pi = prefixFunction(P)
    return _findPatternKMP(T, P, pi)

print(findPattern([1,2,3,4,1,2,3], [1,2,3]))


[0, 4]


In [4]:
#Matrix - prostokątna tablica znaków w której szukamy wzorca, A - alfabet, Pi - lista kolum których szukamy: muszą być tej samej długości
import numpy as np

def find_pattern(Matrix, A, Pi):
    # budowanie automatu
    SMA, T = build_SMA(Pi, A)
    n = len(Matrix[0])
    m = len(Matrix)

    # odnajdowanine stsnów terminalnych
    Terminals = np.zeros((m,n), dtype=np.int64)
    for i in range(n):
        walker = SMA[0]
        for j in range(m):
            walker = SMA[walker.nodes[Matrix[j][i]]]
            Terminals[j][i] = walker.index
    
    # wyznaczenie szukanego ciągu terminali - ostatni wiersz Pi_terminals
    Pi_terminals = np.zeros((len(Pi[0]), len(Pi)), dtype=np.int64)
    for i in range(len(Pi)):
        walker = SMA[0]
        for j in range(len(Pi[0])):
            walker = SMA[walker.nodes[Pi[i][j]]]
            Pi_terminals[j][i] = walker.index
    # szukanie rozwiązania
    res = []
    for i, line in enumerate(Terminals):
        for j in findPattern(line, Pi_terminals[-1]):
            res.append((i-len(Pi[0])+1,j))
    return res

for line in example1:
    print(line)
find_pattern(example1, alphabet, words1)

example2 = [
    [1,1,1],
    [2,2,2],
    [2,2,1]
]
pattern2 = [[1,2], [1,2], [1,2]]
find_pattern(np.array(example2), create_alphabet(example2), pattern2)

kaataak
oaataao
tamaamt


[(0, 0)]

# haystack.txt

In [5]:
# z pliku wejsciowego zwraca macierz prostokątną kodów znaków
def get_matrix_from_text_file(file_name):
    with open(file_name, "r") as f:
        lines = f.readlines()
    n = len(lines)
    m = len(max(lines, key=len)) - 1
    matrix = np.zeros((n,m), dtype=np.int64)
    for i, line in enumerate(lines):
        for j, letter in enumerate(line[:-1]):
            matrix[i][j] = ord(letter)
    return matrix

matrix = get_matrix_from_text_file("lab5/haystack.txt")
alphabet = set()
for line in matrix:
    alphabet.update(line)
letter_codes = np.array(list(range(ord("a"),ord("z")+1)) + list(range(ord("A"), ord("Z")+1)), dtype=np.int64)
alphabet.update(letter_codes)
matrix

array([[ 79, 110, 101, ...,   0,   0,   0],
       [111, 102,  32, ...,   0,   0,   0],
       [ 98, 101,  32, ...,   0,   0,   0],
       ...,
       [101, 102, 102, ...,   0,   0,   0],
       [ 40,  80,  97, ...,   0,   0,   0],
       [ 97, 110, 100, ...,   0,   0,   0]])

znalezienie tych samych liter na tych smamych pozycjach w sąsiednich wierszacg

In [6]:

patterns2d = np.zeros((len(letter_codes),2), dtype=np.int64)
for i, letter_code in enumerate(letter_codes):
    patterns2d[i][0] = letter_code
    patterns2d[i][1] = letter_code
patterns2d

SMA, T = build_SMA(patterns2d, alphabet)
# budowanie automatu
n = len(matrix[0])
m = len(matrix)

# szukanie stanów terminalnych
answer = []
for i in range(n):
    walker = SMA[0]
    for j in range(m):
        walker = SMA[walker.nodes[matrix[j][i]]]
        if walker.index in T:
            answer.append((j,i+1))
# (line, number of character)
answer

[(32, 1),
 (38, 1),
 (42, 1),
 (45, 1),
 (51, 1),
 (55, 1),
 (69, 1),
 (11, 2),
 (32, 2),
 (42, 2),
 (54, 2),
 (78, 2),
 (15, 3),
 (28, 3),
 (38, 3),
 (51, 3),
 (65, 3),
 (2, 4),
 (17, 4),
 (25, 4),
 (72, 4),
 (73, 4),
 (2, 5),
 (24, 5),
 (25, 5),
 (38, 5),
 (2, 6),
 (17, 6),
 (45, 6),
 (53, 6),
 (70, 6),
 (2, 7),
 (18, 7),
 (21, 7),
 (77, 7),
 (78, 7),
 (81, 7),
 (1, 8),
 (2, 8),
 (2, 9),
 (23, 9),
 (53, 9),
 (2, 10),
 (14, 11),
 (21, 11),
 (34, 11),
 (36, 11),
 (73, 11),
 (80, 11),
 (34, 12),
 (41, 12),
 (55, 12),
 (57, 12),
 (16, 13),
 (53, 13),
 (54, 13),
 (74, 13),
 (8, 14),
 (57, 14),
 (74, 14),
 (78, 14),
 (5, 15),
 (18, 15),
 (50, 15),
 (65, 15),
 (82, 15),
 (70, 16),
 (31, 17),
 (28, 18),
 (29, 18),
 (56, 18),
 (68, 18),
 (16, 19),
 (36, 19),
 (42, 19),
 (38, 20),
 (9, 22),
 (77, 22),
 (65, 23),
 (70, 23),
 (78, 23),
 (5, 24),
 (29, 24),
 (73, 24),
 (47, 25),
 (60, 25),
 (72, 25),
 (80, 25),
 (44, 26),
 (34, 27),
 (41, 27),
 (11, 28),
 (19, 28),
 (74, 28),
 (8, 30),
 (65, 30),

In [7]:
pattern1 = np.array(
    [
        [ord("t"), ord("t")],
        [ord("h"), ord("h")]
    ], dtype=np.int64
)

pattern2 = np.array(
    [
        [ord("t"), ord("t")],
        [ord(" "), ord(" ")],
        [ord("h"), ord("h")]
    ], dtype=np.int64
)

print(find_pattern(matrix, alphabet, pattern1))
print(find_pattern(matrix, alphabet, pattern2))

[]
[(37, 0)]


haystack.png

In [33]:
from PIL import Image
import numpy as np

def get_matrix_from_png_file(file_name):
    matrix = np.array(Image.open(file_name))
    shape = matrix.shape[:2]
    table = [[None for _ in range(shape[1])] for _ in range(shape[0])]
    for i, line in enumerate(matrix):
        for j, pixel in enumerate(line):
            table[i][j] = tuple(pixel)

    return table

def transpose(matrix):
    n = len(matrix)
    m = len(matrix[0])

    transposed = [[None for _ in range(n)] for _ in range(m)]

    for i, line in enumerate(matrix):
        for j, value in enumerate(line):
            transposed[j][i] = value
    return transposed

matrix = get_matrix_from_png_file("lab5/haystack.png")
alphabet = create_alphabet(matrix)

# plik png przedstawiający litere powinien się znaleść w folderze letters
def number_of_found_letters(letter, alphabet, matrix):
    
    letter_matrix = get_matrix_from_png_file(f"lab5/letters/{letter}.png")
    letter_pattern = transpose(letter_matrix)

    letter_found = find_pattern(matrix, alphabet, letter_pattern)
    return len(letter_found)

488


In [34]:
print(number_of_found_letters("e", alphabet, matrix))

488


In [35]:
print(number_of_found_letters("t", alphabet, matrix))

58


In [36]:
print(number_of_found_letters("s", alphabet, matrix))

173


In [37]:
print(number_of_found_letters("a", alphabet, matrix))

397


In [38]:
print(number_of_found_letters("pattern", alphabet, matrix))

5


# mierzenie czasu

In [53]:
from time import time

# zwraca tuple (wymiary Matrix, wymiary Pi, czas budowania automatu, czas szukania wzorca)
def find_pattern_times(Matrix, A, Pi):
    # budowanie automatu
    start = time()
    SMA, T = build_SMA(Pi, A)
    end = time()
    bulding_SMA_time = end - start

    n = len(Matrix[0])
    m = len(Matrix)

    start = time()
    # odnajdowanine stsnów terminalnych
    Terminals = np.zeros((m,n), dtype=np.int64)
    for i in range(n):
        walker = SMA[0]
        for j in range(m):
            walker = SMA[walker.nodes[Matrix[j][i]]]
            Terminals[j][i] = walker.index
    
    # wyznaczenie szukanego ciągu terminali - ostatni wiersz Pi_terminals
    Pi_terminals = np.zeros((len(Pi[0]), len(Pi)), dtype=np.int64)
    for i in range(len(Pi)):
        walker = SMA[0]
        for j in range(len(Pi[0])):
            walker = SMA[walker.nodes[Pi[i][j]]]
            Pi_terminals[j][i] = walker.index
    # szukanie rozwiązania
    res = []
    for i, line in enumerate(Terminals):
        for j in findPattern(line, Pi_terminals[-1]):
            res.append((i-len(Pi[0])+1,j))
    end = time()
    finding_pattern_time = end - start
    return f"{n}x{m}", f"{len(Pi[0])}x{len(Pi)}", bulding_SMA_time, finding_pattern_time

def get_row_input_of_found_letters(letter, alphabet, matrix):
    letter_matrix = get_matrix_from_png_file(f"lab5/letters/{letter}.png")
    letter_pattern = transpose(letter_matrix)

    return list(find_pattern_times(matrix, alphabet, letter_pattern))
    

In [62]:
from prettytable import PrettyTable

times = PrettyTable()

times.field_names = ["rozmiar pliku wejściowego", "rozmiar wzorca", "czas budowania automatu[s]", "czas szukania wzorca [s]"]
times.add_row(get_row_input_of_found_letters("a", alphabet, matrix))
times.add_row(get_row_input_of_found_letters("pattern", alphabet, matrix))
times.add_row(get_row_input_of_found_letters("medium_pattern", alphabet, matrix))
times.add_row(get_row_input_of_found_letters("big_pattern", alphabet, matrix))
print(times)

+---------------------------+----------------+----------------------------+--------------------------+
| rozmiar pliku wejściowego | rozmiar wzorca | czas budowania automatu[s] | czas szukania wzorca [s] |
+---------------------------+----------------+----------------------------+--------------------------+
|          860x1900         |      8x7       |    0.007540464401245117    |    2.398759365081787     |
|          860x1900         |     19x107     |    0.06301641464233398     |    3.050617218017578     |
|          860x1900         |    842x388     |     30.878023862838745     |    3.8068411350250244    |
|          860x1900         |    1451x526    |     88.54110074043274      |    264.8278646469116     |
+---------------------------+----------------+----------------------------+--------------------------+


In [61]:
n = len(matrix)
times2 = PrettyTable()

times2.field_names = ["rozmiar pliku wejściowego", "rozmiar wzorca", "czas budowania automatu[s]", "czas szukania wzorca [s]"]
times2.add_row(get_row_input_of_found_letters("pattern", alphabet, matrix))
times2.add_row(get_row_input_of_found_letters("pattern", alphabet, matrix[:n//2]))
times2.add_row(get_row_input_of_found_letters("pattern", alphabet, matrix[:n//4]))

times2.add_row(get_row_input_of_found_letters("pattern", alphabet, matrix[:n//8]))

print(times2)

+---------------------------+----------------+----------------------------+--------------------------+
| rozmiar pliku wejściowego | rozmiar wzorca | czas budowania automatu[s] | czas szukania wzorca [s] |
+---------------------------+----------------+----------------------------+--------------------------+
|          860x1900         |     19x107     |    0.18419694900512695     |    15.63606882095337     |
|          860x950          |     19x107     |     0.4484097957611084     |    1.3525171279907227    |
|          860x475          |     19x107     |    0.05156302452087402     |    0.6641104221343994    |
|          860x237          |     19x107     |    0.04889655113220215     |    0.312591552734375     |
+---------------------------+----------------+----------------------------+--------------------------+
