In [1]:
from collections import defaultdict
from queue import Queue
import matplotlib.pyplot as plt
from prettytable import PrettyTable
from PIL import Image
import numpy as np
from random import randint
from time import time

# Wyszukiwanie wzorców dwuwymiarowych

## Bartosz Kucharz

# Implementacja

In [2]:
def build_sma(pattern):
    transition = [defaultdict(lambda :0)]
    alphabet = set(pattern)
    terminal = 0
    for pattern_letter in pattern:
        previous = transition[terminal][pattern_letter]
        new_state = len(transition)
        transition[terminal][pattern_letter] = new_state
        transition.append(defaultdict(lambda :0))
        
        for alphabet_letter in alphabet:
            transition[new_state][alphabet_letter] = transition[previous][alphabet_letter]
        terminal = new_state
    return transition
        

In [3]:
class Node():
    def __init__(self, letter, parent, terminal, ID=0):
        self.letter = letter
        self.parent = parent
        self.terminal = terminal
        self.children = dict()
        self.ID = ID

In [4]:
def build_trie(patterns):
    root = Node(None, None, False, 0)
    nodes = dict()
    nodes[0] = root
    ID = 1
    state_to_word = dict()
    for pattern in patterns:
        node = root
        for idx, letter in enumerate (pattern):
            terminal = (idx == len(pattern)-1) 
            
            if letter in node.children:
                node = node.children[letter]
                node.terminal = terminal
            else:
                new_node = Node(letter, node, terminal, ID)
                nodes[ID] = new_node
                ID += 1
                node.children[letter] = new_node
                node = new_node
                
                if terminal:
                    state_to_word[ID-1] = pattern
                
    return (root, nodes), state_to_word

In [5]:
def build_multi_sma(trie, patterns):
    root, nodes = trie
    alphabet = set()
    for pattern in patterns:
        alphabet.update(set(pattern))
            
    finals = dict()
    queue = Queue()
    transitions = dict()
    transitions[root.ID] = dict()
    two_d_match = []

    failures = dict()
    failures[root.ID] = root.ID
    for letter, child in root.children.items():
        failures[child.ID] = root.ID
        queue.put(child.ID)
        transitions[root.ID][letter] = child.ID

    while not queue.empty():
        node_id = queue.get()
        node = nodes[node_id]
        letter = node.letter
        
        previous_node_id = failures[node_id]
        previous_node = nodes[previous_node_id]
        
        if node.terminal:
            finals[node_id] = node_id
            two_d_match.append(node_id)
        elif  previous_node.terminal:
            finals[node_id] = finals[previous_node_id]


        
        transitions[node_id] = dict()
        for letter, child in node.children.items():
            transitions[node_id][letter] = child.ID
#             print(child.ID, previous_node_id, letter)
            failures[child.ID] = transitions[previous_node_id].get(letter, root.ID)
            queue.put(child.ID)
        
        for letter in alphabet:
            transitions[node_id][letter] = transitions[node_id].get(letter, transitions[previous_node_id].get(letter, root.ID))
            

    
    return transitions, finals, two_d_match
        
        

In [6]:
def match_patterns(text, transitions, finals):
    current_state = 0
    found_states = []
    for letter in text:
        current_state = transitions[current_state][letter]
        if current_state in finals:
            found_states.append(finals[current_state])
    return found_states

In [7]:
def get_state_list(text, transitions, finals):
    current_state = 0
    states = []
    for letter in text:
        states.append(current_state)
        current_state = transitions[current_state][letter]
    states.append(current_state)

    return states

In [8]:
def find_2d_pattern(text_list, transitions, row_transitions=[], finals=set()):
    height, width = len(text_list), len(text_list[0])
    state_matrix = [[0 for _ in range(width)] for _ in range(height)]
    result = []
    
    for column in range(width):
        current_state = 0
        for row in range(height):
            letter = text_list[row][column]
            current_state = transitions[current_state].get(letter, 0)
            state_matrix[row][column] = current_state
        
    
    if finals:
        for row in range(height):
            for column in range(width):
                if state_matrix[row][column] in finals:
                    result.append((row, column))
        return result
    
    
    final_state = len(row_transitions)-1
    for row in range(height):
        current_state = 0
        for column in range(width):
            number = state_matrix[row][column]
            current_state = row_transitions[current_state].get(number, 0)
            
            if current_state == final_state:
                result.append((row, column))
      
    return result
    

# Znajdź w załączonym pliku "haystack.txt" wszyskie sytuacje, gdy taka sama litera występuje na tej samej pozycji w dwóch kolejnych linijkach.

In [9]:
with open('haystack.txt', 'r') as file:
    text = file.readlines()
    max_line = len(max(text, key=lambda x: len(x)))
    alphabet = set()
    
    haystack = [[0 for _ in range(max_line)] for _ in range(len(text))]
    
    
    for row, line in enumerate(text):
        for column, letter in enumerate(line):
            haystack[row][column] = letter
            alphabet.add(letter)
                
    result = []
    patterns = []
    for letter in alphabet:
        patterns.append(letter+letter)
    
    trie, _  = build_trie(patterns)
    transitions, _, two_d_match = build_multi_sma(trie, patterns)
    
result1 = find_2d_pattern(haystack, transitions, finals=set(two_d_match))    

In [10]:
table = PrettyTable()
table.title = 'Wystąpienia dwóch liter w kolumnie'
table.field_names = ['Litera', 'Linia', 'Kolumna']
for i, j in result1:
    table.add_row([repr(haystack[i][j]), i+1, j+1])
table.sortby = 'Litera'
print(table)
print('Liczba wystąpień wzorca:', len(result1))

+------------------------------------+
| Wystąpienia dwóch liter w kolumnie |
+-----------+----------+-------------+
|   Litera  |  Linia   |   Kolumna   |
+-----------+----------+-------------+
|    ' '    |    2     |      11     |
|    ' '    |    2     |      28     |
|    ' '    |    2     |      59     |
|    ' '    |    2     |      74     |
|    ' '    |    3     |      3      |
|    ' '    |    3     |      11     |
|    ' '    |    3     |      35     |
|    ' '    |    4     |      3      |
|    ' '    |    5     |      42     |
|    ' '    |    5     |      45     |
|    ' '    |    6     |      8      |
|    ' '    |    6     |      65     |
|    ' '    |    7     |      8      |
|    ' '    |    7     |      10     |
|    ' '    |    7     |      46     |
|    ' '    |    8     |      46     |
|    ' '    |    10    |      26     |
|    ' '    |    11    |      49     |
|    ' '    |    12    |      8      |
|    ' '    |    12    |      12     |
|    ' '    |    15    | 

# Znajdź wszystkie wystąpienia "th" oraz "t h" w dwóch kolejnych liniach na tej samej pozycji. 

In [11]:
with open('haystack.txt', 'r') as file:
    text = file.readlines()
    max_line = len(max(text, key=lambda x: len(x)))
    alphabet = set()
    
    haystack = [[0 for _ in range(max_line)] for _ in range(len(text))]
    
    
    for row, line in enumerate(text):
        for column, letter in enumerate(line):
            haystack[row][column] = letter
            alphabet.add(letter)
                
    result = []
    patterns = ['tt', 'hh']
    
    trie, _  = build_trie(patterns)
    transitions, _, two_d_match = build_multi_sma(trie, patterns)
    
    row_transition = build_sma(two_d_match)
    
result2 = find_2d_pattern(haystack, transitions, row_transition)

In [12]:
table = PrettyTable()
table.title = 'Wystąpienia dwóch \'th\' '
table.field_names = ['Linia', 'Kolumna']
for i, j in result2:
    table.add_row([i+1, j+1])
print(table)
print('Liczba wystąpień wzorca:', len(result2))

+------------------------+
| Wystąpienia dwóch 'th'  |
+----------+-------------+
|  Linia   |   Kolumna   |
+----------+-------------+
+----------+-------------+
Liczba wystąpień wzorca: 0


In [13]:
with open('haystack.txt', 'r') as file:
    text = file.readlines()
    max_line = len(max(text, key=lambda x: len(x)))
    alphabet = set()
    
    haystack = [[0 for _ in range(max_line)] for _ in range(len(text))]
    
    
    for row, line in enumerate(text):
        for column, letter in enumerate(line):
            haystack[row][column] = letter
            alphabet.add(letter)
                
    result = []
    patterns = ['tt' ,'  ', 'hh']
    
    trie, _  = build_trie(patterns)
    transitions, _, two_d_match = build_multi_sma(trie, patterns)
    
    row_transition = build_sma(two_d_match)
    
result3 = find_2d_pattern(haystack, transitions, row_transition)

In [14]:
table = PrettyTable()
table.title = 'Wystąpienia dwóch \'t h\' '
table.field_names = ['Linia', 'Kolumna']
for i, j in result3:
    table.add_row([i+1, j+1])
print(table)
print('Liczba wystąpień wzorca:', len(result3))

+------------------------+
| Wystąpienia dwóch 't h'  |
+----------+-------------+
|  Linia   |   Kolumna   |
+----------+-------------+
|    39    |      3      |
+----------+-------------+
Liczba wystąpień wzorca: 1


# Wybierz przynajmniej 4 litery (małe). Znajdź wszystkie wystąpienia tej litery w załączonym pliku "haystack.png"

In [15]:
def image_to_matrix(img):
    pixel_map = img.load()
    pixels = []
    for row in range(img.height):
        pix = []
        for col in range(img.width):
            pix.append(pixel_map[col, row][0])
        pixels.append(pix)
    return np.array(pixels)

In [16]:
with Image.open('haystack.png') as img, \
        Image.open('patterns/s.png') as s_img, \
        Image.open('patterns/v.png') as v_img, \
        Image.open('patterns/c.png') as c_img, \
        Image.open('patterns/u.png') as u_img:
    
    haystack = image_to_matrix(img)
    results = []
    for letter_img in [s_img, v_img, c_img, u_img]:
        letter = image_to_matrix(letter_img)

        result = []
        patterns = letter.T

        trie, _  = build_trie(patterns)
        transitions, _, two_d_match = build_multi_sma(trie, patterns)

        row_transition = build_sma(two_d_match)
    
        results.append(find_2d_pattern(haystack, transitions, row_transition))
    

In [17]:
table = PrettyTable()
table.title = 'Wystąpienia obrazu \'s\''
table.field_names = ['x', 'y']
for i, j in results[0]:
    table.add_row([j+1, i+1])
print(table)
print('Liczba wystąpień wzorca:', len(results[0]))

+-------------------+
| Wystąpienia obrazu 's' |
+--------+----------+
|   x    |    y     |
+--------+----------+
|  133   |    47    |
|  192   |    47    |
|  398   |    47    |
|  592   |    47    |
|  686   |    47    |
|  780   |    47    |
|  161   |    69    |
|  209   |    69    |
|  362   |    69    |
|  195   |    91    |
|  271   |    91    |
|  470   |    91    |
|  613   |    91    |
|   38   |   113    |
|  264   |   113    |
|  512   |   113    |
|  521   |   113    |
|  571   |   113    |
|  591   |   113    |
|  636   |   113    |
|  171   |   135    |
|  445   |   135    |
|  515   |   135    |
|  535   |   135    |
|  580   |   135    |
|  606   |   135    |
|  714   |   135    |
|  723   |   135    |
|   88   |   157    |
|  452   |   157    |
|  666   |   157    |
|  265   |   179    |
|  415   |   179    |
|  604   |   179    |
|  642   |   179    |
|  192   |   201    |
|  201   |   201    |
|  398   |   201    |
|  661   |   201    |
|   98   |   223    |
|  19

In [18]:
table = PrettyTable()
table.title = 'Wystąpienia obrazu \'v\''
table.field_names = ['x', 'y']
for i, j in results[1]:
    table.add_row([j+1, i+1])
print(table)
print('Liczba wystąpień wzorca:', len(results[1]))

+-------------------+
| Wystąpienia obrazu 'v' |
+--------+----------+
|   x    |    y     |
+--------+----------+
|   64   |   135    |
|  700   |   201    |
|  607   |   289    |
|   95   |   377    |
|  588   |   575    |
|  618   |   597    |
|  163   |   663    |
|  266   |   707    |
|  316   |   729    |
|  321   |   751    |
|  348   |   751    |
|  514   |   817    |
|  672   |   839    |
|  714   |   1081   |
|   97   |   1169   |
|  356   |   1213   |
|  636   |   1213   |
|  736   |   1235   |
|  195   |   1257   |
|  547   |   1279   |
|   35   |   1301   |
|  349   |   1323   |
|  364   |   1433   |
|  420   |   1455   |
|  385   |   1499   |
|  307   |   1521   |
|  134   |   1565   |
|  546   |   1587   |
|   89   |   1807   |
|  259   |   1829   |
+--------+----------+
Liczba wystąpień wzorca: 30


In [19]:
table = PrettyTable()
table.title = 'Wystąpienia obrazu \'c\''
table.field_names = ['x', 'y']
for i, j in results[2]:
    table.add_row([j+1, i+1])
print(table)
print('Liczba wystąpień wzorca:', len(results[2]))

+-------------------+
| Wystąpienia obrazu 'c' |
+--------+----------+
|   x    |    y     |
+--------+----------+
|  378   |    69    |
|  428   |    69    |
|  597   |    69    |
|  698   |    69    |
|  336   |    91    |
|  386   |    91    |
|  436   |    91    |
|  514   |    91    |
|  678   |    91    |
|   54   |   113    |
|  315   |   113    |
|  492   |   113    |
|  689   |   113    |
|  124   |   135    |
|  489   |   135    |
|  694   |   135    |
|   72   |   157    |
|  601   |   157    |
|  181   |   179    |
|  231   |   179    |
|  442   |   179    |
|  452   |   179    |
|  652   |   179    |
|  689   |   179    |
|  172   |   201    |
|   51   |   223    |
|  173   |   223    |
|  102   |   245    |
|  216   |   245    |
|  309   |   245    |
|   34   |   267    |
|  444   |   267    |
|  476   |   267    |
|  577   |   267    |
|  614   |   267    |
|  665   |   267    |
|  363   |   289    |
|  419   |   289    |
|  441   |   289    |
|  652   |   289    |
|   5

In [20]:
table = PrettyTable()
table.title = 'Wystąpienia obrazu \'u\''
table.field_names = ['x', 'y']
for i, j in results[3]:
    table.add_row([j+1, i+1])
print(table)
print('Liczba wystąpień wzorca:', len(results[3]))

+-------------------+
| Wystąpienia obrazu 'u' |
+--------+----------+
|   x    |    y     |
+--------+----------+
|  302   |    47    |
|  304   |    91    |
|  504   |    91    |
|  646   |    91    |
|  321   |   135    |
|  479   |   135    |
|  597   |   135    |
|  617   |   135    |
|  211   |   157    |
|  463   |   179    |
|  619   |   201    |
|  667   |   223    |
|  113   |   245    |
|  184   |   245    |
|  209   |   267    |
|  527   |   267    |
|  687   |   267    |
|  174   |   333    |
|   35   |   355    |
|  377   |   355    |
|  687   |   355    |
|  335   |   377    |
|  397   |   377    |
|  693   |   443    |
|  512   |   465    |
|  223   |   509    |
|  252   |   575    |
|  538   |   641    |
|  687   |   641    |
|  190   |   663    |
|  321   |   685    |
|  159   |   707    |
|  137   |   773    |
|  222   |   773    |
|  712   |   773    |
|   73   |   795    |
|  502   |   795    |
|  286   |   817    |
|  406   |   817    |
|  669   |   861    |
|  11

# Znajdź wszystkie wystąpienia słowa "p a t t e r n" w haystack.png. 

In [21]:
with Image.open('haystack.png') as img, \
        Image.open('patterns/pattern.png') as pattern_img:
    haystack = image_to_matrix(img)
    
    letter = image_to_matrix(pattern_img)
    
    patterns = letter.T

    trie, _  = build_trie(patterns)
    transitions, _, two_d_match = build_multi_sma(trie, patterns)

    row_transition = build_sma(two_d_match)

    result_pattern = find_2d_pattern(haystack, transitions, row_transition)

In [22]:
table = PrettyTable()
table.title = 'Wystąpienia obrazu \'p a t t e r n\''
table.field_names = ['x', 'y']
for i, j in result_pattern:
    table.add_row([j+1, i+1])
print(table)
print('Liczba wystąpień wzorca:', len(result_pattern))

+---------------+
| Wystąpienia obrazu 'p a t t e r n' |
+-------+-------+
|   x   |   y   |
+-------+-------+
+-------+-------+
Liczba wystąpień wzorca: 0


# Porównaj czas budowania automatu i czas wyszukiwania dla różnych rozmiarów wzorca 

In [31]:
def test():
    building_times = []
    matching_time = []
    pattern_names = ['small.png', 'medium.png', 'large.png']
    with Image.open('haystack.png') as img, \
        Image.open('patterns/small.png') as small_img, \
        Image.open('patterns/medium.png') as medium_img, \
        Image.open('patterns/large.png') as large_img:

        haystack = image_to_matrix(img)
        results = []
        for pattern_img in [small_img, medium_img, large_img]:
            pattern = image_to_matrix(pattern_img)

            result = []
            patterns = pattern.T

            start_building_time = time()
            trie, _  = build_trie(patterns)
            transitions, _, two_d_match = build_multi_sma(trie, patterns)
            row_transition = build_sma(two_d_match)
            end_building_time = time()

            start_matching_time = time()
            res = find_2d_pattern(haystack, transitions, row_transition)
            end_matching_time = time()

            results.append(res)
            building_times.append(end_building_time-start_building_time)
            matching_time.append(end_matching_time - start_matching_time)
    table = PrettyTable()
    table.title = "Czasy [s]"
    table.add_column('wzorzec', pattern_names)
    table.add_column('budownie automatu', building_times)
    table.add_column('wyszukiwanie', matching_time)
    return table

In [32]:
print(test())

+---------------------------------------------------------+
|                        Czasy [s]                        |
+------------+-----------------------+--------------------+
|  wzorzec   |   budownie automatu   |    wyszukiwanie    |
+------------+-----------------------+--------------------+
| small.png  | 0.0030078887939453125 | 1.4270544052124023 |
| medium.png |   0.9157960414886475  | 1.6280052661895752 |
| large.png  |   14.598987102508545  | 2.474362850189209  |
+------------+-----------------------+--------------------+
