# Lab6 - Wzorce dwuwymiarowe
Aneta Porębska

In [1]:
class Node:
    def __init__(self, leaf = -1):
        self.children = dict()
        self.leaf = leaf # numer stanu akceptującego
        self.fail_link = None

In [2]:
def find_prefix(root, start, letter_arr):
    curr = root
    for i in range(start, len(letter_arr)):
        if letter_arr[i] in curr.children.keys():
            curr = curr.children[letter_arr[i]]
        else:
            return None
    return curr
    

In [3]:
def get_pattern_dict(patterns):
    
    d = {}
    idx = 0
    for pattern in patterns:
        if not pattern in d.keys():
            d[pattern] = idx
            idx +=1
    
    return d

In [4]:
def construct_trie(patterns_arr):
    root = Node()
    pattern_dict = get_pattern_dict(patterns_arr)
    # trie
    for pattern in patterns_arr:
        curr = root
        for letter in pattern:
            if letter in curr.children.keys():
                curr = curr.children[letter]
            else:
                new_node = Node()
                curr.children[letter] = new_node
                curr = new_node
        curr.leaf = pattern_dict[pattern]
        
    
    # add fail links
    for pattern in patterns_arr:
        curr = root
        lett = []
        for letter in pattern:
            lett.append(letter)
            curr = curr.children[letter]
            if len(lett)==1:
                curr.fail_link = root
            else:
                start = 1
                node = find_prefix(root, start, lett)
                while node == None and start<len(lett): 
                    start+=1
                    node = find_prefix(root, start, lett)
                if node != None:
                    curr.fail_link = node
                else:
                    curr.fail_link = root
    
    return root
    
    

In [5]:
def traverse_trie(root, depth):
    print(depth, root.children.keys(), end ="")
    if root.fail_link:
        print(root.fail_link.children.keys())
    else:
        print()
    if root.children:
        for key in root.children.keys():
            traverse_trie(root.children[key], depth+1)

In [6]:
def find_multiple_patterns(trie, text):
    arr = [-1]*len(text)
    curr = trie
    for i in range(len(text)):
        letter = text[i]
        if letter in curr.children.keys():
            curr = curr.children[letter]
        else:
            curr = curr.fail_link
        if curr == None: # trafiliśmy do korzenia i nic nie pasuje
            curr = trie
            continue
        if curr.leaf != -1:
            arr[i] = curr.leaf
    
    return arr

In [7]:
def prefix_function(pattern):
    pi = [0]
    k = 0
    for q in range(1, len(pattern)):
        while(k > 0 and pattern[k] != pattern[q]):
            k = pi[k-1]
        if(pattern[k] == pattern[q]):
            k = k + 1
        pi.append(k)
    return pi


def kmp_string_matching(pi, text, pattern):
    
    result = []
    q = 0
    for i in range(0, len(text)):
        while(q > 0 and pattern[q] != text[i]):
            q = pi[q-1]
        if(pattern[q] == text[i]):
            q = q + 1
        if(q == len(pattern)):
            result.append(i)
            q = pi[q-1]
            
    return result

In [30]:
import time

def find_2d_pattern(pattern, text, pr = True, measure_t = False, find_all = 0):
    
    
    arr_text = [None]*len(text)
    pattern_dict = get_pattern_dict(pattern)
    constr_start = time.time()
    trie = construct_trie(pattern)
    constr_stop = time.time()
    max_n = -1 #maksymalna długość linijki
    
    find_start = time.time()
    for i in range(len(text)):
        arr_text[i] = find_multiple_patterns(trie, text[i])
        #print(arr_text[i])
        max_n = max(max_n, len(arr_text[i]))
    
    for i in range(len(text)):
        n =len(arr_text[i])
        for k in range(n, max_n):
            arr_text[i].append(-1)
    
    digit_patt = [-1]*len(pattern)
    for i in range(len(pattern)):
        digit_patt[i] = pattern_dict[pattern[i]]

    
    arr_text_T = [None]*max_n
    for i in range(max_n):
        arr_text_T[i] = [-1]*len(text)

    for i in range(max_n):
        for j in range(len(text)):
            arr_text_T[i][j] = arr_text[j][i]
    #print("Indexes for end of patterns (lower right indexes):")
    pi = prefix_function(digit_patt)
    for i in range(max_n):
        idx_arr = kmp_string_matching(pi, arr_text_T[i], digit_patt)
        if pr:
            for idx in idx_arr:
                print(idx, i)
    
    find_stop = time.time()
    if measure_t:
        print("Construction time:", constr_stop - constr_start)
        f = find_stop - find_start
        print("Finding time:", f)
        find_all +=f
    return find_all

In [9]:
pattern = ["ab",
           "ba"]

text = ["abbaab",
        "baabbadddd",
        "bbbaaa"]
# 3 wystąpienia powinno znaleźć
find_2d_pattern(pattern, text)

1 1
2 3
1 5


## Zadanie 2

In [10]:
file = open('haystack.txt')
text = file.read()
text = text.split('\n')
file.close()

In [11]:
from string import ascii_lowercase

for c in ascii_lowercase:
    pattern = [c,c]
    print(pattern)
    find_2d_pattern(pattern, text, True, False)
    print("\n\n")

['a', 'a']
65 2
38 4
21 6
57 11
53 12
54 12
65 14
77 21
65 22
60 24
4 30
66 35
70 35
58 36
59 36
80 37
78 42
54 48
32 50
79 59
6 60
78 61
7 63
34 66
29 69
32 73
77 74
1 82



['b', 'b']



['c', 'c']
42 0
69 0
11 45
4 54



['d', 'd']
38 19



['e', 'e']
11 1
15 2
25 3
18 6
81 6
2 8
21 10
41 11
82 14
70 15
68 17
73 23
41 26
19 27
74 27
52 31
43 36
30 38
72 38
16 43
30 43
69 46
83 47
38 48
43 48
71 49
48 50
59 50
47 52
23 53
58 54
59 54
42 57
22 61
1 63
11 64
8 65
25 65
79 65
64 66
29 67
66 69
67 72
29 73
60 73
5 77



['f', 'f']
78 1
31 59



['g', 'g']



['h', 'h']
28 2
38 2
74 12
57 31



['i', 'i']
32 0
2 5
74 13
78 13
56 17
32 31
45 33
9 37
61 45
69 51
20 55
10 60
53 69



['j', 'j']



['k', 'k']



['l', 'l']
34 45
54 45
29 72
42 77



['m', 'm']
45 0
17 5
35 60
29 70



['n', 'n']
32 1
2 9
36 18
65 29
52 32
55 33
68 35
20 37
68 40
15 54
21 56
68 57
22 62
1 83



['o', 'o']
42 1
54 1
80 10
34 11
28 17
29 17
34 26
11 27
33 34
7 38
8 38
72 42
59 45
82 52
45 55
31 58
16 60
6 66
5 7

## Zadanie 3

In [12]:
pattern = ["th", "t h"]
print(pattern)
find_2d_pattern(pattern, text)

['th', 't h']


## Zadanie 4 i Zadanie 5

In [13]:
from PIL import Image
def image_to_matrix(img: Image):
    pixel_map = img.load()
    pixels = []
    for row in range(img.height):
        pix = []
        for col in range(img.width):
            if pixel_map[col, row][0] == 255:
                pix.append(0) #white
            else:
                pix.append(1) #black or similar
        # convert bits to string
        s = [str(i) for i in pix]
        s = "".join(s)
        pixels.append(s)
    return pixels

In [14]:
image = Image.open('haystack.png')
a_lett = Image.open('a.png')
c_lett = Image.open('c.png')
e_lett = Image.open('e.png')
pattern = Image.open('pattern.png')

im = image_to_matrix(image)
a = image_to_matrix(a_lett)
c = image_to_matrix(c_lett)
e = image_to_matrix(e_lett)
p = image_to_matrix(pattern)
for row in a:
    print(row)
print()    
for row in c:
    print(row)
print()    
for row in e:
    print(row)
print()    
print("a:")
find_2d_pattern(a, im)
print("c:")
find_2d_pattern(c, im)
print("e:")
find_2d_pattern(e, im)
print("p a t t e r n :")
find_2d_pattern(p, im)




11111110
11111111
11111111
00000011
01111111
11111111
11111011
11111111
11111111

01111111
11111111
11111111
11100000
11000000
11000000
11100000
11111111
11111111
01111111

011111110
111111111
111111111
111000011
111111111
111111111
111000000
111111111
111111111
011111111

a:
221 39
c:
266 33
948 33
970 33
1454 33
1542 33
1564 33
1608 33
e:
310 34
1762 34
1828 34
p a t t e r n :


## Zadanie 6

In [15]:
find_2d_pattern(["011","011", "011"], im, pr=False, measure_t=True)
print()
find_2d_pattern(a, im, pr=False, measure_t=True)
print()
find_2d_pattern(p, im, pr=False, measure_t=True)
print()
pat2 = Image.open('patt2.png')
p2 = image_to_matrix(pat2)
find_2d_pattern(p2, im, pr=False, measure_t=True)

Construction time: 1.7404556274414062e-05
Finding time: 0.6265246868133545

Construction time: 0.00010609626770019531
Finding time: 0.6304998397827148

Construction time: 0.07992148399353027
Finding time: 0.6859278678894043

Construction time: 0.4109766483306885
Finding time: 0.6804931163787842


## Zadanie 7

In [33]:
def part(slices, pattern, text):
    n = len(text)
    l = n//slices
    find_all = 0
    for i in range(slices):
        find_all = find_2d_pattern(pattern, text[i*l:(i+1)*l], False, True, find_all)
    print()
    print("Total finding time:", find_all)
    

In [34]:
part(2, p2, im)

Construction time: 0.4181489944458008
Finding time: 0.3327329158782959
Construction time: 0.4268791675567627
Finding time: 0.33394575119018555

Total finding time: 0.6666786670684814


In [35]:
part(4, p2, im)

Construction time: 0.46812987327575684
Finding time: 0.17142176628112793
Construction time: 0.4167623519897461
Finding time: 0.17960715293884277
Construction time: 0.4220142364501953
Finding time: 0.16904306411743164
Construction time: 0.4245626926422119
Finding time: 0.16231346130371094

Total finding time: 0.6823854446411133


In [36]:
part(8, p2, im)

Construction time: 0.4170684814453125
Finding time: 0.08295035362243652
Construction time: 0.4452381134033203
Finding time: 0.07903671264648438
Construction time: 0.46268177032470703
Finding time: 0.07755160331726074
Construction time: 0.43882107734680176
Finding time: 0.07798361778259277
Construction time: 0.42842602729797363
Finding time: 0.08820295333862305
Construction time: 0.4134337902069092
Finding time: 0.07832074165344238
Construction time: 0.41245293617248535
Finding time: 0.07801461219787598
Construction time: 0.40730929374694824
Finding time: 0.07886385917663574

Total finding time: 0.6409244537353516


In [37]:
part(16, p2, im)

Construction time: 0.4757847785949707
Finding time: 0.043305158615112305
Construction time: 0.4185323715209961
Finding time: 0.04026222229003906
Construction time: 0.42504358291625977
Finding time: 0.04024481773376465
Construction time: 0.421764612197876
Finding time: 0.03954935073852539
Construction time: 0.4198756217956543
Finding time: 0.03851938247680664
Construction time: 0.4072234630584717
Finding time: 0.03911852836608887
Construction time: 0.46978020668029785
Finding time: 0.038728952407836914
Construction time: 0.42168545722961426
Finding time: 0.0390625
Construction time: 0.4278385639190674
Finding time: 0.04300832748413086
Construction time: 0.4503655433654785
Finding time: 0.0438838005065918
Construction time: 0.4284079074859619
Finding time: 0.04071784019470215
Construction time: 0.4246082305908203
Finding time: 0.03951668739318848
Construction time: 0.4675283432006836
Finding time: 0.03883218765258789
Construction time: 0.41579413414001465
Finding time: 0.0390899181365966