In [1]:
#different algorithms for finding a pattern in text

# ------------------- naive algorithm ------------------------

def naive_string_matching(text, pattern):
    """Count occurrences of `pattern` in `text` by checking every shift.

    Every shift from 0 to len(text) - len(pattern) is tried and the window
    of len(pattern) items starting there is compared with `pattern`.
    Overlapping occurrences are counted separately.

    Args:
        text: sequence to search in.
        pattern: sequence to look for.

    Returns:
        Number of shifts at which the window equals `pattern`.
    """
    width = len(pattern)
    shift_count = len(text) - width + 1
    # A match at `shift` means text[shift:shift + width] is the pattern.
    return sum(
        1
        for shift in range(shift_count)
        if pattern == text[shift:shift + width]
    )
            
            
# ------------------- finite automaton ------------------------

import re

def transition_table(pattern, alphabet):
    """Build the transition table of the string-matching automaton.

    Args:
        pattern: the string to search for.
        alphabet: iterable of symbols for which transitions are generated.

    Returns:
        A list of len(pattern) + 1 dicts. result[q][a] is the state reached
        from state q after reading symbol a, i.e. the length of the longest
        prefix of `pattern` that is a suffix of pattern[:q] + a.
    """
    m = len(pattern)
    result = []
    for q in range(m + 1):
        row = {}
        for a in alphabet:
            seen = pattern[:q] + a
            # Compare suffixes literally. Building a regex from the pattern
            # misinterprets metacharacters such as '.', '+' or '(' and lets
            # '$' match before a trailing newline.
            k = min(m, q + 1)
            # Terminates at the latest for k == 0: every string ends with "".
            while not seen.endswith(pattern[:k]):
                k -= 1
            row[a] = k
        result.append(row)
    return result

def fa_string_matching(text, delta):
    """Count occurrences of a pattern by running its automaton over `text`.

    Args:
        text: sequence of symbols to scan.
        delta: transition table; delta[q][a] is the state reached from
            state q on symbol a. The last state, len(delta) - 1, is the
            accepting state.

    Returns:
        Number of times the accepting state is entered while reading `text`.
    """
    accepting = len(delta) - 1
    matches = 0
    state = 0
    for symbol in text:
        transitions = delta[state]
        if symbol in transitions:
            state = transitions[symbol]
            if state == accepting:
                # The symbol just read is the last one of the match, so the
                # match starts at (index of this symbol) + 1 - state: the
                # shift lies just after the symbol that was consumed.
                matches += 1
        else:
            # Symbol has no transition from this state: restart from state 0.
            state = 0
    return matches
            
#------------ Knuth-Morris-Pratt algorithm --------------------

def prefix_function(pattern):
    """Compute the Knuth-Morris-Pratt prefix table for `pattern`.

    Entry q of the result is the length of the longest proper prefix of
    pattern[:q + 1] that is also a suffix of it. The table always starts
    with 0, so a pattern of length 0 or 1 yields [0].

    Args:
        pattern: the sequence to analyse.

    Returns:
        List of prefix lengths, one per position of `pattern` (and [0] for
        an empty pattern).
    """
    table = [0]
    border = 0
    for ch in pattern[1:]:
        # Fall back to shorter borders until one can be extended by `ch`.
        while border and pattern[border] != ch:
            border = table[border - 1]
        if pattern[border] == ch:
            border += 1
        table.append(border)
    return table

def kmp_string_matching(text, pattern):
    """Count occurrences of `pattern` in `text` with Knuth-Morris-Pratt.

    Overlapping occurrences are counted separately.

    Args:
        text: sequence to search in.
        pattern: sequence to look for.

    Returns:
        Number of positions in `text` at which `pattern` occurs. For an
        empty pattern this is len(text) + 1, the same result as
        naive_string_matching gives.
    """
    if not pattern:
        # Without this guard pattern[0] below raises IndexError. An empty
        # pattern matches at every shift 0..len(text).
        return len(text) + 1

    res = 0
    pi = prefix_function(pattern)
    q = 0  # number of pattern characters currently matched
    for i in range(0, len(text)):
        # On a mismatch, fall back to the longest border of the matched part.
        while(q > 0 and pattern[q] != text[i]):
            q = pi[q-1]
        if(pattern[q] == text[i]):
            q = q + 1
        if(q == len(pattern)):
            # Full match ending at index i, i.e. starting at i + 1 - q.
            res += 1
            # Continue from the longest border so overlaps are found too.
            q = pi[q-1]
    return res
            

In [2]:
# time evaluation
from time import perf_counter

def eval_func(text, pattern, func, count=20):
    """Return the average wall-clock time of func(text, pattern).

    Args:
        text: first argument passed to `func`.
        pattern: second argument passed to `func`.
        func: callable taking (text, pattern); its result is discarded.
        count: number of consecutive calls to time.

    Returns:
        Total elapsed time measured with perf_counter, divided by `count`.
    """
    started = perf_counter()
    for _ in range(count):
        func(text, pattern)
    elapsed = perf_counter() - started
    return elapsed / count


In [3]:
# summary comparing 3 algorithms for given pattern and text

def summary(text, pattern):
    """Print match counts and average timings of the three matchers.

    Runs the naive, KMP and finite-automaton matchers once each to print
    their match counts side by side, then prints the average running time
    of each as measured by eval_func. The transition table is built once,
    before timing, so the automaton timing covers only the scan of `text`.

    Args:
        text: text to search in.
        pattern: pattern to search for; its characters form the alphabet
            of the automaton.
    """
    print("summary for pattern ", pattern)

    naive_count = naive_string_matching(text, pattern)
    kmp_count = kmp_string_matching(text, pattern)
    delta = transition_table(pattern, set(pattern))
    fa_count = fa_string_matching(text, delta)
    print(naive_count, " ", kmp_count, " ", fa_count)

    # (label, second argument for the matcher, matcher) in printing order.
    timed_runs = (
        ("naive alg average time: ", pattern, naive_string_matching),
        ("kmp alg average time: ", pattern, kmp_string_matching),
        ("finite automat average time: ", delta, fa_string_matching),
    )
    for label, second_arg, matcher in timed_runs:
        print(label, eval_func(text, second_arg, matcher))


In [None]:
# Load the whole input text once. The context manager closes the file even
# if reading fails, which the bare open() call did not guarantee.
with open("ustawa.txt", "r", encoding='utf-8') as f:
    text = f.read()

# Compare the three matchers on several patterns; the repeated patterns are
# kept exactly as in the original run.
summary(text, "art")
summary(text, "przy")
summary(text, "przy")
summary(text, "podatników")
summary(text, "podatników")
summary(text, "przyc")
summary(text, "przyc"*400)



summary for pattern  art
273   273   273
naive alg average time:  0.12118183999999994
kmp alg average time:  0.14128960500000004
finite automat average time:  0.08933183500000004
summary for pattern  przy
262   262   262
naive alg average time:  0.10642110500000009
kmp alg average time:  0.09291586499999997
finite automat average time:  0.08487109000000004
summary for pattern  przy
262   262   262
naive alg average time:  0.1070067400000001
kmp alg average time:  0.09413776499999997
finite automat average time:  0.08587919
summary for pattern  podatników
29   29   29
naive alg average time:  0.11348639000000009
kmp alg average time:  0.09834390000000007
finite automat average time:  0.10487191500000001
summary for pattern  podatników
29   29   29
naive alg average time:  0.11235165999999985
kmp alg average time:  0.09896914999999992
finite automat average time:  0.11555256500000013
summary for pattern  przyc
141   141   141
naive alg average time:  0.10788824000000012
kmp alg average t

In [None]:
naive