Наивный алгоритм

In [1]:
def native_search(text, pattern, debug = False):
    """Brute-force substring search.

    Slides a window of ``len(pattern)`` over ``text`` from left to right and
    compares each window with ``pattern`` until the first match.

    Args:
        text: Sequence to search in.
        pattern: Sequence to look for.
        debug: When true, return the number of windows examined instead of
            the match position.

    Returns:
        Index of the first match, or ``None`` when there is none. With
        ``debug`` set, the count of window comparisons performed.
    """
    width = len(pattern)
    last_start = len(text) - width  # negative when the pattern is longer than the text
    attempts = 0
    found_at = None

    start = 0
    while start <= last_start:
        attempts += 1
        window = text[start:start + width]
        if window == pattern:
            found_at = start
            break
        start += 1

    return attempts if debug else found_at

Алгоритм Бойера-Мура-Хорспула


In [2]:
def bmPredCompil(_pattern):
    """Build the shift table used by ``boyer_moore_horspool_search``.

    Args:
        _pattern: The pattern being searched for.

    Returns:
        dict mapping every character of the pattern to
        ``len(_pattern) - i`` where ``i`` is the index of its last
        occurrence (later occurrences overwrite earlier ones).
    """
    _len_pattern = len(_pattern)
    return {char: _len_pattern - i for i, char in enumerate(_pattern)}


def boyer_moore_horspool_search(text, pattern, debug = False):
    """Search ``pattern`` in ``text`` comparing each window right-to-left.

    ``start_index`` is the index just past the current window. On a mismatch
    the window is shifted by the table entry of the text character at
    ``start_index`` (the character immediately after the window), or by
    ``len(pattern)`` when that character does not occur in the pattern.
    NOTE(review): looking at the character *after* the window resembles
    Sunday's Quick Search rule rather than classic Horspool - confirm the
    intended variant.

    Args:
        text: String to search in.
        pattern: String to look for.
        debug: When true, return the number of character comparisons
            instead of the match position.

    Returns:
        Index of the first match, or ``None`` when there is none (an empty
        pattern matches at 0). With ``debug`` set, the comparison count.
    """
    counter = 0
    ans = None

    len_text    = len(text)
    len_pattern = len(pattern)
    offset_dict = bmPredCompil(pattern)

    start_index = index_pattern = index_text = len_pattern

    while index_pattern > 0 and start_index <= len_text:
        counter += 1
        if text[index_text - 1] == pattern[index_pattern - 1]:
            index_text    -= 1
            index_pattern -= 1
        else:
            # The window already ends at the end of the text: there is no
            # following character to read and no further window to try.
            # (Previously this read text[len_text] and raised IndexError.)
            if start_index >= len_text:
                break
            start_index  += offset_dict.get(text[start_index], len_pattern)
            index_text    = start_index
            index_pattern = len_pattern

    if index_pattern <= 0:
        ans = index_text
    if debug: return counter
    return ans

Алгоритм Рабина-Карпа

In [3]:
def rabin_karp_search(text, pattern, debug=False):
    """Search ``pattern`` in ``text`` with a rolling hash (Rabin-Karp).

    The hash of each window is compared with the hash of the pattern; only
    when the hashes agree are the characters compared one by one.

    Args:
        text: String to search in.
        pattern: String to look for.
        debug: When true, return the number of character comparisons
            instead of the match position.

    Returns:
        Index of the first match, or ``None`` when there is none (an empty
        pattern matches at 0). With ``debug`` set, the number of character
        comparisons performed, whether or not a match was found.
    """
    counter = 0

    d = 10  # radix of the polynomial hash
    q = 13  # modulus of the polynomial hash
    len_pattern = len(pattern)
    len_text = len(text)

    # An empty pattern matches at position 0 without any comparison.
    if len_pattern == 0:
        return counter if debug else 0
    # A pattern longer than the text cannot match; bail out before the
    # initial hashing loop indexes past the end of the text.
    if len_pattern > len_text:
        return counter if debug else None

    # Weight of the leading character of a window: d**(len_pattern-1) mod q.
    h = pow(d, len_pattern - 1, q)

    p = 0  # hash of the pattern
    t = 0  # hash of the current text window
    for i in range(len_pattern):
        p = (d * p + ord(pattern[i])) % q
        t = (d * t + ord(text[i])) % q

    for i in range(len_text - len_pattern + 1):
        if p == t:
            # Hashes agree: verify character by character.
            for j in range(len_pattern):
                counter += 1
                if text[i + j] != pattern[j]:
                    break
            else:
                return counter if debug else i
        if i < len_text - len_pattern:
            # Roll the hash: drop text[i], append text[i + len_pattern].
            # Python's % already yields a value in [0, q) for positive q.
            t = (d * (t - ord(text[i]) * h) + ord(text[i + len_pattern])) % q

    return counter if debug else None

Алгоритм Кнута-Морриса-Пратта


In [4]:
def get_prefix(_pattern):
    """Compute the prefix function of ``_pattern``.

    Args:
        _pattern: The pattern to analyse.

    Returns:
        dict mapping each index ``i`` to the length of the longest proper
        prefix of ``_pattern[:i + 1]`` that is also its suffix. Key ``0`` is
        always present with value ``0``, even for an empty pattern.
    """
    table = {0: 0}
    for pos in range(1, len(_pattern)):
        symbol = _pattern[pos]
        border = table[pos - 1]
        # Fall back to shorter borders until one can be extended by `symbol`.
        while border > 0 and _pattern[border] != symbol:
            border = table[border - 1]
        table[pos] = border + 1 if _pattern[border] == symbol else border
    return table


def knuth_morris_pratt_search(text, pattern, debug = False):
    """Search ``pattern`` in ``text`` with the Knuth-Morris-Pratt algorithm.

    The text cursor never moves backwards; after a mismatch the pattern
    cursor falls back according to the table returned by ``get_prefix``.

    Args:
        text: String to search in.
        pattern: String to look for.
        debug: When true, return the number of loop steps instead of the
            match position.

    Returns:
        Index of the first match, or ``None`` when there is none. With
        ``debug`` set, the number of loop steps performed.
    """
    steps = 0
    text_size, pattern_size = len(text), len(pattern)
    borders = get_prefix(pattern)

    text_pos = pattern_pos = 0
    while text_pos < text_size and pattern_pos < pattern_size:
        steps += 1
        if pattern[pattern_pos] == text[text_pos]:
            text_pos += 1
            pattern_pos += 1
            continue
        if pattern_pos:
            # Reuse the longest border of the part matched so far.
            pattern_pos = borders.get(pattern_pos - 1)
        else:
            text_pos += 1

    if debug:
        return steps
    # The whole pattern was consumed exactly when a match was found.
    return text_pos - pattern_pos if pattern_pos == pattern_size else None

# Test

In [5]:
!wget https://github.com/alexgiving/data_collab/raw/main/benchmarks.rar >> /dev/null
!unrar x benchmarks.rar  >> /dev/null

--2022-04-22 14:46:12--  https://github.com/alexgiving/data_collab/raw/main/benchmarks.rar
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexgiving/data_collab/main/benchmarks.rar [following]
--2022-04-22 14:46:12--  https://raw.githubusercontent.com/alexgiving/data_collab/main/benchmarks.rar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8285 (8.1K) [application/octet-stream]
Saving to: ‘benchmarks.rar’


2022-04-22 14:46:13 (55.5 MB/s) - ‘benchmarks.rar’ saved [8285/8285]



In [6]:
def _read_benchmark(name):
    """Return the contents of ``benchmarks/<name>.txt``, closing the file afterwards."""
    with open(f"benchmarks/{name}.txt", "r") as benchmark_file:
        return benchmark_file.read()


# Naming scheme: <bad|good>_<t|w>_<n>; each *_t_<n> is paired with *_w_<n>
# as (text, word) by the timing cell.
bad_t_1 = _read_benchmark("bad_t_1")
bad_t_2 = _read_benchmark("bad_t_2")
bad_t_3 = _read_benchmark("bad_t_3")
bad_t_4 = _read_benchmark("bad_t_4")

bad_w_1 = _read_benchmark("bad_w_1")
bad_w_2 = _read_benchmark("bad_w_2")
bad_w_3 = _read_benchmark("bad_w_3")
bad_w_4 = _read_benchmark("bad_w_4")

good_t_1 = _read_benchmark("good_t_1")
good_t_2 = _read_benchmark("good_t_2")
good_t_3 = _read_benchmark("good_t_3")
good_t_4 = _read_benchmark("good_t_4")

good_w_1 = _read_benchmark("good_w_1")
good_w_2 = _read_benchmark("good_w_2")
good_w_3 = _read_benchmark("good_w_3")
good_w_4 = _read_benchmark("good_w_4")


In [7]:
from statistics import mean 
import pandas as pd
import time

# Per-algorithm result columns: one formatted entry per test case, in
# test_cases order. Re-created on every run of this cell.
native_time               = []
boyer_moore_horspool_time = []
knuth_morris_pratt_time   = []
rabin_karp_time           = []

# Number of timed repetitions averaged for each (test case, algorithm) pair.
iters = 50

# Test-case label -> (text, word to search for).
test_cases = {"bad_1" : (bad_t_1,  bad_w_1),
              "bad_2" : (bad_t_2,  bad_w_2),
              "bad_3" : (bad_t_3,  bad_w_3),
              "bad_4" : (bad_t_4,  bad_w_4),
              "good_1": (good_t_1, good_w_1),
              "good_2": (good_t_2, good_w_2),
              "good_3": (good_t_3, good_w_3),
              "good_4": (good_t_4, good_w_4)
}


# Display name -> (search function, list collecting its results).
search_funcs = {"Native-Search"             : (native_search,               native_time),
                "Boyer–Moore-Search"        : (boyer_moore_horspool_search, boyer_moore_horspool_time),
                "Knuth-Morris-Pratt-Search" : (knuth_morris_pratt_search,   knuth_morris_pratt_time),
                "Rabin-Karp-Search"         : (rabin_karp_search,           rabin_karp_time)
}


for text, word in test_cases.values():
    for func, time_array in search_funcs.values():
        durations = []
        iterations = None  # named so as not to shadow the builtin iter()
        for _ in range(iters):
            # perf_counter is the high-resolution clock meant for measuring
            # short intervals; time.time() may be too coarse at this scale.
            start_timer = time.perf_counter()
            iterations = func(text, word, True)  # debug=True -> step counter
            stop_timer = time.perf_counter()
            durations.append(stop_timer - start_timer)
        time_array.append(f'sec={mean(durations):.10f}, iter={iterations}')

# One row per algorithm, one column per test case, then transposed so that
# test cases become rows and algorithms become columns.
data = pd.DataFrame(data=[times for _, times in search_funcs.values()],
            columns=test_cases.keys(),
            index=search_funcs.keys()
            ).T

In [8]:
# Render the benchmark table (rows: test cases, columns: algorithms).
display(data)

Unnamed: 0,Native-Search,Boyer–Moore-Search,Knuth-Morris-Pratt-Search,Rabin-Karp-Search
bad_1,"sec=0.0000023890, iter=9","sec=0.0000031710, iter=6","sec=0.0000069237, iter=18","sec=0.0000066280, iter=2"
bad_2,"sec=0.0000188112, iter=91","sec=0.0000199366, iter=55","sec=0.0000588369, iter=190","sec=0.0000521517, iter=10"
bad_3,"sec=0.0001917982, iter=901","sec=0.0002117157, iter=550","sec=0.0006087923, iter=1900","sec=0.0005240440, iter=100"
bad_4,"sec=0.0010772800, iter=4001","sec=0.0012444162, iter=3000","sec=0.0034785843, iter=9000","sec=0.0027277374, iter=1000"
good_1,"sec=0.0001266623, iter=600","sec=0.0000403070, iter=83","sec=0.0002033138, iter=633","sec=0.0004016256, iter=74"
good_2,"sec=0.0001414156, iter=611","sec=0.0000544834, iter=101","sec=0.0002297497, iter=695","sec=0.0004578495, iter=128"
good_3,"sec=0.0004283524, iter=1630","sec=0.0002254343, iter=426","sec=0.0007815027, iter=2066","sec=0.0013457108, iter=506"
good_4,"sec=0.0022618389, iter=9523","sec=0.0002512264, iter=504","sec=0.0029135513, iter=9614","sec=0.0065657759, iter=826"
