Наивный алгоритм

In [1]:
def native_search(text, pattern, debug = False):
    counter = 0
    ans = None
    len_text, len_pattern = len(text), len(pattern)
    
    for i in range(len_text - len_pattern + 1):
        counter += 1
        if text[i : i + len_pattern] == pattern:
            ans = i
            break
    if debug: return counter
    return ans

Алгоритм Бойера-Мура-Хорспула


In [7]:
def boyer_moore_horspool_search(text, pattern, debug = False):
    counter = 0
    len_text, len_pattern = len(text), len(pattern)

    shift = dict()
    shift[pattern[-1]] = len_pattern
    for i in reversed(range(len(pattern) - 1 )):
        symbol = pattern[i]
        if symbol in shift:
            shift[symbol] = min(shift[symbol], len_pattern - i)
            continue
        shift[symbol] = len_pattern - i - 1
    i = 0
    while i < len_text:
        nextShift = 0
        match = True
        l = i + len_pattern - 1
        if l > len_text:
            return None
        for j in reversed(range(len_pattern)):
            counter += 1
            if pattern[j] == text[l]:
                nextShift = len_pattern
                l -= 1
            else:
                match = False
                if nextShift != 0:
                    i += nextShift
                    break
                if text[l] in shift:
                    i += shift[text[l]]
                    break
                else:
                    i += len_pattern - 1
                    break
        if match:
            if debug: return counter
            return i
    return None

Алгоритм Рабина-Карпа

In [3]:
def rabin_karp_search(text, pattern, debug=False):
    counter = 0
    ans = None
    len_text, len_pattern = len(text), len(pattern)

    hpattern = hash(pattern)
    for i in range(len_text-len_pattern+1):
        hs = hash(text[i:i+len_pattern])
        counter += 1
        if hs == hpattern:
            if text[i:i+len_pattern] == pattern:
                ans = i
                break
    if debug: return counter
    return ans

Алгоритм Кнутта-Мориса-Пратта


In [4]:
def get_prefix(_pattern):
    prefix_doct = {0: 0}
    for i in range(1, len(_pattern)):
        j = prefix_doct[i - 1]
        while j > 0 and _pattern[j] != _pattern[i]:
            j = prefix_doct[j - 1]
        if _pattern[j] == _pattern[i]:
            j += 1
        prefix_doct[i] = j
    return prefix_doct


def knuth_morris_pratt_search(text, pattern, debug = False):
    counter = 0
    ans = None

    len_text, len_pattern = len(text), len(pattern)
    prefix_doct = get_prefix(pattern)

    index_text = index_pattern = 0

    while index_text < len_text and index_pattern < len_pattern:
        counter += 1
        if pattern[index_pattern] == text[index_text]:
            index_text    += 1
            index_pattern += 1
        elif index_pattern == 0:
            index_text += 1
        else:
            index_pattern = prefix_doct.get(index_pattern - 1)
    else:
        if index_pattern == len_pattern:
            ans = index_text - index_pattern
    if debug: return counter
    return ans

# Test

In [5]:
!wget https://github.com/alexgiving/data_collab/raw/main/benchmarks.rar >> /dev/null
!unrar x benchmarks.rar  >> /dev/null

bad_t_1 = open(f"benchmarks/bad_t_1.txt", "r").read()
bad_t_2 = open(f"benchmarks/bad_t_2.txt", "r").read()
bad_t_3 = open(f"benchmarks/bad_t_3.txt", "r").read()
bad_t_4 = open(f"benchmarks/bad_t_4.txt", "r").read()

bad_w_1 = open(f"benchmarks/bad_w_1.txt", "r").read()
bad_w_2 = open(f"benchmarks/bad_w_2.txt", "r").read()
bad_w_3 = open(f"benchmarks/bad_w_3.txt", "r").read()
bad_w_4 = open(f"benchmarks/bad_w_4.txt", "r").read()

good_t_1 = open(f"benchmarks/good_t_1.txt", "r").read()
good_t_2 = open(f"benchmarks/good_t_2.txt", "r").read()
good_t_3 = open(f"benchmarks/good_t_3.txt", "r").read()
good_t_4 = open(f"benchmarks/good_t_4.txt", "r").read()

good_w_1 = open(f"benchmarks/good_w_1.txt", "r").read()
good_w_2 = open(f"benchmarks/good_w_2.txt", "r").read()
good_w_3 = open(f"benchmarks/good_w_3.txt", "r").read()
good_w_4 = open(f"benchmarks/good_w_4.txt", "r").read()

--2022-04-27 19:12:13--  https://github.com/alexgiving/data_collab/raw/main/benchmarks.rar
Resolving github.com (github.com)... 52.69.186.44
Connecting to github.com (github.com)|52.69.186.44|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexgiving/data_collab/main/benchmarks.rar [following]
--2022-04-27 19:12:14--  https://raw.githubusercontent.com/alexgiving/data_collab/main/benchmarks.rar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8285 (8.1K) [application/octet-stream]
Saving to: ‘benchmarks.rar’


2022-04-27 19:12:14 (75.9 MB/s) - ‘benchmarks.rar’ saved [8285/8285]



In [8]:
from statistics import mean 
import pandas as pd
import time

native_time               = []
boyer_moore_horspool_time = []
knuth_morris_pratt_time   = []
rabin_karp_time           = []

iters = 50

test_cases = {"bad_1" : (bad_t_1,  bad_w_1),
              "bad_2" : (bad_t_2,  bad_w_2),
              "bad_3" : (bad_t_3,  bad_w_3),
              "bad_4" : (bad_t_4,  bad_w_4),
              "good_1": (good_t_1, good_w_1),
              "good_2": (good_t_2, good_w_2),
              "good_3": (good_t_3, good_w_3),
              "good_4": (good_t_4, good_w_4)
}


search_funcs = {"Native-Search"               : (native_search,               native_time),
                "Boyer–Moore-Horspool-Search" : (boyer_moore_horspool_search, boyer_moore_horspool_time),
                "Knuth-Morris-Pratt-Search"   : (knuth_morris_pratt_search,   knuth_morris_pratt_time),
                "Rabin-Karp-Search"           : (rabin_karp_search,           rabin_karp_time)
}

def timer(_func):
    start_timer = time.time()
    _func
    stop_timer = time.time()
    return stop_timer - start_timer

for test_case in test_cases:
    text, word = test_cases.get(test_case)
    for func_name in search_funcs:
        func, time_array = search_funcs.get(func_name)
        if func(text, word) == text.find(word):
            temp_array = [timer(func(text, word)) for _ in range(iters)]
            time_array.append( f'sec={"%.10f" % mean(temp_array)}, iter={func(text, word, True)}')
        else: time_array.append('ERROR')

data = pd.DataFrame(data=[i[1:][0] for i in search_funcs.values()],
            columns=test_cases.keys(),
            index=search_funcs.keys()
            ).T

display(data)

Unnamed: 0,Native-Search,Boyer–Moore-Horspool-Search,Knuth-Morris-Pratt-Search,Rabin-Karp-Search
bad_1,"sec=0.0000001955, iter=9","sec=0.0000002289, iter=10","sec=0.0000000954, iter=18","sec=0.0000001335, iter=9"
bad_2,"sec=0.0000001287, iter=91","sec=0.0000002003, iter=100","sec=0.0000001192, iter=190","sec=0.0000001478, iter=91"
bad_3,"sec=0.0000002432, iter=901","sec=0.0000002193, iter=1000","sec=0.0000002718, iter=1900","sec=0.0000003386, iter=901"
bad_4,"sec=0.0000001717, iter=4001","sec=0.0000002527, iter=5000","sec=0.0000003052, iter=9000","sec=0.0000003767, iter=4001"
good_1,"sec=0.0000002146, iter=600","sec=0.0000002050, iter=84","sec=0.0000002384, iter=633","sec=0.0000002337, iter=600"
good_2,"sec=0.0000001717, iter=611","sec=0.0000002003, iter=105","sec=0.0000001955, iter=695","sec=0.0000001860, iter=611"
good_3,"sec=0.0000001860, iter=1630","sec=0.0000002003, iter=427","sec=0.0000002575, iter=2066","sec=0.0000002337, iter=1630"
good_4,"sec=0.0000002480, iter=9523","sec=0.0000002098, iter=425","sec=0.0000003099, iter=9614","sec=0.0000004339, iter=9523"
