# Naive

In [1]:
def naive_search(s, pattern):
    """Find every occurrence of ``pattern`` in ``s`` by brute force.

    Each candidate start index is tested by slicing a window of
    ``len(pattern)`` items out of ``s`` and comparing it with ``pattern``.

    Args:
        s: the text (any sliceable sequence comparable with ``pattern``).
        pattern: the sequence to look for.

    Returns:
        A list of start indices, in increasing order; overlapping
        occurrences are all reported.
    """
    width = len(pattern)
    # Last index at which a full-width window still fits inside `s`.
    last_start = len(s) - width
    return [
        start
        for start in range(last_start + 1)
        if s[start:start + width] == pattern
    ]

In [2]:
# Sanity check on a text in which occurrences of the pattern overlap.
naive_search("asasasasas", "asas")

# Finite-state machine

In [3]:
def transition_table(pattern):
    """Build the transition table of the string-matching automaton for ``pattern``.

    The automaton has ``len(pattern) + 1`` states; state ``len(pattern)`` is
    the accepting one.  Being in state ``q`` means that the longest prefix of
    ``pattern`` that is also a suffix of the text read so far has length ``q``.

    Args:
        pattern: the string to search for.  May be empty.

    Returns:
        A list of ``len(pattern) + 1`` dicts where ``table[q][c]`` is the
        state reached from state ``q`` on character ``c``.  Only characters
        that occur in ``pattern`` have entries.  For an empty pattern the
        result is ``[{}]`` (a single state without transitions).
    """
    m = len(pattern)
    tr = [{} for _ in range(m + 1)]
    if m == 0:
        # No characters, hence no transitions; indexing pattern[0] below
        # would raise IndexError.
        return tr

    alphabet = set(pattern)

    # From the start state only the first pattern character makes progress.
    tr[0] = dict.fromkeys(alphabet, 0)
    tr[0][pattern[0]] = 1

    # `restart` is the state the automaton would be in after reading
    # pattern[1:i]; on a mismatch in state i it behaves like that state.
    restart = 0

    for i in range(1, m + 1):
        # Every row holds exactly the symbols of `alphabet`, so copying the
        # restart row fills in all mismatch transitions of state i.
        tr[i] = dict(tr[restart])

        if i < m:
            tr[i][pattern[i]] = i + 1
            restart = tr[restart][pattern[i]]
    return tr

In [4]:
def fine_state_machine_search(s, pattern, Q=None):
    """Find every occurrence of ``pattern`` in ``s`` with a finite-state machine.

    Args:
        s: the text to scan.
        pattern: the string to look for.
        Q: optional precomputed transition table for ``pattern`` (as built by
            ``transition_table``); computed here when omitted.

    Returns:
        A list of start indices, in increasing order; overlapping
        occurrences are all reported.  An empty pattern matches at every
        position ``0..len(s)``, the same answer ``naive_search`` gives.
    """
    m = len(pattern)
    if m == 0:
        # The automaton cannot be built for an empty pattern; answer
        # directly instead of failing.
        return list(range(len(s) + 1))

    if Q is None:
        Q = transition_table(pattern)

    state = 0
    result = []

    for idx, c in enumerate(s):
        # Characters without an entry in the table do not occur in the
        # pattern, so they send the automaton back to the start state.
        state = Q[state][c] if c in Q[state] else 0
        if state == m:
            # Accepting state reached: the match ends at `idx`.
            result.append(idx - m + 1)
    return result

In [5]:
# Sanity check on a text in which occurrences of the pattern overlap.
fine_state_machine_search("asasasasas", "asas")

[0, 2, 4, 6]

# KMP

In [6]:
def prefix_function(pattern):
    """Compute the KMP prefix function of ``pattern``.

    Args:
        pattern: the sequence to analyse.

    Returns:
        A list ``pi`` of ``len(pattern)`` integers where ``pi[i]`` is the
        length of the longest proper prefix of ``pattern[:i + 1]`` that is
        also a suffix of it.
    """
    size = len(pattern)
    pi = [0] * size

    border = 0  # length of the border currently being extended
    for pos in range(1, size):
        current = pattern[pos]

        # Fall back to shorter borders until one can be extended by `current`.
        while border > 0 and pattern[border] != current:
            border = pi[border - 1]

        if pattern[border] == current:
            border += 1

        pi[pos] = border

    return pi

def kmp_search(s, pattern, pi = None):
    """Find every occurrence of ``pattern`` in ``s`` with Knuth-Morris-Pratt.

    Args:
        s: the text to scan.
        pattern: the string to look for.
        pi: optional precomputed prefix function of ``pattern`` (as returned
            by ``prefix_function``); computed here when omitted.

    Returns:
        A list of start indices, in increasing order; overlapping
        occurrences are all reported.  An empty pattern matches at every
        position ``0..len(s)``, the same answer ``naive_search`` gives.
    """
    m = len(pattern)
    if m == 0:
        # pattern[0] does not exist, so the scan below would raise
        # IndexError on the first text character.
        return list(range(len(s) + 1))

    if pi is None:
        pi = prefix_function(pattern)

    result = []
    matched = 0  # number of pattern characters currently matched

    for idx, a in enumerate(s):
        # On a mismatch, fall back to the next shorter border of the
        # matched prefix instead of restarting from scratch.
        while matched > 0 and pattern[matched] != a:
            matched = pi[matched - 1]

        if pattern[matched] == a:
            matched += 1

            if matched == m:
                result.append(idx + 1 - m)
                # Continue from the longest border so that overlapping
                # occurrences are found as well.
                matched = pi[matched - 1]

    return result

In [7]:
# Sanity check on a text in which occurrences of the pattern overlap.
kmp_search("asasasasas", "asas")

[0, 2, 4, 6]

In [8]:
def test_on_document(document_name, pattern, patern_search):
    with open(document_name) as f:
        text = f.read()
        
        %timeit patern_search(text, pattern)
        result = patern_search(text, pattern)
        return result

In [9]:
print('naive search')
# Keep the match positions so the algorithms' results can be compared.
result_naive = test_on_document('ustawa.txt', 'art', naive_search)

naive search
38.8 ms ± 295 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
print('finite-state machine search')
# Keep the match positions so the algorithms' results can be compared.
result_automat = test_on_document('ustawa.txt', 'art', fine_state_machine_search)

finite-state machine search
25.6 ms ± 2.36 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
print('kmp search')
# Keep the match positions so the algorithms' results can be compared.
result_kmp = test_on_document('ustawa.txt', 'art', kmp_search)

kmp search
25.9 ms ± 773 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
# Cross-check: all three algorithms must report the same match positions.
result_naive == result_automat == result_kmp

True

In [13]:
# Note: `text` and `pattern` are bound at notebook level here.
with open('wikipedia-tail-kruszwil.txt') as f:
    text = f.read()
    pattern = 'kruszwil'
    # -n1 -r3: a single call per run, three runs.
    %timeit -n1 -r3 naive_search(text, pattern)

48.8 s ± 1 s per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [14]:
# Note: `text` and `pattern` are bound at notebook level here.
with open('wikipedia-tail-kruszwil.txt') as f:
    text = f.read()
    pattern = 'kruszwil'
    # -n1 -r3: a single call per run, three runs; the timing includes
    # building the prefix function because `pi` is not passed in.
    %timeit -n1 -r3 kmp_search(text, pattern)

32.8 s ± 29.9 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [15]:
# Note: `text` and `pattern` are bound at notebook level here.
with open('wikipedia-tail-kruszwil.txt') as f:
    text = f.read()
    pattern = 'kruszwil'
    # -n1 -r3: a single call per run, three runs; the timing includes
    # building the transition table because `Q` is not passed in.
    %timeit -n1 -r3 fine_state_machine_search(text, pattern)

36.1 s ± 74.6 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


# Worst case naive

The worst case for the naive algorithm is a long text combined with a long pattern that almost matches at every position.

In [16]:
def test(text, patter, func):
    %timeit func(text, pattern)

In [17]:
# The text consists only of 'a'; the pattern is 10,000 'a's followed by a
# 'b', so it never occurs, yet every window agrees with it on all but the
# last character.
text = 'a'*600000
pattern = 'a' * 10000 + 'b'

In [18]:
# Time the naive search on the near-match input.
test(text, pattern, naive_search)

401 ms ± 6.82 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
# Time KMP on the near-match input (prefix-function construction included).
test(text, pattern, kmp_search)

172 ms ± 691 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
# Time the automaton search on the near-match input (table construction included).
test(text, pattern, fine_state_machine_search)

74.6 ms ± 320 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Long preprocessing

The worst case for finite-state machine preprocessing is a long pattern over a large alphabet.

In [21]:
# This is a pangram (it contains every letter of the English alphabet), so
# the set of distinct characters in the pattern is large.
pattern = 'The quick brown fox jumps over the lazy dog' 

In [22]:
# Preprocessing cost of KMP for the pangram pattern.
%timeit prefix_function(pattern)

6.33 µs ± 16.6 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [23]:
%timeit transition_table(patter)

5.02 ms ± 26.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
