## KMP substring search

KMP (Knuth-Morris-Pratt) is a substring search algorithm.
It runs in linear time: O(n + m) for a text of length n and a pattern of length m.
It preprocesses the pattern so that, when a mismatch occurs, we can use what has already matched to jump straight to the next possible position in the text and the pattern instead of starting over.

In [13]:
from dataclasses import dataclass

@dataclass(frozen=True)
class Jump:
    """Instruction describing how a search resumes after a mismatch.

    On a mismatch the search adds ``text_delta`` to its current text offset
    and continues comparing from ``pattern_index`` in the pattern.
    """

    # Amount added to the text offset of the current alignment.
    text_delta: int
    # Pattern position from which comparison resumes.
    pattern_index: int

def kmp_search(text: str, pattern: str) -> int:
    """Return the index of the first occurrence of ``pattern`` in ``text``.

    The pattern is preprocessed into a jump table (see ``build_jumps``). On a
    mismatch after ``p`` matched characters, ``jumps[p]`` says how far to
    advance the alignment in the text and from which pattern position to
    continue comparing.

    Args:
        text: The string to search in.
        pattern: The substring to look for.

    Returns:
        The lowest index ``i`` with ``text[i:i + len(pattern)] == pattern``,
        or ``-1`` if ``pattern`` does not occur in ``text``. An empty pattern
        matches at index 0, mirroring ``str.find``.
    """
    if not pattern:
        # An empty pattern has an empty jump table and no pattern[0] to
        # compare against; the loop below would raise IndexError (or report
        # -1 for empty text). It trivially matches at the very start.
        return 0

    jumps = build_jumps(pattern)
    # Debug trace of the precomputed table.
    print(f"{pattern=}, {jumps=}")
    t = 0  # text index where the current alignment of the pattern starts
    p = 0  # number of pattern characters matched at this alignment
    while t + p < len(text):
        if text[t + p] == pattern[p]:
            p += 1
            if p == len(pattern):
                return t
        else:
            # Mismatch at pattern position p: shift the alignment and resume
            # from the pattern position the table prescribes.
            jump = jumps[p]
            p = jump.pattern_index
            t += jump.text_delta
    return -1


def build_jumps(pattern: str) -> list[Jump]:
    """Build the mismatch jump table for ``pattern``.

    ``jumps[i]`` is the action to take when the first ``i`` pattern characters
    matched and ``pattern[i]`` did not. With ``b`` the length of the longest
    proper border of ``pattern[:i]`` (a prefix that is also a suffix), the
    alignment moves forward by ``i - b`` and comparison resumes at pattern
    index ``b``, because those ``b`` characters are already known to match.
    For ``i == 0`` nothing matched, so the alignment simply moves by one.

    The border lengths are computed incrementally (the classic KMP prefix
    function), so the table is built in time linear in ``len(pattern)``.

    Args:
        pattern: The pattern to preprocess.

    Returns:
        A list with one ``Jump`` per pattern position; empty for an empty
        pattern.
    """
    if not pattern:
        return []

    jumps = [Jump(text_delta=1, pattern_index=0)]
    border = 0  # length of the longest proper border of pattern[:i]
    for i in range(1, len(pattern)):
        jumps.append(Jump(text_delta=i - border, pattern_index=border))
        # Extend the border to cover pattern[:i + 1]. When pattern[i] does not
        # continue the current border, fall back to the next shorter one;
        # jumps[k].pattern_index is exactly the border length of pattern[:k].
        while border and pattern[i] != pattern[border]:
            border = jumps[border].pattern_index
        if pattern[i] == pattern[border]:
            border += 1
    return jumps

In [14]:
# Smoke tests: (text, pattern, expected index of the first match, or -1).
for haystack, needle, expected in (
    ("abc", "bc", 1),
    ("bc", "bc", 0),
    ("abcd", "bc", 1),
    ("abcd", "bk", -1),
):
    assert kmp_search(haystack, needle) == expected

pattern='bc', jumps=[Jump(text_delta=1, pattern_index=0), Jump(text_delta=1, pattern_index=0)]
pattern='bc', jumps=[Jump(text_delta=1, pattern_index=0), Jump(text_delta=1, pattern_index=0)]
pattern='bc', jumps=[Jump(text_delta=1, pattern_index=0), Jump(text_delta=1, pattern_index=0)]
pattern='bk', jumps=[Jump(text_delta=1, pattern_index=0), Jump(text_delta=1, pattern_index=0)]
