# Indexing

In [2]:
import bisect
import sys
import time

In [3]:
class Index(object):
    def __init__(self, t, k):
        ''' Create index from all substrings of size 'length' '''
        self.k = k  # k-mer length (k)
        self.index = []
        for i in range(len(t) - k + 1):  # for each k-mer
            self.index.append((t[i:i+k], i))  # add (k-mer, offset) pair
        self.index.sort()  # alphabetize by k-mer
    
    def query(self, p):
        ''' Return index hits for first k-mer of P '''
        kmer = p[:self.k]  # query with first k-mer
        i = bisect.bisect_left(self.index, (kmer, -1))  # binary search
        hits = []
        while i < len(self.index):  # collect matching index entries
            if self.index[i][0] != kmer:
                break
            hits.append(self.index[i][1])
            i += 1
        return hits


class OptimizedIndex:
    def __init__(self, t, k):
        """
        Create an index from all substrings of length k in the given text t.
        
        Args:
            t (str): The text (e.g., a genome) to index.
            k (int): The length of each substring (k-mer) to extract.
        """
        self.k = k
        # Build and sort the list of (k-mer, offset) pairs using a list comprehension.
        self.index = sorted((t[i:i+k], i) for i in range(len(t) - k + 1))

    def opt_query(self, p):
        """
        Return index hits for the first k-mer of the pattern p using binary search.
        
        Args:
            p (str): The pattern string for which to query the index.
        
        Returns:
            list: A list of starting offsets in the text where the first k-mer of p occurs.
        """
        kmer = p[:self.k]
        # Use bisect_left and bisect_right to find the range of matching k-mers.
        left = bisect.bisect_left(self.index, (kmer, -1))
        right = bisect.bisect_right(self.index, (kmer, float('inf')))
        return [offset for _, offset in self.index[left:right]]



In [4]:
def queryIndex(p, t, index):
    k = index.k
    offsets = []
    for i in index.query(p):
        if p[k:] == t[i+k:i+len(p)]:  # verify that rest of P matches
            offsets.append(i)
    return offsets

def optimized_query_index(p, t, index):
    """
    Query the index for a pattern p in text t and verify full pattern matches.

    For each candidate offset returned by index.query(p), this function checks whether
    the remaining portion of the pattern (after the first k-mer) exactly matches the
    corresponding slice in t.

    Args:
        p (str): The pattern to search for.
        t (str): The text in which to search.
        index: An index object with attribute 'k' (k-mer length) and method query() that
               returns candidate offsets for the first k-mer of p.

    Returns:
        list: A list of offsets where the full pattern p matches in t.
    """
    k = index.k
    pattern_suffix = p[k:]
    p_len = len(p)
    return [i for i in index.query(p) if pattern_suffix == t[i+k:i+p_len]]




In [5]:
t = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'
p = 'GGTATTCGGGA'


start = time.perf_counter()
index = Index(t, 4)
print(queryIndex(p, t, index))
end = time.perf_counter()
print(f"Time taken: {(end - start) * 1e6:.3f} microseconds")

start = time.perf_counter()
index = Index(t, 4)
print(optimized_query_index(p, t, index))
end = time.perf_counter()
print(f"Time taken: {(end - start) * 1e6:.3f} microseconds")


[21, 68]
Time taken: 331.458 microseconds
[21, 68]
Time taken: 97.042 microseconds
