## Here we shall implement an approximate matching algorithm using the pigeonhole principle.
We will now import the functions we need from our other Python modules.

In [11]:
from Boyer_Moore import *
from optimized_boyer_moore import *

In [12]:
def _within_mismatch_budget(p, t, offset, skip_start, skip_end, budget):
    """Return True if p aligned at t[offset:] has at most `budget` mismatches.

    Positions p[skip_start:skip_end] are not compared; the caller has already
    established that this slice matches the text exactly.
    """
    mismatches = 0
    for lo, hi in ((0, skip_start), (skip_end, len(p))):
        for j in range(lo, hi):
            if p[j] != t[offset + j]:
                mismatches += 1
                # Stop as soon as the budget is exceeded.
                if mismatches > budget:
                    return False
    return True


def optimized_approximate_match(p, t, n):
    """
    Find all offsets in t where p matches with at most n mismatches.

    The pattern p is split into n+1 contiguous, non-empty segments that together
    cover the whole pattern. By the pigeonhole principle, any alignment with at
    most n mismatches must contain at least one segment that matches exactly, so
    each segment is searched in t with Boyer-Moore and every hit is verified by
    comparing the rest of the pattern against the text.

    Args:
        p (str): The pattern string.
        t (str): The text string.
        n (int): The maximum allowed number of mismatches.

    Returns:
        list: Offsets in t where an approximate match of p is found, in
        ascending order with no duplicates. Empty if n is negative or p is
        longer than t. If len(p) <= n, every in-bounds offset is returned,
        because no alignment can exceed the mismatch budget.
    """
    p_len = len(p)
    t_len = len(t)

    # A negative budget can never be satisfied, and a pattern longer than the
    # text has no valid alignment.
    if n < 0 or p_len > t_len:
        return []

    # With len(p) <= n the pattern cannot be cut into n+1 non-empty segments,
    # and every alignment trivially has at most len(p) <= n mismatches.
    if p_len <= n:
        return list(range(t_len - p_len + 1))

    # Spread the pattern evenly over n+1 segments: the first `extra` segments
    # get one additional character. Since p_len > n, base_len >= 1, so no
    # segment is empty and no segment starts past the end of the pattern.
    base_len, extra = divmod(p_len, n + 1)
    all_matches = set()

    start = 0
    for i in range(n + 1):
        end = start + base_len + (1 if i < extra else 0)
        segment = p[start:end]
        # Build Boyer-Moore index for the current segment.
        # NOTE(review): the index is built with alphabet 'ACGT'; presumably p
        # and t must consist of these characters only -- confirm against
        # OptimizedBoyerMoore.
        p_bm = OptimizedBoyerMoore(segment, alphabet='ACGT')
        # optimized_boyer_moore is assumed to return the offsets in t at which
        # `segment` occurs exactly.
        for m in optimized_boyer_moore(segment, p_bm, t):
            offset = m - start
            # Skip alignments that fall outside t, and alignments that were
            # already accepted via an earlier segment.
            if offset < 0 or offset + p_len > t_len or offset in all_matches:
                continue
            if _within_mismatch_budget(p, t, offset, start, end, n):
                all_matches.add(offset)
        start = end

    # Sort so the result order does not depend on set iteration order.
    return sorted(all_matches)


## Check if this works

In [13]:
# Sanity check: search the text for the pattern, tolerating up to 2 mismatches,
# and print the offsets that were found.
pattern, text = 'AACTTG', 'CACTTAATTTG'
print(optimized_approximate_match(p=pattern, t=text, n=2))

[0, 5]
