<a href="https://colab.research.google.com/github/byunsy/bioinformatics-algorithms-py/blob/main/BA_1I.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Frequent Words with Mismatches Problem

### Functions

In [None]:
def HammingDistance(str1, str2):
    """Return the number of positions at which str1 and str2 differ.

    Args:
        str1: First string.
        str2: Second string; must have the same length as str1.

    Returns:
        The count of indices i where str1[i] != str2[i].

    Raises:
        Exception: If the two strings have different lengths.
    """
    if len(str1) != len(str2):
        raise Exception("Error: Two strings must be of equal length.")

    # Walk both strings in lockstep and tally the differing positions.
    return sum(1 for left, right in zip(str1, str2) if left != right)

In [None]:
def ApproxPatternCount(pattern, text, d):
    """Count the windows of text that match pattern with at most d mismatches.

    Every window of len(pattern) consecutive characters of text is compared
    against pattern with HammingDistance; overlapping windows are all counted.

    Args:
        pattern: The string to search for.
        text: The string to search in.
        d: Maximum number of mismatching positions allowed per window.

    Returns:
        The number of windows whose Hamming distance to pattern is <= d.
        This is 0 when pattern is longer than text.
    """
    width = len(pattern)
    last_start = len(text) - width
    return sum(
        1
        for start in range(last_start + 1)
        if HammingDistance(pattern, text[start:start + width]) <= d
    )

In [None]:
def Neighbors(pattern, d):
    """Build the list of strings derived from pattern with up to d mismatches.

    The list is built recursively from the neighbors of pattern[1:].

    Args:
        pattern: The source string (symbols from A, C, G, T).
        d: Maximum number of mismatches allowed.

    Returns:
        A list of strings of the same length as pattern. With d == 0 this is
        just [pattern]; for a single-symbol pattern (and d != 0) it is the
        four bases.
    """
    # Base cases: no mismatches allowed, or only one symbol left to vary.
    if d == 0:
        return [pattern]
    if len(pattern) == 1:
        return ['A', 'C', 'G', 'T']

    head, tail = pattern[0], pattern[1:]
    result = []

    for candidate in Neighbors(tail, d):
        if HammingDistance(tail, candidate) < d:
            # The tail used fewer than d mismatches, so the leading symbol
            # is free to be any base.
            result.extend(base + candidate for base in ['A', 'C', 'G', 'T'])
        else:
            # The mismatch budget is spent; the leading symbol must be kept.
            result.append(head + candidate)

    return result

In [None]:
def PatternToNumber(pattern):
    """Convert a DNA string into its base-4 integer index.

    The pattern is read as a base-4 number, most significant symbol first,
    with the digit values A=0, C=1, G=2, T=3.

    Args:
        pattern: A string over the symbols A, C, G, T (may be empty).

    Returns:
        The integer index of the pattern; 0 for the empty string.

    Raises:
        KeyError: If pattern contains a symbol other than A, C, G or T.
    """
    symbol_num = {"A": 0, "C": 1, "G": 2, "T": 3}

    # Evaluate left to right (Horner's rule) instead of recursing on
    # pattern[:-1]: this avoids a RecursionError on patterns longer than the
    # interpreter's recursion limit and the repeated prefix copies.
    number = 0
    for symbol in pattern:
        number = 4 * number + symbol_num[symbol]
    return number

In [None]:
def NumberToPattern(index, k):
    """Convert a base-4 integer index back into its DNA string of length k.

    This is the inverse of the A=0, C=1, G=2, T=3 base-4 encoding: the result
    is the k-symbol string whose encoding equals index, left-padded with 'A'
    (digit 0) as needed.

    Args:
        index: Integer in the range 0 <= index < 4**k.
        k: Length of the pattern to produce (k >= 0).

    Returns:
        The pattern of length k corresponding to index; "" when k == 0.

    Raises:
        ValueError: If k is negative.
        IndexError: If index is outside 0 .. 4**k - 1.
    """
    symbol_num = "ACGT"

    if k < 0:
        raise ValueError("k must be non-negative.")
    # Reject out-of-range indices up front. A negative index would otherwise
    # wrap around silently through negative string indexing.
    if not 0 <= index < 4 ** k:
        raise IndexError("index out of range for patterns of length k")

    # Peel off base-4 digits from least to most significant, then reverse.
    # Iteration (rather than recursion on k-1) also makes k == 0 and very
    # large k work.
    symbols = []
    for _ in range(k):
        index, remainder = divmod(index, 4)
        symbols.append(symbol_num[remainder])
    return "".join(reversed(symbols))

In [None]:
def ComputingFrequencies(text, k):
    """Count exact occurrences of every k-mer in text.

    Args:
        text: The string to scan (symbols from A, C, G, T).
        k: Length of the k-mers to count.

    Returns:
        A list of length 4**k in which entry j is the number of windows of
        text whose PatternToNumber index is j.
    """
    # One slot per possible k-mer, all starting at zero.
    counts = [0] * (4**k)

    # Slide a window of width k across text and bump the slot of each k-mer.
    for start in range(len(text) - k + 1):
        slot = PatternToNumber(text[start:start + k])
        counts[slot] += 1

    return counts

In [None]:
def FreqWordsMismatch(text, k, d):
    """Find the most frequent k-mers in text, allowing up to d mismatches.

    A k-mer "occurs" at a position of text when the window of length k that
    starts there differs from it in at most d positions. The k-mer itself
    does not need to appear exactly anywhere in text.

    Args:
        text: The string to scan (symbols from A, C, G, T).
        k: Length of the k-mers to consider.
        d: Maximum number of mismatches allowed per occurrence.

    Returns:
        A list of all k-mers that attain the highest approximate occurrence
        count, in A < C < G < T lexicographic order. The list is empty when
        text contains no window of length k.
    """
    # Tally, for every k-mer that is within d mismatches of some window, the
    # number of windows it is close to. A window is within d mismatches of a
    # pattern exactly when the pattern is in that window's neighborhood, so
    # this equals the approximate occurrence count. Keying a dict by pattern
    # keeps memory proportional to the candidates actually seen, not 4**k.
    counts = {}
    for start in range(len(text) - k + 1):
        # Get neighbors (k-mers with up to d mismatches); set() guarantees
        # that one window contributes at most once per pattern.
        for pattern in set(Neighbors(text[start:start + k], d)):
            counts[pattern] = counts.get(pattern, 0) + 1

    # No window of length k: nothing occurs, so nothing is "most frequent".
    if not counts:
        return []

    # Keep the patterns with the highest number of occurrences. Plain string
    # sorting reproduces base-4 index order because 'A' < 'C' < 'G' < 'T'.
    max_count = max(counts.values())
    return sorted(
        pattern for pattern, count in counts.items() if count == max_count
    )

### Test Cases

In [None]:
# Create a function for test suite
def TestSuite(function, cases):
    """Run function against each case and print a pass/fail report.

    Args:
        function: Callable invoked as function(text, k, d); its result is
            compared with the expected answer after sorting both.
        cases: Iterable of (text, k, d, answer) tuples.

    Returns:
        None. The report is written to standard output.
    """
    banner = "*"*50
    print(banner)
    print("TEST SUITE\n")

    passed = 0
    for number, (text, k, d, answer) in enumerate(cases, start=1):
        result = function(text, k, d)

        # Order of the returned patterns is irrelevant, so compare sorted.
        if sorted(result) != sorted(answer):
            print("- Test Case {} Failed. Expected: {}, Actual: {}"
                  .format(number, answer, result))
            continue

        print("- Test Case {} Passed. Expected: {}, Actual: {}"
              .format(number, answer, result))
        passed += 1

    print("\n{} out of {} passed.".format(passed, len(cases)), end=" ")
    print("END OF TEST SUITE.")
    print(banner)

In [None]:
# Create test cases to pass into test suite.
# Each case is a (text, k, d, expected_patterns) tuple, in the order in which
# TestSuite unpacks it. Expected lists are compared after sorting, so their
# order here does not matter.
case1 = ("ACGTTGCATGTCGCATGATGCATGAGAGCT", 4, 1, ["ATGC", "ATGT", "GATG"])
case2 = ("AAAAAAAAAA", 2, 1, ["AA", "AC", "AG", "CA", "AT", "GA", "TA"])
case3 = ("AGTCAGTC", 4, 2, ["TCTC", "CGGC", "AAGC", "TGTG", "GGCC", "AGGT", "ATCC", "ACTG", "ACAC", "AGAG", "ATTA", "TGAC", "AATT", "CGTT", "GTTC", "GGTA", "AGCA", "CATC"]) 
# d = 0: only exact occurrences count.
case4 = ("AATTAATTGGTAGGTAGGTA", 4, 0, ["GGTA"])
# k equals the text length, so there is a single window.
case5 = ("ATA", 3, 1, ["GTA", "ACA", "AAA", "ATC", "ATA", "AGA", "ATT", "CTA", "TTA", "ATG"])
# Single window and d = 0.
case6 = ("AAT", 3, 0, ["AAT"])
case7 = ("TAGCG", 2, 1, ["GG", "TG"])
# Longer input with k = 10 and d = 2.
case8 = ("CACAGTAGGCGCCGGCACACACAGCCCCGGGCCCCGGGCCGCCCCGGGCCGGCGGCCGCCGGCGCCGGCACACCGGCACAGCCGTACCGGCACAGTAGTACCGGCCGGCCGGCACACCGGCACACCGGGTACACACCGGGGCGCACACACAGGCGGGCGCCGGGCCCCGGGCCGTACCGGGCCGCCGGCGGCCCACAGGCGCCGGCACAGTACCGGCACACACAGTAGCCCACACACAGGCGGGCGGTAGCCGGCGCACACACACACAGTAGGCGCACAGCCGCCCACACACACCGGCCGGCCGGCACAGGCGGGCGGGCGCACACACACCGGCACAGTAGTAGGCGGCCGGCGCACAGCC", 10, 2,
         ["GCACACAGAC", "GCGCACACAC"])

cases = [case1, case2, case3, case4, case5, case6, case7, case8]

# Run every case through FreqWordsMismatch and print the pass/fail report.
TestSuite(FreqWordsMismatch, cases)

**************************************************
TEST SUITE

- Test Case 1 Passed. Expected: ['ATGC', 'ATGT', 'GATG'], Actual: ['ATGC', 'ATGT', 'GATG']
- Test Case 2 Passed. Expected: ['AA', 'AC', 'AG', 'CA', 'AT', 'GA', 'TA'], Actual: ['AA', 'AC', 'AG', 'AT', 'CA', 'GA', 'TA']
- Test Case 3 Passed. Expected: ['TCTC', 'CGGC', 'AAGC', 'TGTG', 'GGCC', 'AGGT', 'ATCC', 'ACTG', 'ACAC', 'AGAG', 'ATTA', 'TGAC', 'AATT', 'CGTT', 'GTTC', 'GGTA', 'AGCA', 'CATC'], Actual: ['AAGC', 'AATT', 'ACAC', 'ACTG', 'AGAG', 'AGCA', 'AGGT', 'ATCC', 'ATTA', 'CATC', 'CGGC', 'CGTT', 'GGCC', 'GGTA', 'GTTC', 'TCTC', 'TGAC', 'TGTG']
- Test Case 4 Passed. Expected: ['GGTA'], Actual: ['GGTA']
- Test Case 5 Passed. Expected: ['GTA', 'ACA', 'AAA', 'ATC', 'ATA', 'AGA', 'ATT', 'CTA', 'TTA', 'ATG'], Actual: ['AAA', 'ACA', 'AGA', 'ATA', 'ATC', 'ATG', 'ATT', 'CTA', 'GTA', 'TTA']
- Test Case 6 Passed. Expected: ['AAT'], Actual: ['AAT']
- Test Case 7 Passed. Expected: ['GG', 'TG'], Actual: ['GG', 'TG']
- Test Case 8 Passed. 