<a href="https://colab.research.google.com/github/byunsy/bioinformatics-algorithms-py/blob/main/BA_1J.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Frequent Words with Mismatches and Reverse Complements Problem

### Functions

In [1]:
def ReverseComplement(pattern):
    """Return the reverse complement of a DNA string.

    Each base is swapped with its pairing partner (A<->T, C<->G) and the
    resulting string is read back to front. Characters other than
    'A', 'C', 'G' and 'T' contribute nothing to the output.
    """
    pairing = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}

    # Complement in reading order; unknown characters map to the empty string
    complement = "".join(pairing.get(base, "") for base in pattern)

    # Flip the complemented strand
    return complement[::-1]

In [2]:
def HammingDistance(str1, str2):
    """Count the positions at which two equal-length strings differ.

    Raises:
        Exception: if the two strings do not have the same length.
    """
    if len(str1) == len(str2):
        # Walk both strings in lockstep and tally the differing positions
        return sum(1 for left, right in zip(str1, str2) if left != right)

    raise Exception("Error: Two strings must be of equal length.")

In [3]:
def ApproxPatternCount(pattern, text, d):
    """Count the windows of text that match pattern with at most d mismatches.

    Every substring of text with the same length as pattern is compared to
    pattern using HammingDistance; the return value is how many of those
    substrings are within distance d.
    """
    width = len(pattern)

    # All start positions at which a full-width window fits inside text
    windows = (text[start:start + width]
               for start in range(len(text) - width + 1))

    return sum(1 for window in windows
               if HammingDistance(pattern, window) <= d)

In [4]:
def Neighbors(pattern, d):
    """Return the list of strings within Hamming distance d of pattern.

    The neighborhood is built recursively: the d-neighborhood of the suffix
    (pattern without its first symbol) is computed first, and each suffix
    neighbor is then extended by one leading symbol.
    """
    bases = ['A', 'C', 'G', 'T']

    # Base cases: no mismatches allowed, or a single symbol left to vary
    if d == 0:
        return [pattern]
    if len(pattern) == 1:
        return bases

    head, tail = pattern[0], pattern[1:]
    result = []

    for candidate in Neighbors(tail, d):
        if HammingDistance(tail, candidate) >= d:
            # Mismatch budget already used up by the suffix: the first
            # symbol has to stay as it is
            result.append(head + candidate)
        else:
            # Budget left over: any leading base keeps us within d
            result.extend(base + candidate for base in bases)

    return result

In [5]:
def PatternToNumber(pattern):
    """Convert a DNA string to its index in lexicographic k-mer order.

    The pattern is read as a base-4 number with digits A=0, C=1, G=2, T=3,
    most significant symbol first. The empty pattern maps to 0.

    The conversion is iterative, so it does not copy the prefix at every
    step and is not bounded by the interpreter's recursion limit.

    Raises:
        KeyError: if pattern contains a symbol other than A, C, G or T.
    """
    symbol_num = {"A": 0, "C": 1, "G": 2, "T": 3}

    number = 0
    for symbol in pattern:
        # Shift the digits read so far one base-4 place and add the new one
        number = 4 * number + symbol_num[symbol]

    return number

In [6]:
def NumberToPattern(index, k):
    """Convert an index back to the k-mer it encodes.

    This is the inverse of the base-4 encoding A=0, C=1, G=2, T=3: the
    result is the k-symbol string whose digits, most significant first,
    spell index in base 4 (padded with leading 'A's).

    The conversion is iterative, so large k does not hit the recursion
    limit, and k < 1 is rejected up front instead of recursing without
    ever reaching a base case.

    Raises:
        ValueError: if k is less than 1.
        IndexError: if index does not fit in k symbols (index >= 4**k).
    """
    if k < 1:
        raise ValueError("k must be a positive integer.")

    symbol_num = "ACGT"

    # Peel off base-4 digits from least to most significant
    symbols = []
    for _ in range(k - 1):
        index, remainder = divmod(index, 4)
        symbols.append(symbol_num[remainder])

    # Whatever is left must be a single digit; larger values raise IndexError
    symbols.append(symbol_num[index])

    return "".join(reversed(symbols))

In [7]:
def ComputingFrequencies(text, k):
    """Return the frequency array of all k-mers in text.

    The result has 4**k entries; entry j holds the number of times the
    k-mer with index j (as given by PatternToNumber) occurs in text.
    """
    # One counter per possible k-mer, all starting at zero
    counts = [0]*(4**k)

    # Slide a window of width k across text and bump its counter
    for start in range(len(text) - k + 1):
        counts[PatternToNumber(text[start:start+k])] += 1

    return counts

In [23]:
def FreqWordsMismatchRC(text, k, d):
    """Find the most frequent k-mers with mismatches and reverse complements.

    The score of a k-mer is the number of windows of text within Hamming
    distance d of the k-mer plus the number of windows within distance d of
    its reverse complement. All k-mers with the maximum score are returned
    in index order (the order defined by PatternToNumber).

    Scores are accumulated in one pass over the text: a window lies within
    distance d of a pattern exactly when the pattern is in the window's
    d-neighborhood, so each neighbor earns one count for itself and one for
    its reverse complement. This avoids rescanning the whole text for every
    candidate pattern.

    If text has no window of length k, every score is zero and all 4**k
    k-mers are returned.

    Raises:
        ValueError: if d is negative.
    """
    if d < 0:
        raise ValueError("d must be non-negative.")

    # frequency_array[i] is the score of the k-mer with index i
    frequency_array = [0]*(4**k)

    bound = len(text) - k + 1
    for i in range(bound):
        # Every k-mer with up to d mismatches from this window
        for pattern in Neighbors(text[i:i+k], d):
            # This window is an approximate occurrence of the neighbor...
            frequency_array[PatternToNumber(pattern)] += 1

            # ...and counts towards the pattern whose reverse complement
            # the neighbor is
            rc_pattern = ReverseComplement(pattern)
            frequency_array[PatternToNumber(rc_pattern)] += 1

    # Collect every pattern that reaches the highest score
    max_count = max(frequency_array)
    return [NumberToPattern(i, k)
            for i, count in enumerate(frequency_array)
            if count == max_count]

### Test Cases

In [19]:
# Create a function for test suite
def TestSuite(function, cases):
    """Run function against each case and print a pass/fail report.

    Each case is a (text, k, d, answer) tuple. function is called as
    function(text, k, d) and its result is compared to answer without
    regard to order.
    """
    banner = "*"*50
    print(banner)
    print("TEST SUITE\n")

    passed = 0
    for number, (text, k, d, answer) in enumerate(cases, start=1):
        result = function(text, k, d)

        # Order of the returned patterns is irrelevant
        if sorted(result) != sorted(answer):
            print("- Test Case {} Failed. Expected: {}, Actual: {}"
                  .format(number, answer, result))
            continue

        print("- Test Case {} Passed. Expected: {}, Actual: {}"
              .format(number, answer, result))
        passed += 1

    print("\n{} out of {} passed.".format(passed, len(cases)), end=" ")
    print("END OF TEST SUITE.")
    print(banner)

In [22]:
# Create test cases to pass into test suite
# Each case is a (text, k, d, expected_patterns) tuple, in the order that
# TestSuite unpacks it before calling function(text, k, d).
case1 = ("ACGTTGCATGTCGCATGATGCATGAGAGCT", 4, 1, ["ACAT", "ATGT"])
# Text made of a single repeated base
case2 = ("AAAAAAAAAA", 2, 1, ["AT", "TA"])
case3 = ("AGTCAGTC", 4, 2, ["AATT", "GGCC"]) 
# d = 0: only exact occurrences count
case4 = ("AATTAATTGGTAGGTAGGTA", 4, 0, ["AATT"])
# len(text) == k: the text contains exactly one window
case5 = ("ATA", 3, 1, ["AAA", "AAT", "ACA", "AGA", "ATA", "ATC", "ATG", "ATT", "CAT", "CTA", "GAT", "GTA", "TAA", "TAC", "TAG", "TAT", "TCT", "TGT", "TTA", "TTT"])
# Single window with d = 0
case6 = ("AAT", 3, 0, ["AAT", "ATT"])
case7 = ("TAGCG", 2, 1, ["CA", "CC", "GG", "TG"])
# Longer text with k = 9 and d = 3
case8 = ("CTTGCCGGCGCCGATTATACGATCGCGGCCGCTTGCCTTCTTTATAATGCATCGGCGCCGCGATCTTGCTATATACGTACGCTTCGCTTGCATCTTGCGCGCATTACGTACTTATCGATTACTTATCTTCGATGCCGGCCGGCATATGCCGCTTTAGCATCGATCGATCGTACTTTACGCGTATAGCCGCTTCGCTTGCCGTACGCGATGCTAGCATATGCTAGCGCTAATTACTTAT", 9, 3,
         ["AGCGCCGCT", "AGCGGCGCT"])

cases = [case1, case2, case3, case4, case5, case6, case7, case8]

# Run every case through FreqWordsMismatchRC and print the report
TestSuite(FreqWordsMismatchRC, cases)

**************************************************
TEST SUITE

- Test Case 1 Passed. Expected: ['ACAT', 'ATGT'], Actual: ['ACAT', 'ATGT']
- Test Case 2 Passed. Expected: ['AT', 'TA'], Actual: ['AT', 'TA']
- Test Case 3 Passed. Expected: ['AATT', 'GGCC'], Actual: ['AATT', 'GGCC']
- Test Case 4 Passed. Expected: ['AATT'], Actual: ['AATT']
- Test Case 5 Passed. Expected: ['AAA', 'AAT', 'ACA', 'AGA', 'ATA', 'ATC', 'ATG', 'ATT', 'CAT', 'CTA', 'GAT', 'GTA', 'TAA', 'TAC', 'TAG', 'TAT', 'TCT', 'TGT', 'TTA', 'TTT'], Actual: ['AAA', 'AAT', 'ACA', 'AGA', 'ATA', 'ATC', 'ATG', 'ATT', 'CAT', 'CTA', 'GAT', 'GTA', 'TAA', 'TAC', 'TAG', 'TAT', 'TCT', 'TGT', 'TTA', 'TTT']
- Test Case 6 Passed. Expected: ['AAT', 'ATT'], Actual: ['AAT', 'ATT']
- Test Case 7 Passed. Expected: ['CA', 'CC', 'GG', 'TG'], Actual: ['CA', 'CC', 'GG', 'TG']
- Test Case 8 Passed. Expected: ['AGCGCCGCT', 'AGCGGCGCT'], Actual: ['AGCGCCGCT', 'AGCGGCGCT']

8 out of 8 passed. END OF TEST SUITE.
**************************************************