<a href="https://colab.research.google.com/github/byunsy/bioinformatics-algorithms-py/blob/main/BA_2A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implanted Motif Problem

### Functions

In [None]:
def HammingDistance(str1, str2):
    """Return the number of positions at which two equal-length strings differ.

    Raises:
        Exception: if the two strings do not have the same length.
    """
    if len(str1) != len(str2):
        raise Exception("Error: Two strings must be of equal length.")

    # Walk both strings in lockstep; every unequal pair is one mismatch.
    return sum(1 for left, right in zip(str1, str2) if left != right)

In [None]:
def Neighbors(pattern, d):
    """Return every DNA string within Hamming distance d of pattern.

    Args:
        pattern: string over the alphabet A, C, G, T.
        d: maximum number of mismatches allowed (expected to be
            non-negative).

    Returns:
        List of strings, each the same length as pattern, whose Hamming
        distance from pattern is at most d. An empty pattern yields
        [pattern].
    """
    # With no mismatches allowed, or nothing to mutate, the only neighbor
    # is the pattern itself. The emptiness check also keeps the code below
    # from indexing pattern[0] on an empty string.
    if d == 0 or not pattern:
        return [pattern]

    nucleotides = ['A', 'C', 'G', 'T']
    if len(pattern) == 1:
        return nucleotides

    first, suffix = pattern[0], pattern[1:]
    neighborhood = []

    for text in Neighbors(suffix, d):
        if HammingDistance(suffix, text) < d:
            # Mismatch budget left over: the first position may be any base.
            neighborhood.extend(base + text for base in nucleotides)
        else:
            # The suffix already uses all d mismatches: keep the original
            # first base.
            neighborhood.append(first + text)

    return neighborhood

In [None]:
def ApproxPatternCount(pattern, text, d):
    """Count the windows of text that differ from pattern in at most d places.

    Every start position that leaves room for a full-length window is
    examined, so overlapping occurrences are each counted.
    """
    width = len(pattern)
    last_start = len(text) - width
    return sum(
        1
        for start in range(last_start + 1)
        if HammingDistance(pattern, text[start:start + width]) <= d
    )

In [None]:
def MotifEnumeration(dna, k, d):
    """Find all (k, d)-motifs shared by every string in dna.

    A k-mer is reported when each string in dna contains at least one
    window of length k within Hamming distance d of it.

    Args:
        dna: list of DNA strings; they may differ in length.
        k: motif length.
        d: maximum number of mismatches allowed per string.

    Returns:
        Sorted list of the distinct motifs; empty when dna is empty or no
        k-mer qualifies.
    """
    if not dna:
        return []

    # A motif must lie within d of some k-mer of *every* string, including
    # the first one, so the neighborhoods of the first string's k-mers
    # already contain every possible motif. Taking windows from a single
    # string, bounded by that string's own length, also avoids the
    # truncated (shorter than k) slices a shared bound would produce on
    # shorter strings.
    first = dna[0]
    candidates = set()
    for i in range(len(first) - k + 1):
        candidates.update(Neighbors(first[i:i+k], d))

    # Keep only the candidates that approximately occur in all strings.
    return sorted(
        candidate for candidate in candidates
        if all(ApproxPatternCount(candidate, s, d) > 0 for s in dna)
    )

### Test Cases

In [None]:
# Test harness: runs a function over a list of cases and prints a report
def TestSuite(function, cases):
    """Run function on each case and print one pass/fail line per case.

    Each case is a (k, d, dna, answer) tuple. The function is called as
    function(dna, k, d) and its result is compared with answer after
    sorting both, so element order does not matter.
    """
    banner = "*"*50
    print(banner)
    print("TEST SUITE\n")
    passed = 0
    for number, (k, d, dna, answer) in enumerate(cases, start=1):
        result = function(dna, k, d)
        if sorted(result) != sorted(answer):
            print("- Test Case {} Failed. Expected: {}, Actual: {}"
                  .format(number, answer, result))
            continue
        print("- Test Case {} Passed. Expected: {}, Actual: {}"
              .format(number, answer, result))
        passed += 1
    print("\n{} out of {} passed.".format(passed, len(cases)), end=" ")
    print("END OF TEST SUITE.")
    print(banner)

In [None]:
# Create test cases to pass into test suite
# Each case is a (k, d, dna, answer) tuple, the layout TestSuite unpacks.
# case1: four distinct strings with one mismatch allowed
case1 = (3, 1, ["ATTTGGC", "TGCCTTA", "CGGTATC", "GAAAATT"], ["ATA", "ATT", "GTT", "TTT"])
# case2: identical strings with exact matching (d = 0)
case2 = (3, 0, ["ACGT", "ACGT", "ACGT"], ["ACG", "CGT"])
# case3: identical all-A strings with d = 1; the expected motifs are AAA
# plus each of its single-substitution variants
case3 = (3, 1, ["AAAAA", "AAAAA", "AAAAA"], ["AAA", "AAC", "AAG", "AAT", "ACA", "AGA", "ATA", "CAA", "GAA", "TAA"]) 
# case4: d equals k, so all 64 possible 3-mers are expected
case4 = (3, 3, ["AAAAA", "AAAAA", "AAAAA"], ["AAA", "AAC", "AAG", "AAT", "ACA", "ACC", "ACG", "ACT", "AGA", "AGC", "AGG", "AGT", "ATA", "ATC", "ATG", "ATT", "CAA", "CAC", "CAG", "CAT", "CCA", "CCC", "CCG", "CCT", "CGA", "CGC", "CGG", "CGT", "CTA", "CTC", "CTG", "CTT", "GAA", "GAC", "GAG", "GAT", "GCA", "GCC", "GCG", "GCT", "GGA", "GGC", "GGG", "GGT", "GTA", "GTC", "GTG", "GTT", "TAA", "TAC", "TAG", "TAT", "TCA", "TCC", "TCG", "TCT", "TGA", "TGC", "TGG", "TGT", "TTA", "TTC", "TTG", "TTT"]) 
# case5 / case6: with d = 0, a single differing string (last or first in
# the list) leaves no 3-mer shared by all strings, so nothing is expected
case5 = (3, 0, ["AAAAA", "AAAAA", "AACAA"], [])
case6 = (3, 0, ["AACAA", "AAAAA", "AAAAA"], [])

cases = [case1, case2, case3, case4, case5, case6]

# Run the whole suite against MotifEnumeration; the report is printed
TestSuite(MotifEnumeration, cases)

**************************************************
TEST SUITE

- Test Case 1 Passed. Expected: ['ATA', 'ATT', 'GTT', 'TTT'], Actual: ['TTT', 'ATA', 'ATT', 'GTT']
- Test Case 2 Passed. Expected: ['ACG', 'CGT'], Actual: ['ACG', 'CGT']
- Test Case 3 Passed. Expected: ['AAA', 'AAC', 'AAG', 'AAT', 'ACA', 'AGA', 'ATA', 'CAA', 'GAA', 'TAA'], Actual: ['CAA', 'GAA', 'AAG', 'ACA', 'TAA', 'AAT', 'AAA', 'ATA', 'AGA', 'AAC']
- Test Case 4 Passed. Expected: ['AAA', 'AAC', 'AAG', 'AAT', 'ACA', 'ACC', 'ACG', 'ACT', 'AGA', 'AGC', 'AGG', 'AGT', 'ATA', 'ATC', 'ATG', 'ATT', 'CAA', 'CAC', 'CAG', 'CAT', 'CCA', 'CCC', 'CCG', 'CCT', 'CGA', 'CGC', 'CGG', 'CGT', 'CTA', 'CTC', 'CTG', 'CTT', 'GAA', 'GAC', 'GAG', 'GAT', 'GCA', 'GCC', 'GCG', 'GCT', 'GGA', 'GGC', 'GGG', 'GGT', 'GTA', 'GTC', 'GTG', 'GTT', 'TAA', 'TAC', 'TAG', 'TAT', 'TCA', 'TCC', 'TCG', 'TCT', 'TGA', 'TGC', 'TGG', 'TGT', 'TTA', 'TTC', 'TTG', 'TTT'], Actual: ['CAG', 'CGA', 'TCC', 'TTT', 'CAT', 'AGC', 'TAT', 'AAA', 'TAG', 'CTG', 'ACG', 'CCT', 'TTC', 'A