<a href="https://colab.research.google.com/github/byunsy/bioinformatics-algorithms-py/blob/main/BA_2D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Greedy Motif Search Problem

### Functions

In [1]:
def ProfileMostProbable(text, k, profile):
    """Return the k-mer of ``text`` with the highest probability under ``profile``.

    Args:
        text: String over the alphabet "ACGT" to scan.
        k: Length of the windows to evaluate.
        profile: Matrix indexed as ``profile[row][column]`` where the rows
            follow the order A, C, G, T and the columns are k-mer positions.

    Returns:
        The first window whose probability is strictly greater than every
        window before it. If no window has a probability above zero (or
        ``text`` is shorter than ``k``), ``text[:k]`` is returned.

    Raises:
        ValueError: If a window contains a character other than A, C, G or T.
    """
    alphabet = "ACGT"
    best_kmer = text[:k]
    best_score = 0

    for start in range(len(text) - k + 1):
        window = text[start:start + k]

        # Probability of the window = product of its per-position entries.
        score = 1
        for column, base in enumerate(window):
            row = alphabet.index(base)
            score *= profile[row][column]

        # Strict comparison keeps the earliest window when scores tie.
        if score > best_score:
            best_kmer, best_score = window, score

    return best_kmer

In [2]:
def CountMotif(motifs, k):
    """Count how often each nucleotide occurs at each position of ``motifs``.

    Args:
        motifs: Iterable of strings over the alphabet "ACGT", each at most
            ``k`` characters long.
        k: Number of columns in the resulting matrix.

    Returns:
        A list of four rows (in the order A, C, G, T), each a list of ``k``
        integer counts.

    Raises:
        ValueError: If a motif contains a character other than A, C, G or T.
        IndexError: If a motif is longer than ``k``.
    """
    alphabet = "ACGT"
    # One independent row per base; a comprehension avoids aliasing the rows.
    counts = [[0] * k for _ in alphabet]

    for motif in motifs:
        for column, base in enumerate(motif):
            counts[alphabet.index(base)][column] += 1

    return counts

In [3]:
def ProfileMotif(motifs, k):
    """Build the frequency profile of ``motifs``.

    Every count produced by ``CountMotif(motifs, k)`` is divided by the
    number of motifs.

    Args:
        motifs: Sequence of motif strings.
        k: Number of columns passed through to ``CountMotif``.

    Returns:
        A list of rows, one per row returned by ``CountMotif``, holding the
        counts divided by ``len(motifs)``.
    """
    count_rows = CountMotif(motifs, k)
    motif_total = len(motifs)

    profile = []
    for base_counts in count_rows:
        profile.append([count / motif_total for count in base_counts])
    return profile

In [4]:
def ScoreMotif(motifs):
    """Return the score of ``motifs``.

    For every position of the first motif, the characters of all motifs at
    that position are collected and the number of entries that differ from
    the most frequent of A, C, G and T is added to the score.

    Args:
        motifs: Non-empty sequence of strings, each at least as long as the
            first one.

    Returns:
        The integer sum of the per-position mismatch counts.
    """
    total = 0
    for position, _ in enumerate(motifs[0]):
        column = [motif[position] for motif in motifs]
        # Only the four nucleotides compete for "most frequent".
        most_common = max(column.count(base) for base in "ACGT")
        total += len(column) - most_common

    return total

In [5]:
def GreedyMotifSearch(dna, k, t):
    """Greedily pick one k-mer per DNA string to form a low-scoring motif set.

    Each k-mer of ``dna[0]`` is tried as a seed. For every seed, the motif
    set is grown one string at a time: the profile of the motifs chosen so
    far is built with ``ProfileMotif`` and the k-mer of the next string is
    selected with ``ProfileMostProbable``. The candidate set with the lowest
    ``ScoreMotif`` value wins; the earliest candidate is kept on ties.

    Args:
        dna: Sequence of DNA strings; ``dna[0]`` supplies the seed k-mers.
        k: Length of each motif.
        t: Number of strings from ``dna`` to use (indices ``0`` to ``t - 1``).

    Returns:
        A list of motifs. If no candidate scores lower than the initial
        guess, the first ``k`` characters of every string in ``dna`` are
        returned.
    """
    # Initial guess: the leading k-mer of every string.
    best_motifs = [string[:k] for string in dna]
    # The best score is cached instead of being recomputed on every
    # iteration. It is filled in lazily so that inputs with no k-mer window
    # in dna[0] never score the initial guess, as before.
    best_score = None

    for i in range(len(dna[0]) - k + 1):
        motifs = [dna[0][i:i+k]]
        for j in range(1, t):
            # The profile reflects only the motifs picked so far for this seed.
            profile = ProfileMotif(motifs, k)
            motifs.append(ProfileMostProbable(dna[j], k, profile))

        score = ScoreMotif(motifs)
        if best_score is None:
            best_score = ScoreMotif(best_motifs)
        if score < best_score:
            best_motifs, best_score = motifs, score

    return best_motifs

### Test Cases

In [6]:
# Create a function for test suite
def TestSuite(function, cases):
    """Run ``function`` against ``cases`` and print a pass/fail report.

    Args:
        function: Callable invoked as ``function(dna, k, t)``.
        cases: Sequence of ``(dna, k, t, answer)`` tuples; a case passes when
            the call result compares equal to ``answer``.

    Returns:
        None. The report is written to standard output.
    """
    banner = "*"*50
    print(banner)
    print("TEST SUITE\n")

    passed = 0
    for number, case in enumerate(cases, start=1):
        dna, k, t, answer = case
        result = function(dna, k, t)

        # Pick the report line first, then print once for either outcome.
        if result == answer:
            passed += 1
            template = "- Test Case {} Passed. Expected: {}, Actual: {}"
        else:
            template = "- Test Case {} Failed. Expected: {}, Actual: {}"
        print(template.format(number, answer, result))

    print("\n{} out of {} passed.".format(passed, len(cases)), end=" ")
    print("END OF TEST SUITE.")
    print(banner)

In [7]:
# Create test cases to pass into test suite
# Every case is laid out as (dna, k, t, answer), the order in which
# TestSuite unpacks it.
case1 = (
    [
        "GGCGTTCAGGCA",
        "AAGAATCAGTCA",
        "CAAGGAGTTCGC",
        "CACGTCAATCAC",
        "CAATAATATTCG",
    ],
    3,
    5,
    ["CAG", "CAG", "CAA", "CAA", "CAA"],
)

case2 = (
    [
        "GCCCAA",
        "GGCCTG",
        "AACCTA",
        "TTCCTT",
    ],
    3,
    4,
    ["GCC", "GCC", "AAC", "TTC"],
)

case3 = (
    [
        "GAGGCGCACATCATTATCGATAACGATTCGCCGCATTGCC",
        "TCATCGAATCCGATAACTGACACCTGCTCTGGCACCGCTC",
        "TCGGCGGTATAGCCAGAAAGCGTAGTGCCAATAATTTCCT",
        "GAGTCGTGGTGAAGTGTGGGTTATGGGGAAAGGCAGACTG",
        "GACGGCAACTACGGTTACAACGCAGCAACCGAAGAATATT",
        "TCTGTTGTTGCTAACACCGTTAAAGGCGGCGACGGCAACT",
        "AAGCGGCCAACGTAGGCGCGGCTTGGCATCTCGGTGTGTG",
        "AATTGAAAGGCGCATCTTACTCTTTTCGCTTTCAAAAAAA",
    ],
    5,
    8,
    ["GAGGC", "TCATC", "TCGGC", "GAGTC", "GCAGC", "GCGGC", "GCGGC", "GCATC"],
)

case4 = (
    [
        "GCAGGTTAATACCGCGGATCAGCTGAGAAACCGGAATGTGCGT",
        "CCTGCATGCCCGGTTTGAGGAACATCAGCGAAGAACTGTGCGT",
        "GCGCCAGTAACCCGTGCCAGTCAGGTTAATGGCAGTAACATTT",
        "AACCCGTGCCAGTCAGGTTAATGGCAGTAACATTTATGCCTTC",
        "ATGCCTTCCGCGCCAATTGTTCGTATCGTCGCCACTTCGAGTG",
    ],
    6,
    5,
    ["GTGCGT", "GTGCGT", "GCGCCA", "GTGCCA", "GCGCCA"],
)

case5 = (
    [
        "GACCTACGGTTACAACGCAGCAACCGAAGAATATTGGCAA",
        "TCATTATCGATAACGATTCGCCGGAGGCCATTGCCGCACA",
        "GGAGTCTGGTGAAGTGTGGGTTATGGGGCAGACTGGGAAA",
        "GAATCCGATAACTGACACCTGCTCTGGCACCGCTCTCATC",
        "AAGCGCGTAGGCGCGGCTTGGCATCTCGGTGTGTGGCCAA",
        "AATTGAAAGGCGCATCTTACTCTTTTCGCTTAAAATCAAA",
        "GGTATAGCCAGAAAGCGTAGTTAATTTCGGCTCCTGCCAA",
        "TCTGTTGTTGCTAACACCGTTAAAGGCGGCGACGGCAACT",
    ],
    5,
    8,
    ["GCAGC", "TCATT", "GGAGT", "TCATC", "GCATC", "GCATC", "GGTAT", "GCAAC"],
)

cases = [case1, case2, case3, case4, case5]

# Run the greedy search against every case and print the report.
TestSuite(GreedyMotifSearch, cases)

**************************************************
TEST SUITE

- Test Case 1 Passed. Expected: ['CAG', 'CAG', 'CAA', 'CAA', 'CAA'], Actual: ['CAG', 'CAG', 'CAA', 'CAA', 'CAA']
- Test Case 2 Passed. Expected: ['GCC', 'GCC', 'AAC', 'TTC'], Actual: ['GCC', 'GCC', 'AAC', 'TTC']
- Test Case 3 Passed. Expected: ['GAGGC', 'TCATC', 'TCGGC', 'GAGTC', 'GCAGC', 'GCGGC', 'GCGGC', 'GCATC'], Actual: ['GAGGC', 'TCATC', 'TCGGC', 'GAGTC', 'GCAGC', 'GCGGC', 'GCGGC', 'GCATC']
- Test Case 4 Passed. Expected: ['GTGCGT', 'GTGCGT', 'GCGCCA', 'GTGCCA', 'GCGCCA'], Actual: ['GTGCGT', 'GTGCGT', 'GCGCCA', 'GTGCCA', 'GCGCCA']
- Test Case 5 Passed. Expected: ['GCAGC', 'TCATT', 'GGAGT', 'TCATC', 'GCATC', 'GCATC', 'GGTAT', 'GCAAC'], Actual: ['GCAGC', 'TCATT', 'GGAGT', 'TCATC', 'GCATC', 'GCATC', 'GGTAT', 'GCAAC']

5 out of 5 passed. END OF TEST SUITE.
**************************************************
