<a href="https://colab.research.google.com/github/byunsy/bioinformatics-algorithms-py/blob/main/BA_2E.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Greedy Motif Search with Pseudocounts Problem

### Function

In [7]:
def ProfileMostProbable(text, k, profile):
    """Return the k-mer of ``text`` with the highest probability under ``profile``.

    Args:
        text: String over the alphabet "ACGT" to scan.
        k: Length of the k-mers (windows) to evaluate.
        profile: Four rows indexed in the order A, C, G, T; ``profile[r][c]``
            is the probability of nucleotide ``r`` at position ``c``.

    Returns:
        The first k-length window of ``text`` whose probability is strictly
        greater than that of every earlier window. If no window has a
        probability above 0 (or ``text`` has no full window), ``text[0:k]``
        is returned.

    Raises:
        ValueError: If a window contains a character other than A, C, G or T.
    """
    alphabet = "ACGT"
    best_kmer = text[:k]
    best_score = 0

    for start in range(len(text) - k + 1):
        window = text[start:start + k]

        # Probability of a window is the product of its per-column entries.
        score = 1
        for column, base in enumerate(window):
            score *= profile[alphabet.index(base)][column]

        # Strict comparison keeps the earliest window when scores tie.
        if score > best_score:
            best_kmer, best_score = window, score

    return best_kmer

To add pseudocounts, we replace the two helper functions:
- CountMotif() with CountMotifPseudo()
- ProfileMotif() with ProfileMotifPseudo()

In [8]:
def CountMotifPseudo(motifs, k):
    """Build a 4 x k nucleotide count matrix with a pseudocount of 1.

    Args:
        motifs: Iterable of strings over the alphabet "ACGT".
        k: Number of columns in the count matrix.

    Returns:
        A list of four rows (in the order A, C, G, T), each of length ``k``.
        Every entry starts at 1 and is incremented once for each occurrence
        of that nucleotide at that position in ``motifs``.

    Raises:
        ValueError: If a motif contains a character other than A, C, G or T.
        IndexError: If a motif is longer than ``k``.
    """
    alphabet = "ACGT"
    # Start every cell at 1 so that no nucleotide ends up with a zero count.
    counts = [[1] * k for _ in alphabet]

    for motif in motifs:
        for position, base in enumerate(motif):
            counts[alphabet.index(base)][position] += 1

    return counts

In [13]:
def ProfileMotifPseudo(motifs, k):
    """Convert the pseudocount matrix of ``motifs`` into a profile matrix.

    Args:
        motifs: Iterable of strings over the alphabet "ACGT".
        k: Number of columns, passed through to ``CountMotifPseudo``.

    Returns:
        A list of four rows (A, C, G, T) where each count has been divided
        by the total of the first column of the count matrix.
    """
    counts = CountMotifPseudo(motifs, k)

    # The first column's total is used as the divisor for every column;
    # this assumes all columns share the same total (i.e. every motif has
    # length k) -- confirm against callers.
    total = sum(row[0] for row in counts)

    return [[count / total for count in row] for row in counts]

In [10]:
def ScoreMotif(motifs):
    """Score a motif collection by its total number of non-majority letters.

    For each column (positions are taken from the length of the first
    motif), the score grows by the number of motifs minus the count of the
    most frequent nucleotide among A, C, G and T in that column.

    Args:
        motifs: Non-empty list of strings; every string must be at least as
            long as the first one.

    Returns:
        The summed per-column score as an integer (lower is better
        conserved).
    """
    width = len(motifs[0])
    total = 0

    for column_index in range(width):
        column = [motif[column_index] for motif in motifs]
        most_common = max(column.count(base) for base in "ACGT")
        total += len(column) - most_common

    return total

In [17]:
def GreedyMotifSearchPseudo(dna, k, t):
    """Greedy motif search using pseudocount-based profiles.

    For every k-mer of the first string, a motif collection is grown one
    string at a time: the profile (with pseudocounts) of the motifs chosen
    so far selects the profile-most-probable k-mer of the next string. The
    collection with the lowest ``ScoreMotif`` value is kept.

    Args:
        dna: List of DNA strings over the alphabet "ACGT".
        k: Length of the motifs to search for.
        t: Number of strings of ``dna`` used when growing each candidate
            collection (``dna[0]`` through ``dna[t-1]``).

    Returns:
        The best-scoring list of k-mers found. The initial candidate is the
        first k-mer of every string in ``dna``; it is only replaced by a
        strictly better-scoring collection, so earlier candidates win ties.
    """
    best_motifs = [string[:k] for string in dna]
    # The score of the current best collection is cached instead of being
    # recomputed on every iteration; it is evaluated lazily (on first use)
    # so nothing is scored when dna[0] has no k-mer at all.
    best_score = None

    for i in range(len(dna[0]) - k + 1):
        motifs = [dna[0][i:i+k]]
        for j in range(1, t):
            profile = ProfileMotifPseudo(motifs, k)
            motifs.append(ProfileMostProbable(dna[j], k, profile))

        score = ScoreMotif(motifs)
        if best_score is None:
            best_score = ScoreMotif(best_motifs)
        if score < best_score:
            best_motifs, best_score = motifs, score

    return best_motifs

### Test Cases

In [18]:
# Small test harness shared by the test cases below
def TestSuite(function, cases):
    """Run ``function`` against ``cases`` and print a pass/fail report.

    Args:
        function: Callable invoked as ``function(dna, k, t)``.
        cases: Sequence of 4-tuples ``(dna, k, t, answer)``; a case passes
            when the function's result equals ``answer``.

    Returns:
        None. The report is written to standard output.
    """
    banner = "*"*50
    print(banner)
    print("TEST SUITE\n")

    passed = 0
    for number, (dna, k, t, answer) in enumerate(cases, start=1):
        result = function(dna, k, t)
        if result == answer:
            passed += 1
            template = "- Test Case {} Passed. Expected: {}, Actual: {}"
        else:
            template = "- Test Case {} Failed. Expected: {}, Actual: {}"
        print(template.format(number, answer, result))

    print("\n{} out of {} passed.".format(passed, len(cases)), end=" ")
    print("END OF TEST SUITE.")
    print(banner)

In [19]:
# Create test cases to pass into test suite.
# Each case is a tuple (dna, k, t, expected_motifs), which is the shape
# that TestSuite unpacks.
# case1: five DNA strings, k = 3, t = 5.
case1 = (["GGCGTTCAGGCA", "AAGAATCAGTCA", "CAAGGAGTTCGC", "CACGTCAATCAC", "CAATAATATTCG"], 3, 5, ['TTC', 'ATC', 'TTC', 'ATC', 'TTC'])

# case2: eight DNA strings, k = 5, t = 8, followed by the expected motifs.
case2 = (["AGGCGGCACATCATTATCGATAACGATTCGCCGCATTGCC",
          "ATCCGTCATCGAATAACTGACACCTGCTCTGGCACCGCTC",
          "AAGCGTCGGCGGTATAGCCAGATAGTGCCAATAATTTCCT",
          "AGTCGGTGGTGAAGTGTGGGTTATGGGGAAAGGCAGACTG",
          "AACCGGACGGCAACTACGGTTACAACGCAGCAAGAATATT",
          "AGGCGTCTGTTGTTGCTAACACCGTTAAGCGACGGCAACT",
          "AAGCGGCCAACGTAGGCGCGGCTTGGCATCTCGGTGTGTG",
          "AATTGAAAGGCGCATCTTACTCTTTTCGCTTTCAAAAAAA"], 5, 8, ["AGGCG","ATCCG","AAGCG","AGTCG","AACCG","AGGCG","AGGCG","AGGCG"])

# case3: eight DNA strings, k = 5, t = 8, followed by the expected motifs.
case3 = (["GCACATCATTAAACGATTCGCCGCATTGCCTCGATAGGCG",
          "TCATAACTGACACCTGCTCTGGCACCGCTCATCCGTCGAA",
          "AAGCGGGTATAGCCAGATAGTGCCAATAATTTCCTTCGGC",
          "AGTCGGTGGTGAAGTGTGGGTTATGGGGAAAGGCAGACTG",
          "AACCGGACGGCAACTACGGTTACAACGCAGCAAGAATATT",
          "AGGCGTCTGTTGTTGCTAACACCGTTAAGCGACGGCAACT",
          "AAGCTTCCAACATCGTCTTGGCATCTCGGTGTGTGAGGCG",
          "AATTGAACATCTTACTCTTTTCGCTTTCAAAAAAAAGGCG"], 5, 8, ["AGGCG","TGGCA","AAGCG","AGGCA","CGGCA","AGGCG","AGGCG","AGGCG"])

# case4: eight DNA strings, k = 5, t = 8, followed by the expected motifs.
case4 = (["GCACATCATTATCGATAACGATTCATTGCCAGGCGGCCGC",
          "TCATCGAATAACTGACACCTGCTCTGGCTCATCCGACCGC",
          "TCGGCGGTATAGCCAGATAGTGCCAATAATTTCCTAAGCG",
          "GTGGTGAAGTGTGGGTTATGGGGAAAGGCAGACTGAGTCG",
          "GACGGCAACTACGGTTACAACGCAGCAAGAATATTAACCG",
          "TCTGTTGTTGCTAACACCGTTAAGCGACGGCAACTAGGCG",
          "GCCAACGTAGGCGCGGCTTGGCATCTCGGTGTGTGAAGCG",
          "AAAGGCGCATCTTACTCTTTTCGCTTTCAAAAAAAAATTG"], 5, 8, ["GGCGG","GGCTC","GGCGG","GGCAG","GACGG","GACGG","GGCGC","GGCGC"])

# case5: larger input with 25 DNA strings, k = 12, t = 25, followed by the
# 25 expected motifs (one per input string).
case5 = (["ACGAGAACTGTAATGGAGACCAATCGGGTCGTATGTACGACACGGATCTTCTGTATCGATCATCGCTTAACTTATACGATCTCATTCTCACGACGATCCTCAACCCCGGATACCCGCACTGCCTCAATCCGAAGTACTGCGTAGTACTTTACCCTT",
          "ACCTGTGATCACTCAGAGAACAGAGGATCCGGGTTGGATGTCAGTGTTATGCCAAGAAACGAGACCTAAGGTGCCGTCCCCGGCGGAATGCTTCTCGCTTCCCCTTCTAAAGGGTCCTGGCAAAATGCTTGTGACTTTGAATGCCCTCACTGAACT",
          "AAATGTGAAACCTATATCAGTCATTATACCGGCGCGATGTTAACTCGCACCTGATTGCAAGGTCACTGATCGCGTCACTACACTAGAGTTTATTCATACCTGCATGGGGGGCATGATGGATGAATTTTAACTAGGTGATCGTGACCAATGTTCACC",
          "TTTAACCGAATGAGAAGGGTTTCGTTTTAGGCCGTCGATCCGCCGCTTCTTCTCTCGACTGAATCGGAGGTTTTAATGTGTGTGCTAAAGTGAACTGCCTAATAACCCGCTAGGGGGATGATTTATTGTATCTGCAATGTACGGTGGTACATCCCG",
          "GCTTTCCCGTGTTCTTCGTGCGCTCGGGACCAGAGATCAGAACTAGTGCTAAAGCCATGGAAGCATTTAGCCGGCCGATGTAGTAAAGTTGCCCATATTTCTCCCTAAACCAGCGTATACGTCGAAAACTTTCTTCACTCCACTATGTATAGATGG",
          "CGCTTAGCGCATCCTCGCTAAACTAATTTGTAGGAGCAATCGCATGTCGACACCGAGGAAGACAACAAGTACATTAGTTACACGCTTCTTACTGGGGATAAAAATCTAGGATCGCGTGATCGGCGTGCTCCGGCGTCAGGTGACTTGATGGCCCCA",
          "AGTCATGACAATTCGCACGAGGTGACTTTCAATCGACTTACGTACTGCGCGTCGATGTCGCTTCACTCCACTAATACCCATATTCCTAACACGCCAGTACGGTTCAGAGTCGGCGGCTGAGGGGCCCTAGAAACGAGACACCCTAGAGGCTCTGGG",
          "ATAATAAACGAGATTTAGTGCTCCAGGTCCCTCTAACTTCGCTAGACTGTTCACGGTACTTAGAGTAGCTCAGAAATCGCCTGTCTTCGGGTTGGTTTTTTCAGGAGGTGCTCTGTGCGTTAACATACCAAGCCATAGCTGCTTTTCCTCTACAAA",
          "CCTACCGGAGGACTTCACTGAACTAAGTACTGGGGTGCTAAGGTCGAACGAGATGACTGGACCCTACTCTCTACGGGACCGACGCCCCAGGGCTTAATTCATATTGACTAGATTTATGATAATAATAGACTCGGCGGTTTGTAGCTTTCCCCTGAA",
          "CCGTTACTCCGCCGCTGCTGATCCTACTCCACGTGGGGCCCCCCAATATGCACATCTTATCGTCCCTGGACTGGCAGTTGATGCGAAATAATATTCGTGGGTATGATAACGCGCTATACTGATAGAACCACGGGGACTCCTGTATTCGTCTCGCCA",
          "ATTCGTGGGTTGAAGCCTTTAAACGGGATGGCCAAGTTGATTGGGTCTAATTGATATTAATTCTGGTGTACTGTACGAGACCGGTGCAGCACGGACGGGCGGTTTCAACAATACTCGTGCGCCTGGAGGGTACCTCGCTGAACTCGCATATCAGGT",
          "TGGTTACCCTCTCTTCACTTAACTCTTGTATCAAGACGTTTCTGTGAGACAAAGCAATGGCCGGACTTTGGGGCGCGTGCTCTGGAGATCCAAGCACTCGAGGTCAGCGGTATAATATAACGCATCCAACATGCAGACTGTGCGTGGGGGCCCAAA",
          "CCTTAGCGGTGTCGGCCATCATTTATCGAGCTGGAACTCCGGTGGGTAACGTGGATCCGCAGCAGGCCTTACCGTCACTTAACTTGTTACTAGAACATACGTGGAACCTATGCATGATCGAGATAGAGTGCGCTCCGCGGTACACCGCGCTCTATA",
          "AGTTACAACACGACAGGAACTATTTGCTAGGCGTTACATCTCTTTACTTTTAGCCCCTGGATTTTGAACGCATGTCAACACGTTCCACCATGGGTATAAGAATGCATGGACAGGGTTAATGAATGTGTCTCGGTCCGTTAGCTGTTACAATATACC",
          "CTTAGCAGCCCACGTCGCTGGACTGTGTTATATTACGAGGTCGAATAGTGAGGTTAACAGTCTCCGTTGTAACTTAATCCCGATATCACGCAGTGTATATGGTCGCTGTAGCTTTCTGGGCAGCTCGCACACCGCCAATTCGCAGAGGCGACCAGA",
          "TGGTAATAGGCTTCGAAATAACTCTTGGATTGCAACGAAGGTCCGAGCCTTCTCTGCACTTGGATACATTTTGGACATATGAAAGGATGGGTGCTCGGGATGGGACTTTGGTTGCCTGCAAGACGGCGAGACCACCTTACTGAAACCAACATCTTA",
          "TTACCGGTAATCTCTGATCGCCCATGCCGTCAGGTGCCTTAATTTAAGCGAGAGCTAAATAGAAAGCTGCGCGGTTTTAGCAAAATGAAGTATCAGGAATAACATGGGTTAATGTCACAAACCTGAGGGTTACCTCTCTGCACTCCAGGTCCAGCA",
          "AATTCAGCTGGTCAGTCACCACAACGTCTCTAGACTCGCACACCCAACTATTATATCACGTACAAGCCGCCCCACAACCGGCATGATAATGTCTGCACGGCCCAACTAACACGCCAGATGACGTACTTTTCGCGGCAGAGCAGTATTCGAACTCAA",
          "CGTACCCGTTATACAAGCACCATCTACAAAACGTTAGGTGTCAACGATCGTGGGCCGGACTTAGGGGTGAGACCTTAAAGCACACATTCCCTCCCACATCACTTCACTTGTCAAAAATAAAGTCGAATGATGACTACCTCAATTCCTCGCGAAAGC",
          "GCTGACACGTATTACGAACCGAGACCAGGCGCGACCCCAACCTGGACTAACAGCTATCCCTTTGTTACTTGGCACGTACGGTCAAATCCCGGTGGAGTTATTTACCGAGGGTGCGCCATGCATCGCTCAACTCCTAATGGTCGCCTGTACTTGGTC",
          "CGGCCTTGGTCCAATTCATCGTAACATTCTGTGAATCTACGGGTACTACTCGACCACGCTTGCTTCGCTGAGCTGTACCGCAAAATCGAATGGACCCAATAATCTGAATCCTTCGGTATACATCACTAGACTCACGATTCAGTATGCCCTCAATCA",
          "ATCCGAAACATTCAGTTCCGATGGCAACGACGACCACCCAGCACGCATCATCACTCGACTGACCCTGCTCGAATACAGGCGTATCTAACAACCAGCGGATCCAGGGCCCATGCTGAGGCTATTGTCACTCCCGCCCACCCTGTATGTATTCGGATT",
          "CCCGGATCGCCCGGTCAAGCACTCCAGGGCTTCAGCGCTGTGTACCTCTCTGCACACTCCGGTGGTGGGCCCCGTCCCTACACTGCGTATTCAAGTGATAGCTCGTACGATCCGCATCGAAGGCTGACTGCCCCTCTACAACAGTGCGCTCGCACT",
          "CCCGTATGTAGGTGATTAGACCCACCAAGCAATCCGCATGTTGCGTCACCGTAGATATATGCAGCGGTCATTCTTCGCTTGACTCTCTGACTTGCCCATTTAGAAACTATCCACTTAATTCCCCTTGGACTTGGGGCCTGTTTTCGGTCGCTTTGT",
          "CATTTCTATAAAGCTACAATAATAATCCGCGCTGTCGGCAGACGTGGTACCGACCCTACTCCTACCGTTTGAGAGATGGAGGGTCTTCCCTGAACTAACGGCATGCATGAGAGGGGTACGACCCTGGTACTTCTGAAACCAGCATCCGCGGCGACG"],
         12, 25, 
         ["CATCGCTTAACT","CCTCACTGAACT","CGTCACTACACT","CTTCTCTCGACT","CTTCACTCCACT","CCTCGCTAAACT",
          "CTTCACTCCACT","CTTCGCTAGACT","CTTCACTGAACT","CGTCCCTGGACT","CCTCGCTGAACT","CTTCACTTAACT",
          "CGTCACTTAACT","CATCTCTTTACT","CGTCGCTGGACT","CTTCTCTGCACT","CCTCTCTGCACT","CGTCTCTAGACT",
          "CATCACTTCACT","CATCGCTCAACT","CATCACTAGACT","CATCACTCGACT","CGTCCCTACACT","CTTCGCTTGACT",
          "CTTCCCTGAACT"])

# Collect every test case in the order it should be reported.
cases = [case1, case2, case3, case4, case5]

# Run the greedy motif search (with pseudocounts) against all cases and
# print the pass/fail report.
TestSuite(GreedyMotifSearchPseudo, cases)

**************************************************
TEST SUITE

- Test Case 1 Passed. Expected: ['TTC', 'ATC', 'TTC', 'ATC', 'TTC'], Actual: ['TTC', 'ATC', 'TTC', 'ATC', 'TTC']
- Test Case 2 Passed. Expected: ['AGGCG', 'ATCCG', 'AAGCG', 'AGTCG', 'AACCG', 'AGGCG', 'AGGCG', 'AGGCG'], Actual: ['AGGCG', 'ATCCG', 'AAGCG', 'AGTCG', 'AACCG', 'AGGCG', 'AGGCG', 'AGGCG']
- Test Case 3 Passed. Expected: ['AGGCG', 'TGGCA', 'AAGCG', 'AGGCA', 'CGGCA', 'AGGCG', 'AGGCG', 'AGGCG'], Actual: ['AGGCG', 'TGGCA', 'AAGCG', 'AGGCA', 'CGGCA', 'AGGCG', 'AGGCG', 'AGGCG']
- Test Case 4 Passed. Expected: ['GGCGG', 'GGCTC', 'GGCGG', 'GGCAG', 'GACGG', 'GACGG', 'GGCGC', 'GGCGC'], Actual: ['GGCGG', 'GGCTC', 'GGCGG', 'GGCAG', 'GACGG', 'GACGG', 'GGCGC', 'GGCGC']
- Test Case 5 Passed. Expected: ['CATCGCTTAACT', 'CCTCACTGAACT', 'CGTCACTACACT', 'CTTCTCTCGACT', 'CTTCACTCCACT', 'CCTCGCTAAACT', 'CTTCACTCCACT', 'CTTCGCTAGACT', 'CTTCACTGAACT', 'CGTCCCTGGACT', 'CCTCGCTGAACT', 'CTTCACTTAACT', 'CGTCACTTAACT', 'CATCTCTTTACT', 'CGTCG