<a href="https://colab.research.google.com/github/byunsy/bioinformatics-algorithms-py/blob/main/BA_2F.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Randomized Motif Search Problem

### Function

In [1]:
def ProfileMostProbable(text, k, profile):
    """Return the k-mer of `text` that is most probable under `profile`.

    Args:
        text: String made of the characters "A", "C", "G" and "T".
        k: Length of the k-mers to examine.
        profile: Four rows in the order A, C, G, T; profile[r][j] is the
            weight of base r at position j of a k-mer.

    Returns:
        The k-mer with the highest product of per-position weights. Ties go
        to the earliest k-mer, and text[0:k] is returned when no k-mer has a
        probability greater than zero.
    """
    alphabet = "ACGT"
    best_kmer = text[:k]
    best_prob = 0

    for start in range(len(text) - k + 1):
        window = text[start:start + k]
        likelihood = 1
        for pos in range(len(window)):
            # Row is chosen by the base, column by its position in the window.
            likelihood *= profile[alphabet.index(window[pos])][pos]
        # Strict comparison keeps the first k-mer among equally likely ones.
        if likelihood > best_prob:
            best_kmer, best_prob = window, likelihood

    return best_kmer

In [2]:
def CountMotifPseudo(motifs, k):
    """Count bases per column of `motifs`, starting every count at 1.

    Args:
        motifs: Iterable of strings made of "A", "C", "G" and "T".
        k: Number of columns in the returned count matrix.

    Returns:
        A list of four rows (A, C, G, T order), each with k integer counts.
        Every cell starts at 1 (the pseudocount) and is incremented once per
        occurrence of that base in that column.
    """
    alphabet = "ACGT"
    counts = [[1 for _ in range(k)] for _ in alphabet]
    for motif in motifs:
        for col in range(len(motif)):
            counts[alphabet.index(motif[col])][col] += 1

    return counts

In [3]:
def ProfileMotifPseudo(motifs, k):
    """Build a profile matrix from `motifs` using pseudocounts.

    Args:
        motifs: Iterable of strings made of "A", "C", "G" and "T".
        k: Number of columns in the profile.

    Returns:
        A list of four rows (same row order as CountMotifPseudo) in which
        each pseudocount is divided by the total of the first column.
    """
    counts = CountMotifPseudo(motifs, k)

    # The first column's total is used as the divisor for every cell.
    total = sum(row[0] for row in counts)

    profile = []
    for row in counts:
        profile.append([value / total for value in row])
    return profile

In [4]:
def ScoreMotif(motifs):
    """Score a set of motifs by counting bases that differ from the column majority.

    Args:
        motifs: Non-empty sequence of strings; columns are read up to the
            length of the first string.

    Returns:
        The sum over columns of (number of motifs - count of the most
        frequent of "A", "C", "G", "T" in that column). Lower is better.
    """
    def mismatches(col):
        # Bases in this column, one per motif.
        column = [motif[col] for motif in motifs]
        return len(column) - max(column.count(base) for base in "ACGT")

    return sum(mismatches(col) for col in range(len(motifs[0])))

In [5]:
import random

def RandomizedMotifSearch(dna, k, t):
    """Run one randomized motif search starting from random k-mers.

    One k-mer is drawn at random from each of the first `t` strings. The
    motifs are then refined repeatedly: a pseudocount profile is built from
    the current motifs and each string contributes its profile-most-probable
    k-mer. The loop stops as soon as the score no longer improves.

    Args:
        dna: Sequence of strings made of "A", "C", "G" and "T".
        k: Motif length.
        t: Number of strings to draw the initial motifs from.

    Returns:
        The list of motifs with the lowest ScoreMotif value seen in this run.

    Raises:
        ValueError: If one of the first `t` strings is shorter than `k`
            (random.randint is given an empty range).
    """
    motifs = []
    for i in range(t):
        # Bound the start index by this string's own length, so strings of
        # different lengths always yield a full-length k-mer and every
        # position of a longer string can be sampled.
        rand_num = random.randint(0, len(dna[i]) - k)
        motifs.append(dna[i][rand_num:rand_num + k])
    best_motifs = motifs
    while True:
        profile = ProfileMotifPseudo(motifs, k)
        motifs = [ProfileMostProbable(text, k, profile) for text in dna]
        # Keep iterating only while the score strictly improves.
        if ScoreMotif(motifs) < ScoreMotif(best_motifs):
            best_motifs = motifs
        else:
            return best_motifs

### Test Cases

In [8]:
# Create a function for test suite
def TestSuite(function, cases, iterations=5000):
    """Run `function` repeatedly on each case and report whether its best result matches.

    For every case the function is called once and then `iterations` more
    times. The motif set with the lowest ScoreMotif value is kept and
    compared with the expected answer. Results and a summary are printed to
    stdout.

    Args:
        function: Callable invoked as function(dna, k, t) that returns a list
            of motifs.
        cases: Iterable of (dna, k, t, answer) tuples.
        iterations: Number of additional runs per case after the first one.
            Defaults to 5000.
    """
    print("*"*50)
    print("TEST SUITE\n")
    passed = 0
    for i, case in enumerate(cases):
        dna, k, t, answer = case

        # The function under test is randomized, so keep the lowest-scoring
        # result over many runs instead of judging a single run.
        best = function(dna, k, t)
        best_score = ScoreMotif(best)
        for _ in range(iterations):
            result = function(dna, k, t)
            result_score = ScoreMotif(result)
            if result_score < best_score:
                best, best_score = result, result_score

        if best == answer:
            print("- Test Case {} Passed. Expected: {}, Actual: {}"
                  .format(i+1, answer, best))
            passed += 1
        else:
            # Report the value that was actually compared with the answer,
            # not whatever the final run happened to return.
            print("- Test Case {} Failed. Expected: {}, Actual: {}"
                  .format(i+1, answer, best))
    print("\n{} out of {} passed.".format(passed, len(cases)), end=" ")
    print("END OF TEST SUITE.")
    print("*"*50)

In [11]:
# Create test cases to pass into the test suite.
# Each case is a tuple (dna, k, t, answer), matching how TestSuite unpacks it:
# the input strings, the motif length, the number of strings, and the
# expected list of motifs.
case1 = (["CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA",
          "GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG",
          "TAGTACCGAGACCGAAAGAAGTATACAGGCGT",
          "TAGATCAAGTTTCAGGTGCACGTCGGTGAACC",
          "AATCCACCAGCTCCACGTGCAATGTTGGCCTA"], 8, 5, ['TCTCGGGG', 'CCAAGGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG'])

case2 = (["AATTGGCACATCATTATCGATAACGATTCGCCGCATTGCC",
          "GGTTAACATCGAATAACTGACACCTGCTCTGGCACCGCTC",
          "AATTGGCGGCGGTATAGCCAGATAGTGCCAATAATTTCCT",
          "GGTTAATGGTGAAGTGTGGGTTATGGGGAAAGGCAGACTG",
          "AATTGGACGGCAACTACGGTTACAACGCAGCAAGAATATT",
          "GGTTAACTGTTGTTGCTAACACCGTTAAGCGACGGCAACT",
          "AATTGGCCAACGTAGGCGCGGCTTGGCATCTCGGTGTGTG",
          "GGTTAAAAGGCGCATCTTACTCTTTTCGCTTTCAAAAAAA"], 6, 8, 
         ["CGATAA","GGTTAA","GGTATA","GGTTAA","GGTTAC","GGTTAA","GGCCAA","GGTTAA"])

case3 = (["GCACATCATTAAACGATTCGCCGCATTGCCTCGATTAACC",
          "TCATAACTGACACCTGCTCTGGCACCGCTCATCCAAGGCC",
          "AAGCGGGTATAGCCAGATAGTGCCAATAATTTCCTTAACC",
          "AGTCGGTGGTGAAGTGTGGGTTATGGGGAAAGGCAAGGCC",
          "AACCGGACGGCAACTACGGTTACAACGCAGCAAGTTAACC",
          "AGGCGTCTGTTGTTGCTAACACCGTTAAGCGACGAAGGCC",
          "AAGCTTCCAACATCGTCTTGGCATCTCGGTGTGTTTAACC",
          "AATTGAACATCTTACTCTTTTCGCTTTCAAAAAAAAGGCC"], 6, 8, 
         ["TTAACC","ATAACT","TTAACC","TGAAGT","TTAACC","TTAAGC","TTAACC","TGAACA"])

cases = [case1, case2, case3]

# Run the randomized search against all three cases and print the report.
TestSuite(RandomizedMotifSearch, cases)

**************************************************
TEST SUITE

- Test Case 1 Passed. Expected: ['TCTCGGGG', 'CCAAGGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG'], Actual: ['TCTCGGGG', 'CCAAGGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG']
- Test Case 2 Passed. Expected: ['CGATAA', 'GGTTAA', 'GGTATA', 'GGTTAA', 'GGTTAC', 'GGTTAA', 'GGCCAA', 'GGTTAA'], Actual: ['CGATAA', 'GGTTAA', 'GGTATA', 'GGTTAA', 'GGTTAC', 'GGTTAA', 'GGCCAA', 'GGTTAA']
- Test Case 3 Passed. Expected: ['TTAACC', 'ATAACT', 'TTAACC', 'TGAAGT', 'TTAACC', 'TTAAGC', 'TTAACC', 'TGAACA'], Actual: ['TTAACC', 'ATAACT', 'TTAACC', 'TGAAGT', 'TTAACC', 'TTAAGC', 'TTAACC', 'TGAACA']

3 out of 3 passed. END OF TEST SUITE.
**************************************************


**Note**: It may take several attempts for Test Case 3 to pass. Randomized motif search starts from randomly chosen k-mers, so even the best result over many runs is not guaranteed to match the expected motifs.

In [12]:
# Create a larger test case to pass into the test suite.
# The case is a tuple (dna, k, t, answer), matching how TestSuite unpacks it.
# NOTE: this rebinds `case1` and `cases` from the previous test cell.
case1 = (["ACTTATATCTAGAGTAAAGCCCTGATTCCATTGACGCGATCCCTACCTCCATCATACTCCACAGGTTCTTCAATAGAACATGGGGAAAACTGAGGTACACCAGGTCTAACGGAGATTTCTGGCACTAACTACCCAAAATCGAGTGATTGAACTGACTTATATCTAGAGT",
          "AAAGCCCTGATTCCATTGACGCGATCCCTACCTCCATCATACTCCACAGGTTCTTCAATAGAACATGGGGAAAACTGAGGTACACCAGGTCTAACGGAGATTTCTGGCACTAACTACCCAAAATCCTCTCGATCACCGACGAGTGATTGAACTGACTTATATCTAGAGT",
          "CACTCCCGTCCGTCTGACGCCAGGTGCTCTACCCCGCTGATTGTCTGGTACATAGCAGCCTATAGATCACCGATGCAGAAACACTTCGAGGCAGCCGATTTCGCTTATCACAACGTGACGGAATTTGATAAACCACGTACTCTAATACCGTCACGGGCCCATCAACGAA",
          "ACAAGAACTGGTGGGGAGACTATGACACTCTAGCGGTCGCATAAGGGCCGGAAACCAGGACAAATCGATAAGATGAAGCGGGGATATAAGCCTTATACTGCGACTGGTTCCTTATATTATTTAGCCCCGATTGATCACCGATTAAAATATTCTGCGGTTTTCGAGACGG",
          "TAACCACACCTAAAATTTTTCTTGGTGAGATGGACCCCCGCCGTAAATATCAGGATTAAATGTACGGATACCCATGACCCTCCAGTCATCTACCTTCCCGTGGTGGTCGCTCAGCCTTGTGCAGACCGAACTAGCACCTGTCACATACAATGTTGCCCGCATAGATCGT",
          "ATCCGACAGAGGCAGTGAATAAGGTTTCGTTTCCTCAGAGAGTAGAACTGCGTGTGACCTTGCCTTCACCGACATCCGTTTCCAATTGAGCTTTTCAGGACGTTTAGGTAACTGATTGTCATTGCAATTGTCCGGGGGATTTAGATGGCCGGGTACCTCTCGGACTATA",
          "CCTTGTTGCCACCGATTCGCGAGCAACATCGGAGTGCTCTGATTCACGGCGATGCTCCACGAAGAGGACCGCGGCACGACACGCCCTGTACCTACGTTTCTGGATATCCTCCGGCGAGTTAATAGAGCAATACGACCTGGTCGTCGAGATCGTGTATCTAGCCCTACCT",
          "ATAGGTTAACGAATCAGGAGAGTTAATTTTACCTAGCTAGAGCGGACGGTGCCTGGCTGTATTCGCGTTTGACTTTCGGGCTCGCTGATAACTTGTGATCACCTTTTACGCTTACTGGATCCAACGATGGATCAAAGTTGAGAATTTCTGTGCCTTGGGTGTGAGCTGT",
          "CTGACGAAAGGACGGGCGGTGTACTTAGTTTGGGGTAAAATAGTTGGTATAATTCTGTGCGACAGACATTTGGTCAGGCCATACTGCCATATCGTGATGTAACTATCCACACTACGTCATAGGCCCTTGTGATCAATTAAACGTTCCTCATGCCAGGCTATCTGTTTAA",
          "GGCTTCGCGTTTAAGGCTGGATTAAGTACTCCGCCTTGTGATCTGTGATCCTCCGACCTGTGATCAGCAAGATTGGAACCTAGGTAGGCGGCGGGTCTACGCTGGCCCACAATCGTGAGTCCCCCACTCCGTAGGTTGTGGAATTTATAGACCCGCAAGGGGCACCACT",
          "AGGATGACACCCAGGATGAATCTGGATTAGGAACACCAACCCGACATATTTGTTACCGCTGCAGCATTTCGCTCTTGGACGCGTAACCCGAGATCCGTCTCGCGATCGTCACGGATCGGGATTATGCAGGCAATACCTTGTGATCACTCCGCGCTTGGTTTTGCTAGCG",
          "ACATCTCTAGTCACTTTTATTGAGCAGGTGGGCGGATTCATGATCCGGCTCTGTCGTACGTCCAACCACGGTGACATGTTCGGAGCTGTCGCCGTGGAGCAGAGATACATCGGATCTATCAATTTTACTAAGAGCAACTAGCCACGACAAACTGTGATCACCGATTGGA",
          "AATTTGCGTATCTCTAGGACTCCCTCATACAAATCAAAGCTTGGATGGGTAAGATGCCGCAGCAGCAGGTATCTCATATTGGCTATTAAGAGCCAGGCCCTATGGCCTTAGTATCACCGATCAGACGTCGCATGAGCGGGCCCGTTGTCCTATCTCTTTAGCTGCCGCA",
          "GAAGTAAAGGGGTTCCACTGCGTAGAGCGTGCCCCTCTGGTGTGCCGTACTGTTATGGTGATACAGCTTCCTTATACCCCTCGTAAAGCGGCTAATGGTCCTAATGAATGCCCTTGTGAAATCCGAATCGCTTTACAATTGCGTTCGGCGGAATGCAGTCACCAGTGTT",
          "TACACTACGCGTTATTTACTTTTACTGAGTCCTTGTCGCCACCGAACGAGGATTGTTCATTGTATCCGGAGATTAGGAGTTCGCATCGCTGACACAGCCAGTTCGTAGCAAATACCGCTGGCCCTGGGCACTCCAGATCAGAACTACTAGCCCTAAACTCTATGACACA",
          "TTGGGTCTCGATCCCTCTATGTTAAGCTGTTCCGTGGAGAATCTCCTGGGTTTTATGATTTGAATGACGAGAATTGGGAAGTCGGGATGTTGTGATCACCGCCGTTCGCTTTCATAAATGAACCCCTTTTTTTCAGCAGACGGTGGCCTTTCCCTTTCATCATTATACA",
          "TTTCAAGTTACTACCGCCCTCTAGCGATAGAACTGAGGCAAATCATACACCGTGATCACCGACCCATGGAGTTTGACTCAGATTTACACTTTTAGGGGAACATGTTTGTCGGTCAGAGGTGTCAATTATTAGCAGATATCCCCCAACGCAGCGAGAGAGCACGGAGTGA",
          "GATCCATTACCCTACGATATGTATATAGCGCCCTAGTACGGCTTCTCCCTTGCAGACACGCAGGCGCTGTGCGCTATCGGCTTCCTCGGACATTCCTGGATATAAGTAACGGCGAACTGGCTATCACTACCGCCGCTCCTTAAGCCTTGGTTTCACCGACGATTGTCGT",
          "TAGTAGATTATTACCTGTGGACCGTTAGCTTCAAGACCGAAACGTTGGTGATGCTACTTAAATGTCAAGAGTTGCGAAGTTGGGCGAAGCACATCCGTACTCCCAAGTGGACGATCGATAGATCCATGGAGTTTCCATCCATCTTAATCCGCCCTTTGCATCACCGACG",
          "TACAAGGCACAAACGAGACCTGATCGAACGGTGCACGGTCGAGGCAGCGAGATAAATGTACATTGAGAGCACCTTGTGATTTACGACCTGCATCGAAGGTTTCTTGGCACCCACCTGTCGTCCGCCAGGGCAGAGCCGACATTATATGACGCTGATGTACGAAGCCCCT"],
         15, 20, ["CATGGGGAAAACTGA","CCTCTCGATCACCGA","CCTATAGATCACCGA","CCGATTGATCACCGA","CCTTGTGCAGACCGA","CCTTGCCTTCACCGA","CCTTGTTGCCACCGA",
                  "ACTTGTGATCACCTT","CCTTGTGATCAATTA","CCTTGTGATCTGTGA","CCTTGTGATCACTCC","AACTGTGATCACCGA","CCTTAGTATCACCGA","CCTTGTGAAATCCGA",
                  "CCTTGTCGCCACCGA","TGTTGTGATCACCGC","CACCGTGATCACCGA","CCTTGGTTTCACCGA","CCTTTGCATCACCGA","CCTTGTGATTTACGA"])

cases = [case1]

# Run the randomized search against the larger case and print the report.
TestSuite(RandomizedMotifSearch, cases)

**************************************************
TEST SUITE

- Test Case 1 Passed. Expected: ['CATGGGGAAAACTGA', 'CCTCTCGATCACCGA', 'CCTATAGATCACCGA', 'CCGATTGATCACCGA', 'CCTTGTGCAGACCGA', 'CCTTGCCTTCACCGA', 'CCTTGTTGCCACCGA', 'ACTTGTGATCACCTT', 'CCTTGTGATCAATTA', 'CCTTGTGATCTGTGA', 'CCTTGTGATCACTCC', 'AACTGTGATCACCGA', 'CCTTAGTATCACCGA', 'CCTTGTGAAATCCGA', 'CCTTGTCGCCACCGA', 'TGTTGTGATCACCGC', 'CACCGTGATCACCGA', 'CCTTGGTTTCACCGA', 'CCTTTGCATCACCGA', 'CCTTGTGATTTACGA'], Actual: ['CATGGGGAAAACTGA', 'CCTCTCGATCACCGA', 'CCTATAGATCACCGA', 'CCGATTGATCACCGA', 'CCTTGTGCAGACCGA', 'CCTTGCCTTCACCGA', 'CCTTGTTGCCACCGA', 'ACTTGTGATCACCTT', 'CCTTGTGATCAATTA', 'CCTTGTGATCTGTGA', 'CCTTGTGATCACTCC', 'AACTGTGATCACCGA', 'CCTTAGTATCACCGA', 'CCTTGTGAAATCCGA', 'CCTTGTCGCCACCGA', 'TGTTGTGATCACCGC', 'CACCGTGATCACCGA', 'CCTTGGTTTCACCGA', 'CCTTTGCATCACCGA', 'CCTTGTGATTTACGA']

1 out of 1 passed. END OF TEST SUITE.
**************************************************
