# DNA Manipulation
 
In this notebook, we are going to take input of sequence data and manipulate it in various ways in order to eventual perform sequence alignment and mutation analysis


In [1]:
sequence = 'AATGGGGAAAATAGATGGTGCCGGTACCTTCGACTGTCATTCCCGTTAGGCTAAGGATTAAGGATACAGTTGGATCTTAAGTCCTACGGACCATTGTTCGGTCGGTCCGGCCCCGATGGGATTGTTCTCTATAGAGACCCCCAGTGATAGATGTCTGCCTCAATACAAAGACACAAATGTGACATATGAACCATACGCCGGCCGCGGGATTATGCGTCCATAGCTTAGCAAGCCGCAAGAGTTGATGACCGGAGTGTTGCAAACGGCGATGACAAAGGAGAGTGGAGCGCCGCTACGTACACTAATGGTGTTGGCATTGCCATTCATACGTCGCCCTGCTTAAGAAATCGTCCTGTCTACTACTTATACGCAATCGCTCGTTAGAGCGTTTTGTTCCATATGAGGTGGGTAAACGGATGGATTTCAATGAGCGTATTGTATCTGACCAATTCAACGGAATCTAAACCCTGTCTTCGGTTCCCGCAGCCGCTGACAGCACTGCTTGGAGCATATTGCTGATCATATTAGGGACACAACCTTCATTGAGTGGGGGGTGGAAGACCTTGCGTCAGACGCAGATTGGGGCTGCTCGTTAACTGTCCTTCACCGTAAGCTTGCGTTCTAGTTTCATTATACGGGCGGTTGGGTGTTGCGCGGCGTAGCAGTAATTGGTGGGGTCTATCGACTATATATAAAACGTAGAAACACCACTCGAGTCCCGGTGAGGCTGAGCTCTGTCAAATCCCGACAAGCATCGCCAAGACACATCTCCATGTTGTCACTGGTCCCCTAATCTCTTTAGGTGCCCGCAGGCCACGCCTTT'

In [209]:
def nucleotideCount(sequence):
    if not set(sequence) == set("AGTC"):
        raise TypeError("Invalid nucleotide in sequence")
    nuc_count = []
    for i in set(sequence):
        nuc_count.append((i, sequence.count(i)))
    return nuc_count

In [210]:
nucleotideCount(sequence)

[('A', 197), ('G', 215), ('T', 213), ('C', 198)]

In [4]:
def transcribe(sequence):
    rnaseq = sequence.replace('T','U')
    return rnaseq

In [5]:
transcribe(sequence)

'AAUGGGGAAAAUAGAUGGUGCCGGUACCUUCGACUGUCAUUCCCGUUAGGCUAAGGAUUAAGGAUACAGUUGGAUCUUAAGUCCUACGGACCAUUGUUCGGUCGGUCCGGCCCCGAUGGGAUUGUUCUCUAUAGAGACCCCCAGUGAUAGAUGUCUGCCUCAAUACAAAGACACAAAUGUGACAUAUGAACCAUACGCCGGCCGCGGGAUUAUGCGUCCAUAGCUUAGCAAGCCGCAAGAGUUGAUGACCGGAGUGUUGCAAACGGCGAUGACAAAGGAGAGUGGAGCGCCGCUACGUACACUAAUGGUGUUGGCAUUGCCAUUCAUACGUCGCCCUGCUUAAGAAAUCGUCCUGUCUACUACUUAUACGCAAUCGCUCGUUAGAGCGUUUUGUUCCAUAUGAGGUGGGUAAACGGAUGGAUUUCAAUGAGCGUAUUGUAUCUGACCAAUUCAACGGAAUCUAAACCCUGUCUUCGGUUCCCGCAGCCGCUGACAGCACUGCUUGGAGCAUAUUGCUGAUCAUAUUAGGGACACAACCUUCAUUGAGUGGGGGGUGGAAGACCUUGCGUCAGACGCAGAUUGGGGCUGCUCGUUAACUGUCCUUCACCGUAAGCUUGCGUUCUAGUUUCAUUAUACGGGCGGUUGGGUGUUGCGCGGCGUAGCAGUAAUUGGUGGGGUCUAUCGACUAUAUAUAAAACGUAGAAACACCACUCGAGUCCCGGUGAGGCUGAGCUCUGUCAAAUCCCGACAAGCAUCGCCAAGACACAUCUCCAUGUUGUCACUGGUCCCCUAAUCUCUUUAGGUGCCCGCAGGCCACGCCUUU'

In [6]:
def complement(sequence):
    basecomplement = {'A': 'T', 'C': 'G', 'T': 'A', 'G': 'C', 'U': 'A'} 
    bases = list(sequence) 
    complement = [basecomplement[base] for base in bases] 
    return ''.join(complement)

In [7]:
complement(sequence)

'TTACCCCTTTTATCTACCACGGCCATGGAAGCTGACAGTAAGGGCAATCCGATTCCTAATTCCTATGTCAACCTAGAATTCAGGATGCCTGGTAACAAGCCAGCCAGGCCGGGGCTACCCTAACAAGAGATATCTCTGGGGGTCACTATCTACAGACGGAGTTATGTTTCTGTGTTTACACTGTATACTTGGTATGCGGCCGGCGCCCTAATACGCAGGTATCGAATCGTTCGGCGTTCTCAACTACTGGCCTCACAACGTTTGCCGCTACTGTTTCCTCTCACCTCGCGGCGATGCATGTGATTACCACAACCGTAACGGTAAGTATGCAGCGGGACGAATTCTTTAGCAGGACAGATGATGAATATGCGTTAGCGAGCAATCTCGCAAAACAAGGTATACTCCACCCATTTGCCTACCTAAAGTTACTCGCATAACATAGACTGGTTAAGTTGCCTTAGATTTGGGACAGAAGCCAAGGGCGTCGGCGACTGTCGTGACGAACCTCGTATAACGACTAGTATAATCCCTGTGTTGGAAGTAACTCACCCCCCACCTTCTGGAACGCAGTCTGCGTCTAACCCCGACGAGCAATTGACAGGAAGTGGCATTCGAACGCAAGATCAAAGTAATATGCCCGCCAACCCACAACGCGCCGCATCGTCATTAACCACCCCAGATAGCTGATATATATTTTGCATCTTTGTGGTGAGCTCAGGGCCACTCCGACTCGAGACAGTTTAGGGCTGTTCGTAGCGGTTCTGTGTAGAGGTACAACAGTGACCAGGGGATTAGAGAAATCCACGGGCGTCCGGTGCGGAAA'

In [8]:
def reverse(sequence):
    reverse = sequence[::-1]
    return reverse

In [9]:
def GC_content(sequence):
    GC_percent = (sequence.count('G') + sequence.count('C')) / len(sequence)
    return GC_percent * 100.
GC_content(sequence)

50.18226002430134

In [10]:
fasta = """>Rosalind_0814
GGGTCGTGGAAATACCCCTTTCCCACATCTAAGCTCAAAAGGAGTTCCACCCACCTTACT
GCCTTCAGGGCGTCTATGCTGGCCCTTCGGTTGCGTAGTTTAGACTAGACATCGCGCTAA
CGGGAAAAGGAACGCTAGATCCTTATCGGGAGAATTCAAAACGAATAGCCCACCGACCAT
CTATACGAGCGCTCGCTTACCACCAAAAGAGAATTGGTCCTCTATAGGTATATGTAACGT
GGGAGGGTCGAAGTGTCACGACGCATTGGGTCATATCATATCACATATTATACGACGTAA
TTCACCGGGTGGATTCGTGAAATCTAAACTTAAACCGACGGTCCGTTACACGCTAGGCAA
TAAATTTCAGCGAACGGCATGGTAACCTCACGCCCGATTTACCGGCCACGGATAAACACC
AATACGACGAGAGTTGTCTCTCAAGGCGCTCAGTTGTGGATGTTACGCTCATTTCACAAA
TCAATCGTTGCGTTCATAAGTGGGGACGCGTCGCTTCCCGCGAGTACCTAGTAAGTATTG
GAAACACATTCTGTAAGCTACCTGGGGTATTCTCTGGAGCTTATTAAGACCAAATGCATC
GAACAAGCGCATTCTAAAGTATTACATCATCAGCATGAGCCTTTAGAGGATCTATTGAGC
TGGGACATATGGAACTATGACTTCATACCTAACTTCCACCTTATGCCTCAGGCTAAGTCA
TGCTCGGGTACAGCCGTAGGGACATACATACGGAGCACTATTTTGTTCTTCTACGAACAT
AGTCTATAGCCCGAGTCAATTTAGATAATACTACAACAAACAGTGTACTCTGCGATATAT
TCCATTCCCACTGAATTCTTGTTTGGACATGGATATCTGTGTCCCAGCAGAATCCAGATT
CAGCCAGCTATGCGGGAATGAAAATATGAACTCCCGATCTGACACGTGGCTCGGTATTT
>Rosalind_7750
ACGCGCTTCGTTCTATTACAGCTAATGTCATCAGAACTTGTGCTATTCACAGTGAGCCGT
AACAAGAACCGGCCACTAAGGAGGGGTCAGGATCAGCATTATCTTTATGCATACATTTAG
ATAGCCTAACTATTGGGAAAGCACAGCGCTCCTTAACAGTGACAAGACACGGCACCAGGG
GTAGCAGGACACACAGGGGATATAACGGCAATGCTTTTAGGATAATAACCGCCTGGTTAT
CTTGTCCTGTGGACGTAAAGAACCTGTAGGATTAGAAGGAACAGGCACGTTCTGTGACTA
TCACGAATTGCCGAGTAACGAGTACACCGGCGTGACATCATAACCCTCCTCTCCGTGGGA
GGGTCCGCGAGGAGCCCGCTTTTAGGAGGTTGACTTCTATTCGTTAGTCGTTTTCCTGTT
CGTCTCCGAATCATTGAAAACACTTATCGGGGGCGAGCCGGTCGAGTTCCCCTAACCCAC
CATAAGTCTACTTTTGGCATAGCCGGTGGAACTCTATGTGCGAGATCATCTACCTAAGAA
CCACATTCAGGCTAATCCCAGCCTAGTAGGGAGCACCCCGCGTAATAAATAGATTTACCG
GATATCCTCAAGTGTTGCAACTCGGCCAGCTGTCAGGCCAAATCCTTGCGGCAGAGAGGG
TGAGGTTTTGCAAACCCACTCATAGCTAACTTAGCTCCGGTTCTCATCGGTAACATACAG
TTCATCTACTTTTAGGCCGAGTTCTACCCCGAGTGAACATCCCTTAGACTTTGTTCCTTT
TATATGACGCGATCTGCTGATTGGTATCG
>Rosalind_0569
TCATACACGATGCGCTAGTCCCGCCTTCCTGCTCTACCTCCCTCTCCCTAACTTCGATAA
TCTACCTCGACTACTTGCGCCCGCCGTAGAGTGGCTTAGAAAAGACAGTGGCGGCTCCTT
TCCGAGTATGCCTGGCCATTCTTTGTCGCATTGCCTCTAAGAAAGCCTCATTGCACCTCG
AGAAGTTTCAATCCTCATCTGTCTCCTGGCCCAACTTTACACGACTCTAGAACATGACAG
ACAATGCAGAGTTTGCTCATCATCGTTGGACAGCCTTCTGAAGTGCTCTATCGATTGTCA
TTTAACAAGGGATTCATCGTATCTCTTATACCTCGCGAGCACCGTCGAATCTTGTAGCTG
CGCACCCCTTGGAATCCGAGATAGCTTAATCCAGAGTCTGATCGGTGTTTCTTGTGGCCT
TCAAAGTTCAGAGGGTTGAGCCATATAGAAGGCGAGCGGCCCCACTGGGGTGTTCGGCAG
AGCACCGTCCTCACCCTCTGCAAGAGATGTTGCAGACCCGCCCAACAGTGCAGACGTTAC
GACGGAAGCGAATGAGTCAAATCTGTTGGCACAAATTCTGGAGCCGACGGATACTATCGT
AACTTTAGCATGGGCGGTAACTGCCTCGATACCCGGCGTTAATGTGTTTGTCCAGGGATT
TCTTATAACCGATGGCCACATTACTATAAGGTGTTTCTGCAAACGCGTAAAATGCTGCAG
CAAATGCTCTGCTTAATCCCTCACCCTGCCCAATAGTACTACATTCTCAGTAAGCCTGTT
CTCGAAGGGTGATCAGAAATCTCAAGA
>Rosalind_8621
AGCCATCCACAAATTCTGGGTAGGCTCCCAAGTCTCAGCATCATGGGGTAGATCAACTGA
TGTTTATCGATTGTGCTCATCAGAAGTAAGGTGGCTTTGCTATACTAACCGCCAATCTCC
ATCTATGTCCCCCACTCGGATTATCGACGGGCAATCGGGTCCTGATGGTTATGATACATT
TTGCGCAATAGATCCCAAAGTTCGGGGCTTGCATAGTATTCTTGGCGGAAAACTCAACTG
TAGTTAAGGCTACACTCTATTATCGCCTTCTCCTTAACTCGCGGAATTATCAGTATGGCG
CCTGATCTGCCTTCCCAGATAAAACTCCTATCAAGTTGCTTCCTGACCGCCGCGGCGCCT
GGATCTGTGACCACGAGACTCCCGTATAGGCTAAACGGAACATCCCCCGTAGGCTAGAGC
CGTGCCTGGGATGTGTTGTGTCGGACGTTCCGCTGTCTGGCGTAATCTGACAGCTTTCCT
AGTGCGCCTCCAACACGGGGCATAGTATGACCGTTGACACTGTTAAAGCACTCGTCGATT
ACAGTGCCGTTTGGCTGTCCGCTATTCTTGCCGGGGACGCATTGGGTATAGCTAATACAA
CTTACTCGTGCTCGACCTACTGGGGGCTATAACAGCCGCGAAGATGACAACGAAACACCA
ATGCACGGCATTAGCTGAATGGGCGTCGTTATCTAAATCGTTCCACGCATGTCTAATCTA
GACTTAGAAAGTAAAAGCACAACTGTCATAAGGGGCCGGATCCCGCTGACCATGAGTCAA
AGGGGTAGCGATAATTCCAGGTTACGCTCGGCCAAGGTATACGGGCTATCGAG
>Rosalind_1457
AAAAACGTTGATGCGGTGGATCGTACTATAATGGTTTCTGCTCGCTCCTCTTATTTCGCG
CAAAGAACATGCTGGCACTATAAGGGCAGTCTCGTGGGTAAAGACGGGCCAGACAGGACC
TCGATTTGGCGAAAGTGATGGGCGTGCATGTCGCCTTTTACGTACCATATGTAGGTAGGC
GACGTGTAGAACCGCGGAACTATCTCTTCCCGATAGATGGTTATCCTCCACATCCCATAA
GTTCTATAGCGACATGGCGTGCGTCCGAGATGTTGACCTGTTTGTATCCTTCGAAGCAGG
ATATATTATAAAAACAAGTCATTTACTAGATCAACGGTTTCGCGGCACAATATGACCCCG
AGCTGTTTGCTCACACTGCGCCGCGTCGGTGACCGTGGAACCCATGAGCACAAAGATCGG
GGTTTGGCGTCTAAGATTTCCAGAGATCTCTATAGGAAGGCAAGTGCAAAATGAGCCTTT
CGCACTATCGGTGGTTGATGTTGCTGTTCGAATAGGTATGCACCTGAAGTGGGTTGTCGT
TGCCAGGTCCCACGTTGCTGCTAGCCCTGTCTCCACGAGTGACCCGATGCTTGCCACTCA
TGTAAGCACTAGCGCACTAGAAGTTTTCCCAACAGTGTGGGTATTCCGTCTAACCTTTAT
GAGGGTAGGCTCCGAGTGTCGGGAGGTTCTCCGTTTTATTTGATAGTAGTTGGAGGACTC
TGCTTGCATCGTACCGATCCGCGAGTCATACAGCGATCGGAAAATAGGTGTTGATGATAC
TTCAAAAGATCGCCGGCTGGGCAGATTCGATGAGCGAGACATAAGACAGGA
>Rosalind_9445
GCCTGTCGGTACCCGCGGCGCGGATATATGGACGTTCTCTTTGAAAACTCTTCGTATGGT
GTGTTGAACTAGGCTTTCCAACTGGATCGAATGACTATGCCTTGACTCTGAAAGTGGACT
ACCGCAGCGCTACGGGTACGACCTTGGCCTCTTTCCCGTTCTATCGGTTGCTATAGTACT
AATTCACACTTCTGTAAAGAAATGCGTTACGAAGCAGAGAGCTGTTGGGACGAGCATCTT
CGCTGAGTCCACATGAGATCCCAGGGCCTTTGAAGCAGTGTGCTCAAATCAGCGAGTGTG
GAGGACGAGAGCCGCCTCCAGCCGCGCTGAGGCGAGTTGCCTCAAAGTTATAGATCCTGG
TCGATCGTAGCGGCTCGATACCTTAACCCGACAAACATCAAATAACGAGATCGCTACTTG
GGACCTTAGGACAACCTTTATTATCTGGCCCAACCTGGCGCTAGATTCACTACTTATACC
CCGTCATCCCTCCTTCGATGTCGGGACATCAGGCCAGAAAATAGCATGATTATATAGCGA
ACATCCAAGAGCCAATGGCGCCATCCATGTTAGAACGCCAGCTTAGTTTAAATCGGCATA
GCCTCTAGACGGCGATAGTTGATGCCATCATTACTCAACGTAATCCTTGTAGGTCCCACC
AAGCGCATCTTAGCAAATATGACAAGTTCGACAAAGGAAGAGCAAGCGCATTAAAGAGAT
ACCAGTCGGTAGGCTCACCCCAATATCGGTTTATCCTATTCCCATCTCTATGCAGTCGTA
GTTCCTGTCGGAGGGAGATCTGTTGACTAAACCCAGTAAGATCTATCGATGGTTAAGGTA
TGCAACTGTAAGCGGAGTCCCTAGGCCCCATACGTGAATTGCTCAACAAGTGGTA
>Rosalind_3673
GGCGTGCGTCTGTAGCGCTACTGGAGCACTTTAGCGTTTCTCAGCACGCCTCTTTAAGTC
GCCCACCTTGGGTCCGATATCGGTCGGTCAATGTACAAATTCGAATACACCGTCGAACGT
CTCTAGTGAGTTAGAGCGGGGTGGCTAAAGCCTTGAACTCATTACCTCAACAGGTACCGA
AACTTCCCGAACCCGTATGCTGCATTTAGTTTACTACCGTCGCTCGGTGCTTAAGCTAAA
CGACCGCGGAGCCGACAGCGCCGTACATCGACGTCGCAGCGGGTCGGCAAGGGATTTACT
TCTGCAGCTGTTTCTAGCGGTTCAGTCCGCGGAGCTCTGCTAAAGCATCTAATCAGATCC
TCAGATCGCTCTAGAATCTCAGACTGTATTTGGACTAGACGGAGCGTTAGTATAAAAACT
TCGACACCCTCCGAGATCGCTAAATGCCCTCGCGCTTTAGGAATATGATATCTCATAAAC
TGGCAAGGCACAGTATACTACATTTATCAAAGAATGTTTAAGCTTGTTAGCCAAAACTGA
CCATAACGCCGGGCACGCTGGATGGTAGCACGGAAACGTACCGGACCCTAATATGTAATT
CACAGCGGATGGTCAAAATATCCTTTCGCCGATAACCCTGGGAAGACACTATTATCTTCG
GATGGTGTGGTATCGGTCATGCAGTTTCCAGATCCGTTTGCGCAGACTATAATGGTCTAA
TATGCGACGGAACCCGTAAATAAGTGAGACGTCACTCTCGCGACGTGATGGTAAGCAGGA
CTACCACATCTATTCGTCTGTTACTGGGTGCCATGCTCGTTCAGGCCCTATCTTGGATGG
ACGATCCTACAGCACCATTTAGTAG
>Rosalind_3842
CGAAATTAGAGAGGTGGGCGTGCATAATGATCAGTGACTCGTAGTTCAAAACTTCGCAGG
AGAGATGTGTCCTGACGCCCGCACTGCTAGCACTCTTAAGGGGTCGCAGGCACGGTCATC
TGCCACAGCTCCCTGATGTCAGGTATTGGTAGTGGGCTGGAAGACCCGCGGACCGACCCA
CTCTATAAGGTTACATAGTTGAAGGACTATCTCATTAGAACCAACAATTTCCAACCTCTA
AATCTAAGCGCGAATATGCTTCACAAAACTTGCGCACTCAGTGCTGCCTAGCATTAAAGA
AGGGTAGACCGGTCTTCCTGCCTATGTCCACCCCTTAGGAGAAAGTAATAACCACACAAC
GGCCAGTGGAGCTAGATCACCTCAGTGGTTACTTTGTAGTAGAGCGATCGCGCCTCAAAT
AGGTCACCCCAGTATGGGTCCAAGACGTTCCTAACATCCCTGAGATCTATACTTGAGTAC
ATGCTAAAAACCACCCTCGCAATTATTGCACCGTTATTTGGAGCTCGATGCCACAGCTCT
TTCCGGGGCGATGGTATGATTTTTTTTCGCCGTGTAATACGCACTTGACGTGATAATTCG
GGCTTCCGAATGCAATGTACGTTTATTCAATCCATGATTGGTGGTACCTTATACTGGCTG
ATCTCGGCGATAATCTATGACCGCATGTAAGGAGTGAGTCTCGCAGTGGAGCCGACGAAG
GTAATGCCGTAAATAATAAGCTAGTACCTTAGTATACGAACTAGCATCGAGTCGACGGCG
AACACCAATCCATGGGTCACGCTCGAACTTATGTAGTTTTCAACAGCCTACACCAACGGA
CCCTCGACTCATGTGGAGAGTTCAACGGTAGAAAGCCTTTAAGGTGACACAATATACCGC
ATCGGGGGTGCCCAATAAGGTTACCAATGACGAACAGGATTTACCAGACTTAGCCTTTTA
ACGTAATTTGTCTCGGCCAAGGTACCTATC
"""

def parse_fasta(fasta):
    results = {}
    strings = fasta.strip().split('>')

    for s in strings:
        if len(s) == 0:
            continue

        parts = s.split()
        seqID = parts[0]
        bases = ''.join(parts[1:])

        results[seqID] = bases
        
    return results

results = parse_fasta(fasta)
results = dict([(k, GC_content(v)) for k, v in results.items()])

highest_k = None
highest_v = 0

for k, v in results.items():
    if v > highest_v:
        highest_k = k
        highest_v = v

print(highest_k)
print('%f%%' % highest_v)

        
#for id, seq in sequences.items():
#    print(id)
#    print(seq)
#    print("*"*42)
    

Rosalind_8621
50.420168%


In [11]:
parse_fasta(fasta)

{'Rosalind_0569': 'TCATACACGATGCGCTAGTCCCGCCTTCCTGCTCTACCTCCCTCTCCCTAACTTCGATAATCTACCTCGACTACTTGCGCCCGCCGTAGAGTGGCTTAGAAAAGACAGTGGCGGCTCCTTTCCGAGTATGCCTGGCCATTCTTTGTCGCATTGCCTCTAAGAAAGCCTCATTGCACCTCGAGAAGTTTCAATCCTCATCTGTCTCCTGGCCCAACTTTACACGACTCTAGAACATGACAGACAATGCAGAGTTTGCTCATCATCGTTGGACAGCCTTCTGAAGTGCTCTATCGATTGTCATTTAACAAGGGATTCATCGTATCTCTTATACCTCGCGAGCACCGTCGAATCTTGTAGCTGCGCACCCCTTGGAATCCGAGATAGCTTAATCCAGAGTCTGATCGGTGTTTCTTGTGGCCTTCAAAGTTCAGAGGGTTGAGCCATATAGAAGGCGAGCGGCCCCACTGGGGTGTTCGGCAGAGCACCGTCCTCACCCTCTGCAAGAGATGTTGCAGACCCGCCCAACAGTGCAGACGTTACGACGGAAGCGAATGAGTCAAATCTGTTGGCACAAATTCTGGAGCCGACGGATACTATCGTAACTTTAGCATGGGCGGTAACTGCCTCGATACCCGGCGTTAATGTGTTTGTCCAGGGATTTCTTATAACCGATGGCCACATTACTATAAGGTGTTTCTGCAAACGCGTAAAATGCTGCAGCAAATGCTCTGCTTAATCCCTCACCCTGCCCAATAGTACTACATTCTCAGTAAGCCTGTTCTCGAAGGGTGATCAGAAATCTCAAGA',
 'Rosalind_0814': 'GGGTCGTGGAAATACCCCTTTCCCACATCTAAGCTCAAAAGGAGTTCCACCCACCTTACTGCCTTCAGGGCGTCTATGCTGGCCCTTCGGTTGCGTAGTTTAGACTAGACATCGCGCTAACGGGAAAAGGAACGCTAGATCCTTATCGGGAG

In [89]:
codon_table = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
    "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
    "UAU":"Y", "UAC":"Y", "UAA":"STOP", "UAG":"STOP",
    "UGU":"C", "UGC":"C", "UGA":"STOP", "UGG":"W",
    "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
    "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
    "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
    "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
    "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
    "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
    "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
    "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
    "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
    "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
    "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
    "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G"}

rna_string = """AUGGGCUUUCGCGACGCCUUUCCAGGUAGCUAUUCCGUAUGGGUAGAUGCGGUUCAAAUAGUCACUGACGUGUCUCCAUAUGCAAAGGACCGGCCCAGUCAGCAAUCGCGAUGUGCAAUAUGUUCCUCAAUCGAGACUAGUACCAGUAGUCUUGCCCGGUGUCGUUUUCAGGUUCCCUCUACAGAAAAGGAAGUUAAUGCUCGAAAAAGGAGCGAUGCAUCCGGUUCCCUUAAAGAUGUUUUGCUGUUAGUCAAGGUGCCCAGAACACAUCCGUGGGGCUCUUCUUGUGGGUUCCCAUUAUGCGCUUCGUGCUGUCCAAGAGUCUCGGACCGGUCGUUGCGCGUCAUUAAUACGUUGUUACGAUUCCGAAUUGUCGAGUUGAACCCCCGCCUUCCGCAUAGUUGUAAAUACCUGUUCUGUUCCAGUAUACAAUCUCUACUAGCUAUUAGAAGAUUAUUUGAUAUGCAGAGGCAACUUGUGGACUCACACUGUUGUAUUUGUGUAGACACUAUUACCGAGACUAACCCUCAGCGAGAGGCCACACGCGUAUCAGAUCGACUCGGCGAUGGUGUGCAAGGAUCGCAAUCCACGCGGCGUAUACCAAUCUUGGCGAAUGGUCGUAAUAGGGAUCUGGGUCCCGGGAUGCACUUCAAGCAUCAUAGCUUUUUGGAUUACUCCGGGCGCUUGAUAUAUCCCAAGAUGUGCUGCCAAAAUCGGCCUGGUGUGAUGGAUCUAUCUGAUUACUAUUCCGUUUACGGAUCAAGAGGAUUCAACUGGGGGAUGCGCCCGCAUGUCGUCUCGAAGACCAGGAUACUUACAAGUGAAGCAGCUCUAUGCCCCAAUUUUCUCGCGAUGAUCACGGCGGGCGGGCUUGACCUUAGUAACGUGGCGAAAAGAAUCUCCGAGAUCCGGCUCUUAAUUCAGGUCAAGCCUUAUGGUGGAUUUCCGUGGUCACCGACGAGCACCGCAUCUCACGGCUUAGACGAUCGGCUGAGGGUGCUGUGUCUCGCACCUGACAGGGUGAGCGAGGAAAUACCCCCUACAGGAAAGGGGCCAAAGGAAGCGCACCUGACGUACAGAUCCCGAUCAUACAAAUUUGUCACUCCAGCAUUACGAGCUUCAUACUGGCUGCAGCAAUCUGGUAAACCCUGCAGCAUGGUUUGCGUGAGUGGAGCCCCUUGUUUCACGUCUACGUCGCGCAAAUUCGGAAUCUCAACACACCACCUUGUGCCGUCGGAGCAAAUGUUUCCCACUCUAUCAAUUCGAGUAGUUCCCGCGUUACUGAAGCGGCCUGUUGUAACACCGGUCUAUCACAGGGGGAUGUCCAACCAUCAGGCUACGAAAAACAUCGAAAACCCCAAUGUUCCUCUCCAGUAUGGCGCUUACACAGUGGCCCUGGGGGAGGGUGUCACUCCUUCCCUCUGCAGGGGAUGGCAUGCAGUUAGCGCUGAGUCAGCCGAAGAAUUCUACCCAUGUAGUUCCGUAAGGGAACUAUGGGCCCCAUUAUAUGCGUUUGUCGUAGAAGCACCUAGUUCGAAGCCGCUGCCACUCAUGAGGUAUCCGAUCUGUAGUUUGGACGACAGAUGCCACCUGGGAACUGCGUACAAUGCAAUGACAUCUUUAAUACCAUAUGAUGCGGUCCAUCUAGGGAGAAGGCCUGUGUGCGUGAUUGUCGUGUUGAAUUGCUACGUGACAUUCUGUGGUAACUGCAACAAUUUUACGGCCGGCCUACGUCUGAAGAAGUUGCCUAUGCAUUGUAACUGGGCUUUCUGCGUGAUUACGAGCGGAUACCCGUCCCCAAACUGGCAAACUACGUCUACUGUCUGGAUACCUAGUACGGCGACUGCUUUGCUAAUCCUCGCUACUCGUUACCCGUCGCUGUUAUGUGAAGGCGGCGCUGAAGGUCCGGAUGCGGUCAGGACGACGUUUUUCUGGAAAUCGCCGCCUGGAAAAGCACAGGAGAGCUGUGGCAGCUUAGUCCCUGUCGUGUUCUGUCUGGUGUACAUUCCUGUGCCCGCAAACGUAUUCGCAACAGUGAGCGCACUCUUUACGCUACACUCCUUAGUUGGGUCGACUCAGUUCCAAACGAGUUGUCUAGUCACGGGAUCAGUUUUAAGUGAGUAUUCUUACAGUAAGAAUACUGUUCACCAGCUACUACUGUUUAAGAGGGGAGGACUUCUCAUGGUGGACCCAGCAUCGUUGAAACCUCUCAAGACGGGGCUUAGCGCUCAUGUUGCCGGGAAUGAUAAUUGUGCACUCUGUGACGUCUUACCUAUAAGACGGAUGUCGAGUUGCUUAAAUCUAUCACCCUCUACUACUCAUUGGGACAGCUACUUCAGAAACGCCGGCUACCACACUAAUCUACGCAGGGACGGUGCGGUCCGCCCCAUCUGCGCGGUCCUGGCCGUAAAACGUUUUCUACUCUUCCCAAGGGCUGAUCACAAUCAUCCGGUACAGUGGGUUCUUGAACACAACCUACAGCGCGGAAGUCAGCGCCGGUGGCCUCCUAUACUGGAGAUGGAUUUCGUUGUGGCUUCGGUAUUAAGCACGGCGAGAUUAGAAUGGCCCGGACGUAGGGUAAGCCGAAAUCUAAUGGGUAACAUAGCUCGACCCCACUGUCCGGAGGGUGAGCCAUGUGGACGCGCUGUAACAAAGAAUAAUUCAAUCGGUCUGUAUGAAGACCUAACAUUAAUGUCCCCAGGGGGACCACCGGCUGCCCGCGCAUUUCGGAUAUAUUUGCAUGCCUCCAUGUUAAUCGAGUUUUACUGUUGUCGGAUCCCCGGAGGCUUAAGCGUUAACGCUACAAUAUGCCUCUUUGCUAACAGAUCUUCCUUUCUAUUAGUGGUUAGGGUUGGCGACGCACAAGGUGUCGGCUGGUUACCGACAGCGAUCACUGUGAGAGGUUCUUACCAUAUAAGAUACUCAAGCUGUAAACUAGCCAUUCAUACGGAGGGCAUGUCCAAGGCCCGGUCGUUCUACCUUGCUAUAAAACCUGCCCUCAAGGCUGGCUCUGGCUCCGCGAGCACCAGGUCCGCCCGGUCUAGUUGUCUUACCCGGACGAGACUGACUACAACCAGUAGCAUUUGUAAGGACACAGAUAGACAUCCUCAAUGUUUACGAGCGGUGAGGCGCCCUUCAGUUCGACGCAGCAGGAAUUGGAUUACGGGCCCGUACGCGCCAAAGCAGGGGGACAAUAGCUCCUUGGAUGCCACGACUAUUGGUAGAAUGAAUGCCUGGUCCAAAAGGAUGCCACAGUUUCGUCAUUACCGGCUCCAAUCACCAGCAGGAGCCCCGCGGGACCAUUCUAGUUUCGUCCACGAUAUGGGGUUCAGCAGAGGGACGAUACGAACACCAAAUGGCUACUAUUAUGGCAGGCUACUCUUGCCCUUGAGUCUCAAUGUACCCUACAGAGCUGCAGUAGAGAUCAAGAGCCUCGGAUUUGUUUUGAUUCGAGCGGCUAUCCCAACCUGGAUUCGAAUGAAUAAAUUUUAUCCACCUAUGGCAACGAUGGGUAGGAGUGAGUAUCGCGGCACUGAGCUGCACGGGACGGGCUGUUCUAGCGUAGUCCCAAGAUACGUGAGCUUUAAACCGUAUGCGAUGGAACCCCUUGGCGGUGCACCUGUAGUGGCCCCCACUUAUGCACACUUGUUCAAUACUUGCGGGGGCCAGUACACUAGGCCUAUCGUUCAUACUCGUAACCGCUAUAAGAAAAUGUCUUAUACAGGGCGCACUAUACUCAGUGUAAACUCGCGUACCUUUACUAGUAAUUGGUCGGGCUACGACUUGGUUCAAACCACCCUAAGCAGCCGUGUACUCAAUGCAUGGAUGCUGGAUGCUACUCGGUUUGGUGAUAUUCUUACGUUUGUAGGCUUUACUGCCGCUUUGCCGCUUUCACCCUGGACAUUUUCCAUAGCUAGAAUGGAAUCCGAAGUCAACUCCGCCCGCUCGCAUCUGGGUAUAUCAUCAGACGGUUUCGGCUGUGGUCUCUGUGCCUUUGUCUCACCGCGUAGAUCUGAAAGACUAAUGAGGUGGGGCAGUCCUAGUUCCAUUAGAGUUAAUAAUACCUUUGUUUUACAAAAAGAUGCACGAGCAGGACAUGCCGUCCCUUCCACGCGGAGACCCGCGCAAAGGUCCUCACCUUGCCGAACGCCUUUAAUGUUUGACCCCCUCAUACACAUGUCUCAGGCUAACGCUCGACGGCUUCGUUAUGCACACGUAUCGCAUACGCACAGGCUUAUAACACCACGUUUUGCAGCUAACCGCCUCUGGCCUCCGCUGAUCAAGGCGCAAGCCCGAUAUCUCCGGUGGCCCUUACCUAAGUGUCUCAUUCCCUUUCCGCCGAAGGCGCGGUUUAUGUUCCUAAAAGCGCGUCUCCAUGUACUUUCAUAUAAGUUCGAGACCUAUAUCCGCAAUUCCGGGAUUCGGAAGAACCGGGUCCGAGGCUGUAACGAUUUCUCGACUAGGUUCACCUCAGUGUUCCGCAGAUAUCCUAUCAAAAUCUCUGUACGUUGCUGCGUCCAUAUAAUUGGAGCGCACCUCCCUCCUUCUAUUUACAUAUAUAGCAUUAGAACAGAGAAGUGCAAGGCAAGGGAAGAAUCGCUGAUAAUAUUUUCGAGACCAUCCUUGCGAGCUUUUAUGAUCUGGGAAGGAACCAAUACGACUCAAGCUGCCAGUUAUUUUAUACAGGACACUUUUGGACAUAGCGGACCGCCAACGAUUAGAUCACAGGAGCGAAAUCGUCAAUACUCGCAGAAGAGAAGCGCCACACACUUCUAUAGUCGAAUUGGUGUAGGGCAGCUAAGAAGCAGCGGAUAUAGAUUUAUGACUCCGGCAUUGGCGGACCUUAGUCUCACUGACCUUAGGAUAUGGGUGACCCUCGAGGCCUGUUCACAGUCUGGGCAAAACCCUACGCAUUCCGUAAGCUUCUUUCAGGCCCUCGGUCUUGUUGACCCCAGACGAAUACCGCAGGGGUUGGCCGGCCACUCAGGCUCAACCCCAACGGUACUUAGAGGUAGGUGUUCCACGAUCAAACGUCGCGCGGCUGAAACAAUAACUUCUAGCCCGAAGCAAAGCCGCAUUUCUAGAUCCACUAUCUUUGACAGGUACGCCUACCUUUUGUACGGGUUAACCGUGCGCCGAAUAGGCCUCUGUCAGAUGAUAGGCGUUAACUUUAAUACUCAAUGGCUUUUCCGAUGGGAAGCCAACUACUUCUCAGUUACAUCGGAUUGUGCCGUCACCAUCCUAUGCAAUGGGGGCCCCGGAAUCGCCGCGUCAGCCUUGGUCGGUCCCCGUUGUGGCUGCACCAGUCCUACCAGGCUGACCUGUGAGAUUGUGCCAUUGUCACGCCGGCCUGGGAUGAAAAAGUCAAGGACACGAAUCUGGCUAACAUCAGUAUCGACGUACCGUCCCGACCCCGUUGAAGCCGUGGAACCGAAUCGUUCAAGCCGCGCCGGUUGUGGGGGGCAGCUAUUGAGGCCGACGGUGCCAAAGGGUACGGUUGGCUCUUCAAUUGACUCGACACCUUAUCACGACGGCCAACAAUGGCCUUCCUUACUGACUUGGGUCAAACCCAGUCUUCACGACGCACAACAUGUGGGUUCACGGGCAUACACUGUCGACGGGUACCCCGAUUACCUAAACGACCUGCACACACCCCGGCAGAAAGCGCAAUGUGAUUUGGAUUUAAGCUUAGGUAUGGGCCCGAAUGAGACUGCACCUUUUCGCUCGGUCUGGUCCCAUACGAGUACGAUCCGGCCAAGAUCUACACUACCUAGUGUGAAGUUCCUCUAUAGGGUGGAGCGCAAUGUGCUAGCGGACUUUCUUGUAACAGAACAAGUUAGAAUCUUAGGCGUAUUGAUUAUUGAUGCGGAGCUACGAGGGAGUUCCCAGUUUCAGUUGAAAAAACGGUCAUGCGACAGGGUAUUGACUCAGCUAACCUAUUUGGAUUUACAGCUCCUACCCGAGCCGCGGUUGGGAUUCGUUCUGACUUGCUCUGCAAAUCCACGAAGUCGCAUAAGCCAUGGCUCCCACGCGAUCAACCGCGGCGUAGAAAAUAAAUAUUCUAGCAUCGGUUCAACAAAAAGCAAGACACUGGCGAGAGUUUUUUCGACUCCGUCAAGAGCAGACUCUCUACCUCUAAAUACCUCUUUAAAGGACACGUUUACCAUCAUUCUACUUCUUUUCUGCGGACGUUGGGCCUCCAAUGCCUUUAUCACAGCCCUAACGUACCAUUCUCCACUAGAGUACGUGAGACGCCCCAUGUCUCCCGCCCGCGGUGAAGCGUGUGACUAUGAUCAGCAUGUGUUCUGGUAUCCCGAUCCAGGAUCCUAUAAACCUUAUUUGAAGGGGUCGUCCGUGCCCAAGGUGCAAGUUCCGGCCGGCAGGAUGGCAAGAUUACGCUCACUCUACAAGGUAGCACGGGGGUUUGGCCGAAGAGUUGAGAUUAACCUCAUUAUAGCAGCAUCUUUGCAUUUCUUACACAUUCAACUCUAUACAACUCUUCUUUUAGAUUUCCUGGCAACGUGUCUCGGUAUGUCACUUGGCCUCCGUACGGGGUGCGACCCAUUUUGUUAUCCCCUGUGUUUUAUCCUAGCAACACAUAGGCCGGUAUGGCCCCGAAGCUGUGGGCUGGUCCCGCGAUCUUACGGAAAUCCUUGCUUAACGAACAUUGGUUGCGGCAUCAUUACCUUUCGGGGGUUAGGAACGGUAAAUAACCCAAGGUUAUUGCGUAGAACACCAGCCGUUAGUUUAUGGGUGAGGAUGCUUCACCAACUUCUUUUCUCACCCUGCCAAAUAGUACGGGAUCACGGUCACGCGUCGUUCGGCAUAUACAUAUCCGCGCGCCGUAAGGUCCCGAUCCAUUCAUAUGCCGGCCUAGCGAUUGGAGCAUGCAAUCGCCCAGAGUACAUUAUUACAAGCGGUCCAGCACGUCAAGUAUUAACGUCGGCGACCGAUAAAGUCAUCGCCACCGAACUGGCCAACUCUUUUGGGCUCUCGUUUCUGUGCUGUUUGACCACUCGACGGCACAGUAACGUCGUAUUGUGCCUAUGCGGCUCUGGUGGAUGUCGCAGCUGGGGGGAGGAUGAGGCCAGGAGUUUUCUAUGCACUUUGGAGGGCCCAGUAAGGCUCACGUCAGAAUUAGCAACGAAUUAUGGGAUAAGGAUCGCACCGGUGUUAAGUAAAAAAGUGCGAGAUGGUGCUUCAUCGCUCGGUUCUGUUAAUCACUCUAUGACACACUUGAAACAGGGCCUAGAAUCUCACCAUGAUUGUUUGAAGGUCACAGGGCCGUAUUGGGGCGGCGAUCACCAAGGGUUUGCCCCUCAUACGAUGGAUUCCUGUAUCCUAAGUCUGAUCUUAGGUCUGCCCAUGUGUCCCAACAACCUUCAUUGUGCUGGGGAUCACACAUUCAAGUCAAAGUGUGAACGUUCACGCUGUUACGAAUGUUUGAUUGCCUGGACAAUAAUACGUAUGCUGUGUUGUAGGUUCACAAGAACUGUUGUUUCUUGUGUCUUCCUUCACAGACCACGAGCUGCCAGAGGAAUUCAAGGGGCACAGCGCCCAACUAUACGUGUGGUCCCGGAUACGGCCUGCAGGCAAUGCAGCGAGAGCAAUUAUUGGUAUCUGUCUAUAGGCCAAUACUUAACACUAAAAGUCCCAUGUGGCCUAGUGGCUCUGCCUGUGCUAAAGAAUGUAUUUCAAUGUUGUCAAGCGCCUUUGGCGACGGCGAGGAGGCACAAGUCUAGCGGAAGAUCUAUAGUCAGGAUUGCGAUGGCAAGGGACAUGAAGACGGUUGGGCGAAGAAUUAUUAAACCCCGCCCCCUAGGGACGCGGGCGGUGUUUAUCCCAGCUGCACUGGAACUCCGGGUCGCGCAUUGGAUCGGAAUCCGGUCGACCCGCGAAAACACACCUAUCUCACUCCGGGGGUGUUACCAAACACGAGCGGCCGCACAUGGAGAUAGAUCAGGGGCCUUGGCCAACAUCAACGACAAGGUGUCACCUAUCCCGAAGAUAUAUCUUCACACCAUUGAGAGAAUGUCCGACAGCAGAGGAUGCUAUGUGACAUUUACGGCGUUUCGCAAUCCGGUGAGGGCCACGAACAACGCAAAGACUCCACACUGCUUUGGGUCUUCCUUUGAUAUCCCAGGCAGCCUGGAAACCCUACUCGUUCUCUGUGAGGCUUGGAAUCAGCUGCAAGUAACAGACAAGGUCAUUAACCAGCUUGAUACCGUUGGAGCA"""
rna_string2 = '''AUGGGCUUUCGCGACGCCUUUCCAGGUAGCUAUUC'''
def codons(s, rf = 0):
    if rf == 0:
        stoppoint = len(s) - (len(s) % 3) 
        codons = [s[i:i+3] for i in range(rf, stoppoint, 3)]
        return codons
    if rf == 1:
        stoppoint = len(s[rf:]) - (len(s[rf:]) % 3) 
        codons = [s[i:i+3] for i in range (rf,stoppoint, 3)]
        return codons
    if rf == 2:
        stoppoint = len(s[2:]) - (len(s[2:]) % 3) 
        codons = [s[i:i+3] for i in range (rf,stoppoint, 3)]
        return codons
    if rf == 3 or rf == 4 or rf == 5:
        new_s = transcribe(complement(s))
        stoppoint = len(new_s[rf-3:]) - (len(new_s[rf-3:]) % 3) 
        codons = [new_s[i:i+3] for i in range (rf-3,stoppoint, 3)]
        return codons

codons(rna_string2, rf = 3)
  

['UAC', 'CCG', 'AAA', 'GCG', 'CUG', 'CGG', 'AAA', 'GGU', 'CCA', 'UCG', 'AUA']

In [87]:
def translate(mrna):
    protein = ""
    for i in range(0, len(mrna) - (len(mrna) % 3), 3):
        symbol = codon_table[mrna[i:i+3]]
        if symbol == "STOP":
            break
        else:
            protein += symbol
    return protein

In [192]:
translate(rna_string2)

'MGFRDAFPGSY'

In [15]:
def sixframes(s):
    """
    Translate sequence into protein in all 6 reading frames
    """
    for rf in reversed(range(3)):
        #codons(s, rf = rf)
        protein = " "*rf
        for i in codons(s, rf = rf):
            
            if not codon_table[i] == "STOP":
                protein += ((" ") + codon_table[i] + " ")
            else:
                protein += ' * '
        print(protein)
    print(s)
    print(transcribe(complement(s)))
    for rf in range(3,6):
        #codons(s, rf = rf)
        protein = " "*(rf-3)
        for i in codons(s, rf = rf):
            
            if not codon_table[i] == "STOP":
                protein += ((" ") + codon_table[i] + " ")
            else:
                protein += ' * '
        print(protein)
    

In [19]:
sixframes(rna_string3)

   P  C  S  *  L  R  L  H  G  D  D  P  A  T  W  I  R  V  S  F  G  I  S  L  N  D  P  S  S  I  S 
  A  M  *  L  T  Q  V  T  W  G  *  P  R  D  L  D  *  S  L  F  W  N  K  P  E  *  S  E  *  H  L 
 S  H  V  A  N  S  G  Y  M  G  M  T  P  R  L  G  L  E  S  L  L  E  *  A  *  M  I  R  V  A  S  Q 
AGCCAUGUAGCUAACUCAGGUUACAUGGGGAUGACCCCGCGACUUGGAUUAGAGUCUCUUUUGGAAUAAGCCUGAAUGAUCCGAGUAGCAUCUCAG
UCGGUACAUCGAUUGAGUCCAAUGUACCCCUACUGGGGCGCUGAACCUAAUCUCAGAGAAAACCUUAUUCGGACUUACUAGGCUCAUCGUAGAGUC
 S  V  H  R  L  S  P  M  Y  P  Y  W  G  A  E  P  N  L  R  E  N  L  I  R  T  Y  *  A  H  R  R  V 
  R  Y  I  D  *  V  Q  C  T  P  T  G  A  L  N  L  I  S  E  K  T  L  F  G  L  T  R  L  I  V  E 
   G  T  S  I  E  S  N  V  P  L  L  G  R  *  T  *  S  Q  R  K  P  Y  S  D  L  L  G  S  S  *  S 


In [17]:
def find_start_codons(s, c ='AUG'):
    return [i for i in range(len(s)) if s.startswith(c, i)]

In [18]:
rna_string3 = "AGCCAUGUAGCUAACUCAGGUUACAUGGGGAUGACCCCGCGACUUGGAUUAGAGUCUCUUUUGGAAUAAGCCUGAAUGAUCCGAGUAGCAUCUCAG"
find_start_codons(rna_string3)

[4, 24, 30, 75]

In [222]:
fasta2 = """>Rosalind_3050
GCCCCCTCTCCTGCCATGGTCCAATCTACTGAAATGCCAGCCGCATGCGAGGCTGAACGT
TCCCTTACCTTTAACCTCATGGATCCTGTTCACCGGGACAAGACAAATTCTCGTTTTTAG
AGCACTATCCGTCGTTGCGAAAATAACTGCAAATGACCTGGAGTTATGGAACGAATCCTT
CACCTGCTAGGTACTGAGCGGGGAATGCGATCAAGTGTACGGCGTCGAATAAAAACAGTG
ACCCTTCTCCCCGCCTAACAGCCACGGTACCCCACCTTTGAGACGAAGAGTTTTCAGGTT
GAGGGCGCACCGACACCGCACTCATTTTGGACATTTTGCACCCCCCGAAGAGCGGCTTTG
CCAGCATCGCCCGGTCAACTGAGCGAAATATTTCAGCCGGTCTGTTCTGCTTGACACGTA
GACTCATGCCCGGTCGTACTTCGGCGTCAACGAGAGGCTAGCTAGCCTCTCGTTGACGCC
GAAGTACGACCGGGCATCGCCTTCAGTAGCTAGCTGGTCATGCAGATGGGAAACAGTGAC
CAAATGCAACGGCTACCGGGATTGCCTAGGGACTATTGAGTCGGACAGATCAGACGAGAT
AACCCACCCAATTTTGCATCAGGCCGATTACGGGAACTTGCAATCCCTCCCCAGTTTTGG
ATTATCTTCAGGTTCGAAGGGAAAAGGAACCGTGAATGGTAGAAAAAAGTCACTTCTAGA
GAGGCCAAAAGTATAATCTGATTTCACATGCATCTTGGCTACCCCTCGTGAGTAGCCCTA
GGTCTGCGCGGGGGCCGGTATTGCGAACCCATGTGGGTGCACATATGAGGGGGGAAGTAA
GGCTCAGCCATCGATAGCCTCCGGCATCCTTCCGCCACTGCGCTAGTTTCGTTTCCTCCA
GCGGACAAAATGGCGGATGACG"""


#print(fasta2)
def orf_protein(fasta):
    parsed = parse_fasta(fasta)
    parsed_list = list(parsed.values())
    results = []
    for v in parsed_list:
        comp = complement(v)
        reverse_comp = reverse(comp)
        rna1 = transcribe(v)
        
        rna2 = transcribe(reverse_comp)
        
        indices_forward = find_start_codons(rna1)
        
        indices_reverse = find_start_codons(rna2)
        
        length = len(rna1)

        for i in indices_forward:
            found_stop = False
            protein = ""
            for j in range(i, length, 3):
                if len(rna1[j:j+3]) == 3:
                    codon = codon_table[rna1[j:j+3]]
                    if codon == "STOP":
                        found_stop = True
                        break
                    if not codon:
                        break
                    protein += codon
            if found_stop:
                results.append(protein)
        #return results
        for i in indices_reverse:
            found_stop = False
            protein = ""
            for j in range(i, length, 3):
                if len(rna1[j:j+3]) ==3:
                    codon = codon_table[rna2[j:j+3]]
                    if codon == "STOP":
                        found_stop = True
                        break
                    if not codon:
                        break
                    protein += codon
            if found_stop:
                results.append(protein) 
    return set(results)
        #return (" ".join(str(s) for s in tempset))

In [249]:
def orfProtein_test(fasta):
    parsed = parse_fasta(fasta)
    parsed_list = list(parsed.values())
    results = []
    for k, v in parsed.items():
        comp = complement(v)
        reverse_comp = reverse(comp)
        rna1 = transcribe(v)
        
        rna2 = transcribe(reverse_comp)
        
        indices_forward = find_start_codons(rna1)
        
        indices_reverse = find_start_codons(rna2)
        
        length = len(rna1)

        for i in indices_forward:
            found_stop = False
            protein = ""
            for j in range(i, length, 3):
                if len(rna1[j:j+3]) == 3:
                    codon = codon_table[rna1[j:j+3]]
                    if codon == "STOP":
                        found_stop = True
                        break
                    if not codon:
                        break
                    protein += codon
            if found_stop:
                results.append(protein)
        #return results
        for i in indices_reverse:
            found_stop = False
            protein = ""
            for j in range(i, length, 3):
                if len(rna1[j:j+3]) ==3:
                    codon = codon_table[rna2[j:j+3]]
                    if codon == "STOP":
                        found_stop = True
                        break
                    if not codon:
                        break
                    protein += codon
            if found_stop:
                results.append(protein) 
        print ([k, set(results)])
        #return (" ".join(str(s) for s in tempset))

In [254]:
orfProtein(fasta5)
#translate(transcribe(fasta5[15:249]))

set()

In [224]:
set(orfProtein(fasta2)) == set(possible_a + possible_b)

True

In [132]:
fasta5 = """GCCCCCTCTCCTGCCATGGTCCAATCTACTGAAATGCCAGCCGCATGCGAGGCTGAACGT
TCCCTTACCTTTAACCTCATGGATCCTGTTCACCGGGACAAGACAAATTCTCGTTTTTAG
AGCACTATCCGTCGTTGCGAAAATAACTGCAAATGACCTGGAGTTATGGAACGAATCCTT
CACCTGCTAGGTACTGAGCGGGGAATGCGATCAAGTGTACGGCGTCGAATAAAAACAGTG
ACCCTTCTCCCCGCCTAACAGCCACGGTACCCCACCTTTGAGACGAAGAGTTTTCAGGTT
GAGGGCGCACCGACACCGCACTCATTTTGGACATTTTGCACCCCCCGAAGAGCGGCTTTG
CCAGCATCGCCCGGTCAACTGAGCGAAATATTTCAGCCGGTCTGTTCTGCTTGACACGTA
GACTCATGCCCGGTCGTACTTCGGCGTCAACGAGAGGCTAGCTAGCCTCTCGTTGACGCC
GAAGTACGACCGGGCATCGCCTTCAGTAGCTAGCTGGTCATGCAGATGGGAAACAGTGAC
CAAATGCAACGGCTACCGGGATTGCCTAGGGACTATTGAGTCGGACAGATCAGACGAGAT
AACCCACCCAATTTTGCATCAGGCCGATTACGGGAACTTGCAATCCCTCCCCAGTTTTGG
ATTATCTTCAGGTTCGAAGGGAAAAGGAACCGTGAATGGTAGAAAAAAGTCACTTCTAGA
GAGGCCAAAAGTATAATCTGATTTCACATGCATCTTGGCTACCCCTCGTGAGTAGCCCTA
GGTCTGCGCGGGGGCCGGTATTGCGAACCCATGTGGGTGCACATATGAGGGGGGAAGTAA
GGCTCAGCCATCGATAGCCTCCGGCATCCTTCCGCCACTGCGCTAGTTTCGTTTCCTCCA
GCGGACAAAATGGCGGATGACG""".replace("\n", "")

DNA_CODON_TABLE = {
    'TTT': 'F',     'CTT': 'L',     'ATT': 'I',     'GTT': 'V',
    'TTC': 'F',     'CTC': 'L',     'ATC': 'I',     'GTC': 'V',
    'TTA': 'L',     'CTA': 'L',     'ATA': 'I',     'GTA': 'V',
    'TTG': 'L',     'CTG': 'L',     'ATG': 'M',     'GTG': 'V',
    'TCT': 'S',     'CCT': 'P',     'ACT': 'T',     'GCT': 'A',
    'TCC': 'S',     'CCC': 'P',     'ACC': 'T',     'GCC': 'A',
    'TCA': 'S',     'CCA': 'P',     'ACA': 'T',     'GCA': 'A',
    'TCG': 'S',     'CCG': 'P',     'ACG': 'T',     'GCG': 'A',
    'TAT': 'Y',     'CAT': 'H',     'AAT': 'N',     'GAT': 'D',
    'TAC': 'Y',     'CAC': 'H',     'AAC': 'N',     'GAC': 'D',
    'TAA': 'Stop',  'CAA': 'Q',     'AAA': 'K',     'GAA': 'E',
    'TAG': 'Stop',  'CAG': 'Q',     'AAG': 'K',     'GAG': 'E',
    'TGT': 'C',     'CGT': 'R',     'AGT': 'S',     'GGT': 'G',
    'TGC': 'C',     'CGC': 'R',     'AGC': 'S',     'GGC': 'G',
    'TGA': 'Stop',  'CGA': 'R',     'AGA': 'R',     'GGA': 'G',
    'TGG': 'W',     'CGG': 'R',     'AGG': 'R',     'GGG': 'G'
}


def translate_codon(codon):
    protein = None
    if len(codon) == 3 and codon in DNA_CODON_TABLE:
        protein = DNA_CODON_TABLE[codon]
    return protein


def reverse_complement(dna):
    lookup = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
    return ''.join([lookup[c] for c in reversed(dna)])
def possible_protein_strings(s):
    results = []
    indices = []

    l = len(s)
    
    for i in range(l):
        protein = translate_codon(s[i:i+3])
        if protein and protein == 'M':
            indices.append(i)
    
    for i in indices:
        found_stop = False
        protein_string = ''

        for j in range(i, l, 3):
            protein = translate_codon(s[j:j+3])

            if not protein:
                break

            if protein == 'Stop':
                found_stop = True
                break

            protein_string += protein

        if found_stop:
            results.append(protein_string)
    
    return results

possible_a = possible_protein_strings(fasta5)
possible_b = possible_protein_strings(reverse_complement(fasta5))
print ("\n".join(set(possible_a + possible_b)))

MTWSYGTNPSPARY
MRG
MSKMSAVSVRPQPENSSSQRWGTVAVRRGEGSLFLFDAVHLIAFPAQYLAGEGFVP
MRSSVRRRIKTVTLLPA
MVQSTEMPAACEAERSLTFNLMDPVHRDKTNSRF
MGNSDQMQRLPGLPRDY
MWVHI
M
MLAKPLFGGCKMSKMSAVSVRPQPENSSSQRWGTVAVRRGEGSLFLFDAVHLIAFPAQYLAGEGFVP
MPAACEAERSLTFNLMDPVHRDKTNSRF
MCTHMGSQYRPPRRPRATHEG
MQMGNSDQMQRLPGLPRDY
MRGEVRLSHR
MSAVSVRPQPENSSSQRWGTVAVRRGEGSLFLFDAVHLIAFPAQYLAGEGFVP
MPGRTSASTRG
MQNWVGYLV
MAEPYFPPHMCTHMGSQYRPPRRPRATHEG
MQRLPGLPRDY
MHLGYPS
MDPVHRDKTNSRF
MVEKSHF
MPEAIDG
MSLRVKQNRPAEIFRSVDRAMLAKPLFGGCKMSKMSAVSVRPQPENSSSQRWGTVAVRRGEGSLFLFDAVHLIAFPAQYLAGEGFVP
MHVKSDYTFGLSRSDFFLPFTVPFPFEPEDNPKLGRDCKFP
MTS
MGSQYRPPRRPRATHEG
MRLAFQ
MERILHLLGTERGMRSSVRRRIKTVTLLPA


In [228]:
fasta3 = """>Rosalind_3072
ATGCCGTTAGTAGAGACAAAAGGGCTGTCTCCTCGTTCTACTTTGCAGTTGCGATCAGTG
GACGACAAGCAATTGAGCAGGGTGGACCTACTCACATACCCGCTCCAACTGTAAGTGAAA
ACCGCTGCAGGTTTCCAACTGCCTGGCCTAGTTTCAGCAGGGTGCAACCAATGGCGCCGT
CCCACTTTAGTTAGATCTTGCCGTATTGTACTCATTAATGTTTGTCGTATTGGCCGAAAT
CTTGGGCGAACCGTGGCTAGCTCGCCAACAAATTGAGATCCGAGCTGAACGTGCACGGAA
CCAGACGACAACCTGCCGGGGACCGTATAGCCACCACCTCTCGAAATAAAATCCCCTGTA
TCCCTGATCGATATGGTTGGCCTGTGCGCGGGGGAGCACCAAGGATGGGTGCCCATTGCC
CAGCTTTCCCCATGGAAAGCCCTCGTTTGACTGCAGTACGCGCATTGCTCCCGACAGCTT
CCGTCATGTGTCAATGGAGGACCGAGAGCCTAGGGCAGTGCTGTATCCCAATAATCACGT
CCTCCTGAGTGAGTTCGGACTGGGAGTCCCTGCTCCGGTACGGTGGATCCATGTCTACGG
ACTCCACCTTACGGAACCATACGTCCCCGAGTGTGTTGTAGGGACGAGATACTACGGTAC
CCACTCTCGGTGCTTCCGTCTCACCCAATCGTCCAAATTGGACATAATTCTACTGTGGAC
CCGAGAAAAACAAATGTCTGTTCCGCCGTAGGCCAACATCACTGGGACATATTCCATTGG
AATCATTTTGCCCGGGGTGCGCAACTCGTAGACACTGACACGGACCCGCATTGAGATCTC
ATCATCCGCATGCAAGGGTCCGTAACAAGTGTTGGGCAAGTTATGTATCAGAGACTAGAA
ATAGAATATGGTGTGATAGGTTCTACGAGGCCTAATGTAA
>Rosalind_0490
CCGTATAGCCACCACCTCTCGAAATAAAATCCCCTGTA
>Rosalind_6688
GACCCGCATTGAGATCTCATCATCCGCAT
>Rosalind_5855
GAGCACCAAGGATGGGTGCCCATTGCCCAGCTTTCCC
>Rosalind_3365
TAATTCTACTGTGGACCCGAGAAAAACAAATGTCTG
>Rosalind_6838
CCGGGGTGCG
>Rosalind_2061
CTGTATCCCAATAATCACGTCCTCCTGAGTGAGTTCGGACTGGGAGTCC
>Rosalind_9992
TCCCCGAGTGTGTTGTAGGGACGAGATACTACGGTA
>Rosalind_5058
CGAACCGTGGCTAGCT
>Rosalind_7737
TCAGTGGACGACAAGCAATTGAGCAGGGTGGA
>Rosalind_3330
GTACTCATTAAT
>Rosalind_1226
TGCAGGTTTCCAACTGCCTGGCCTAGTTTCAGCAGGGTGCAAC
>Rosalind_7917
ATATGGTGTGATAGGTT
>Rosalind_8016
TTTGACTGCAGTACGCGCATTGCTCCCGACAGCTTCCGTCATGTGTCA
"""

def splice_introns(s):
    parsed_dict = parse_fasta(s)
    dna_seq = ""
    subs = []
    for key, value in parsed_dict.items():
        if len(value) >= len(dna_seq):
            max_key = key
            dna_seq = value.replace("\n", "")
    for key, value in parsed_dict.items():
        if len(value) < len(dna_seq):
            subs.append(parsed_dict[key])     
    for sub in subs:
        dna_seq = dna_seq.replace(sub, "").replace("\n", "").strip()
    return translate(transcribe(dna_seq))

In [229]:
parse_fasta(fasta3)
splice_introns(fasta3)

'MPLVETKGLSPRSTLQLRPTHIPAPTVSENRQWRRPTLVRSCRIVCRIGRNLGRQQIEIRAERARNQTTTCRGSLIDMVGLCAGHGKPSMEDREPRAVLLRYGGSMSTDSTLRNHTPTLGASVSPNRPNWTFRRRPTSLGHIPLESFCQLVDTDTARVRNKCWASYVSETRNRLRGLM'

In [230]:
fasta4 = """ATGCTCAAGATGCGCCAATCTCGCGTTTTTGTGTGCGCTCCAAACTCTTGGCTTTCGCGG
GAGCGTCGGGGTAGCAGACTTTTGTCAACAATGATTTCTGACGCTAAGTAGGACTTACCA
GAGCGCGAAGTACTAAACCTATGCGTGTCCAGAGAACGAACTCTAGTATATCCAAGATGG
ACCAGGGGAATTCAC
GGCCGAGCTCTCGACCAATCAAGGATTAGCCCGTATGGCT
TCAAAAACACTTCAAAATCATCAATCAGCACCAAATCTTGGTCCG
AGCGTGTTATTGGTACTAGAGCTTCATCGTGCGGCGCAGCGATCGTTGTATAAGGGAGAG
GATAATCCTTGACGATCGCCCAGTTGTTAGATGTTTAGGCCGTCGCGTGACACACGACTG
CGGATAGAAGTAGTCGCAGGCTTCAACTATGGCAGATTACTACGTGCGGGAATCGGCCGT
ATTGCCAAGGATA
GAGTGGGGCAGGAGGATAGACGGATAATATCGGGCTGACTCTCAGCTTCGCACACCGATA
TTAGGTGAAAAGGGCCACGTGTTTCGAGAGGGAAACATGGGAGGATTGGAGGCACTCATA
GGCCCGCTTCCGAGTCCGAAGGGGGAGGGATAAGGAAGCCTGCAAACCGAGTGAAAATGG
TGTGGCTACCTCGATATCCGCAACACAA
CTGCATATCAACCGTCATAGGCGGTACTGGATATTT
ATCTTTCGGAGCGATGCCCGGGCAATCCAGCTAGTTCTCCC
TGTCGTTTACTAA"""
fasta4 = fasta4.replace("\n", "")

In [231]:
print(fasta4)

ATGCTCAAGATGCGCCAATCTCGCGTTTTTGTGTGCGCTCCAAACTCTTGGCTTTCGCGGGAGCGTCGGGGTAGCAGACTTTTGTCAACAATGATTTCTGACGCTAAGTAGGACTTACCAGAGCGCGAAGTACTAAACCTATGCGTGTCCAGAGAACGAACTCTAGTATATCCAAGATGGACCAGGGGAATTCACGGCCGAGCTCTCGACCAATCAAGGATTAGCCCGTATGGCTTCAAAAACACTTCAAAATCATCAATCAGCACCAAATCTTGGTCCGAGCGTGTTATTGGTACTAGAGCTTCATCGTGCGGCGCAGCGATCGTTGTATAAGGGAGAGGATAATCCTTGACGATCGCCCAGTTGTTAGATGTTTAGGCCGTCGCGTGACACACGACTGCGGATAGAAGTAGTCGCAGGCTTCAACTATGGCAGATTACTACGTGCGGGAATCGGCCGTATTGCCAAGGATAGAGTGGGGCAGGAGGATAGACGGATAATATCGGGCTGACTCTCAGCTTCGCACACCGATATTAGGTGAAAAGGGCCACGTGTTTCGAGAGGGAAACATGGGAGGATTGGAGGCACTCATAGGCCCGCTTCCGAGTCCGAAGGGGGAGGGATAAGGAAGCCTGCAAACCGAGTGAAAATGGTGTGGCTACCTCGATATCCGCAACACAACTGCATATCAACCGTCATAGGCGGTACTGGATATTTATCTTTCGGAGCGATGCCCGGGCAATCCAGCTAGTTCTCCCTGTCGTTTACTAA


In [None]:
translate(transcribe(fasta4))

In [None]:
s1 = 'CCGAACACACGAGCAAGGCCCTCTGTCGCTTTAGGGCAATCGAACCCCTGAACAGCCATTGTACGCTCCTTCCCTGCATGGTAGAATGGTGATATGACACCCCTTGGTTCTATGTGGGCAAAAGTCACACATCCTTAGCGATATACCCTGCGTCTCAACGTCACCCGAAAACTCCCTCCGTACGGAAACGCAACTCTGGCTGAACACTTAAGGAGTGCTATGGCTTACGCGCCCGGAATCTGGAAAGTAATGACCCGTCTTTTGGTACTTGTTGAACCGGGGGGGCTATTCTAGTATTTTTAACGAGAGCAAGTCTGGTGATGATTATCGACCTGTATTGAGTAGCCATGACTGCAATTCAAATGGGTCAGGAAACACCCTCGCCTTGCTCCAATAGTGCCTTGTTAACGAGAGATTTCTCTAGGGGTAGCTGCCGGGTAGGTGTGCCTCGTGCGTTACTTAGAATCGCAATCGCCGACTGTAATGATACACAGGCGGCCACGGCTTTACTTGTGAGACCATCTGACTAGGACATCAAAGTCCCTATACGTCCGATTATGGTTAGCCCTTAGGATCTATTTCATCTCTTTTTCCGAGCTGAGGATCACAACTTGTCACGGCACTTAACGCTAGAAGACAGGTGTCAACAGCTCTGCCAGTTTTAGCCAAGTGGGAGGCTTTGAAACCTGCTTGATGCCCCACCCACCACGTACGGTCTTAAATACCTTATCCACGGTTGGCTATCTTTACTGCTTGGCAATAGTACGCCTGAATTTTACAACTTCGACTCATCGACACTTCTACACGACCTGCGTACAAATTAGGGTACTAGTTATGTGTACGATCAAGTGAACGGTTAAATTCCGATACCTACTGCACCGCGAGTCGTGTAGTCGTAAAGAGTGGACTTTCCTGGG'
s2 = 'CGGACGAAAGCAGTGAGTCCCGCTGCAACAGTAAAGTGAGCATACGACAGGGCACTCTTATTACGCTCCGTGCCTCAAGGCTAGAGGGATGCTACCGTACCCGACGTTCTTTAGTGGACAAAATTCGCAATCGTTCCACCCACTTTCTTCCATGCTATCCTTCTGCGCACCCGTGCTCGTTACGGAAACGACTCAATCGCCATAAGCATTAAGTCTTCAGTTGCTGGTATAGCCGAAATCACCAATTGCATGCTCTGGTTTTTGCGCAATCGTGTCCGTAGGGGGCGCTTGACCGATTTATAATTACCGCCCAACCAGTGTTGAGTACCACAACCCAAACGCTAGATTCGGCGGAGCTACCAGGGAATTCCTAATCACCCCCCACCGAATCCACCATTATATTTCTTACGAGAGAGTGCGTCTTGCCAAGCGGCACTTAAGCTGCATGCCGTGGGATGTTACCGCTTGTCATGTGCCACTCGGATGTCCGAATTGGCGCATCGAAATTAGGTGACCGGTCAGCCCCCGAGGTCATCATTAAACCCGTGCCTATATTTTCGCGAAGAACTCATAAGCTATTTCCTCAGCGCTATCGAGAAGATCGTTCGAACGTAGAACGGCGCTTTGGGCTGAAAGACCGGGAGACCATGGGGAGTGACGGGTAACCAAGCGGTGAGTTTCTACGCCTGGGCAACGTCCACGCCGGGAATTTGGCCCCTTAACACTGGATACACGGTTCGCAATGTTATCTCCGTTTCATTCCACCTCATGTGTACTACACCTGCTATCTATCATGGCCACGAATGATCCTTCCTGTAACTGAGTGTTAACGGCATTTGTATGGGCAGCTGGATGCGCGACTTCCGGAACGTCCTGAACCCCGAGTTAGTGCCGAGTATAGGGGACAGACAATTGTG'

def hammingdistance(s1, s2):
    distance = 0
    for i in range(0, len(s1)):
        if not s1[i] == s2[i]:
            distance += 1
    return distance

In [None]:
hammingdistance(s1, s2)

In [180]:
sample_fasta = """>Rosalind_0023
AGCCGGGTTTGCGCCTTCATCGGCGTGCGCGTTGAGTCCTTTGCATCGGGACTCAAAGAG
GTCACGTAACTGGACCAAGTACTTGATCGAATTTGTGTAACTCGGCGGGAAAGCTTACTC
TGCCTCCTGATGCCGGGTGAACCTACGCATACAAATGTTTGCACCAACCAAGTGTTTTCT
ATGCGCCTTCACAACGCATAGTCGCGACTAATGCGTATTGCTCGACCAGTTAAAAGGTCT
GGGTCAATGACCTAAAGAATCTGGCGGCCGCGAATACATGCTCGGACCTGTACCCCGGAA
TGCGACTATCTACACTGAGTGGCAGGCTCAAGGAATCTGCGACACGCTTACCGACGAAGA
CCAACCTCCTTAGTATTTCAGTAATCCTTTTTTAGCACGCCTCTGGCACCTAAGCTCGCG
TGATTCGGGTTCCCGCGGTGGGGGCTTTGGTTCCGAGCCAACCGCCCGGACTTAGCACAA
AAACCCTATTACCCTGTCCCAATGAGTTACCCTTGACTCACAAACTTCACAAGTCTCTAC
ACTAGAGGAATAACATCGCACAGAGATTACTTCGGTAATAATCTGCCTGGTGCTTCCTGG
GGGCGGTGGAGCAGTGCAATCCGCAGGTCTTTTGGGAACAGAGTCGATACAACTGTTTGC
CCTGGTGGGCGGGTCCGAGCTCTTCGGTTATAGACTGGGGCAAGACTGACAACAATGCCC
TTCGCACGGTACCAGCTCACCACTGGTATGCATGAGGGCTATGGGTGGCTTCGAACACAC
CAATGGCCGCAGATTAAACTATTGTATGAGTCGTTCGCCTTGATAATATCCTAAATTCTG
CCCGAACGACTCTGCGTCCGGCCGTAAACGCGACTCAACCGCCAAAAAGTGATTCGGATA
TCACGGCTCGGGGTTGTGCATCTGTAAAGTAGTGTCGCCCCACGGATTACGCGAGCAGTG
GCAGCTATCAGTCCATAGGCCGCGCTTCTCACGTGTAATC
>Rosalind_2819
CGCTGCCGTTCTATAGAATGGGTCGATTCATCAAAACCGCTGCATGTCTAATCTGCTCGT
TTTGGTTCCGCACGCAAAACATTACTAGGTCGAAATCTGCACGTGGACATACGTGCCAAT
AGTCATGGCCTCAACCATGTGTGCTTGGTAAGGTACCCTTGGTGAATGCGAGATATGAAT
GCGGCCCCATATCCGGCTTAGCTCTTCACAAGTGTGTCAGTAAGGCGTAATGGGCTTATA
CATGATTAACGACTTGCGTCTCCCTAGGTTGGCCGATAGCAACGGGCTTGAAGATAGGAT
CATGGTGGACTGAAACGAAACACCCCAACTTCTAGGCTACTATTATAGCTGGGGCCGGGT
CCCTTGCGCACTAGCGCGATCCATTGAGCGACTCGGATGGCACTGCATCATGCCCTGAGT
GTTTCAGAGCCTATTCCATTTTGAGACTAGAAAAGAATTTGCGCGTGGTAGACTGCCAGA
CCGAAATGGAGATCTCATGTCCCCTTGGAACTTTTTCTGGCTCTGATCAAAGCTCCGTCG
TCGCGGGAGTGAATAGGTAACGCAAAGGTGGTCAACGAATATTTACTTTACGCTACTACG
CTTCACACGACTTTTTGCGCGTCCGAGAAGATTGAATCGTAGGTGACCGCTCGCCGGTAT
TCGTCCCTGCGACAGATAACATTCGACCAACCTGCTCGAGTAGATATACAAGGTAACGTT
GGTGTGGGCATGGTGTTATATTAAGACGATCTAAAACACAGCATGGCTTCGGGGTCTCGT
GCAACCCAAAGCTATAAGAACCTCTATGGTTGATCGAATTTGTGTAACTCGGCGGGAAAG
CTTACTCTGCCTCCTGATGCCGGGTGAACCTACCTCAAGCTTGCTACCCCTGAAGTACAC
CATATGAGGCCTAAAATACTGGGTCGCTTACTATTTCATGTTTGAACTTTAGTCATCTTA
GAGCATAGTCGGGGACGATGACGGAACAAACCACTCCGGA
"""
#sample_fasta = sample_fasta.replace("\n", "")
#print(sample_fasta)

def find_common_motif(fasta):
    parsed = parse_fasta(fasta)
    parsed_list = list(parsed.values())
    print(parsed_list)
    common_motif = "G"
    index = parsed_list.index(min(parsed_list, key=len))
    print(index)
    shortest = parsed_list[index]

    for i in range(0, len(shortest)):
        n = 0
        present = True
        while present:
            for each in parsed_list:
                if shortest[i:i+n] not in each or n > 1000:
                    present = False
                    break
            if present:
                common_motif = max(shortest[i:i+n], common_motif, key=len)
                
            n += 1
    return common_motif

find_common_motif(sample_fasta)
    

['AGCCGGGTTTGCGCCTTCATCGGCGTGCGCGTTGAGTCCTTTGCATCGGGACTCAAAGAGGTCACGTAACTGGACCAAGTACTTGATCGAATTTGTGTAACTCGGCGGGAAAGCTTACTCTGCCTCCTGATGCCGGGTGAACCTACGCATACAAATGTTTGCACCAACCAAGTGTTTTCTATGCGCCTTCACAACGCATAGTCGCGACTAATGCGTATTGCTCGACCAGTTAAAAGGTCTGGGTCAATGACCTAAAGAATCTGGCGGCCGCGAATACATGCTCGGACCTGTACCCCGGAATGCGACTATCTACACTGAGTGGCAGGCTCAAGGAATCTGCGACACGCTTACCGACGAAGACCAACCTCCTTAGTATTTCAGTAATCCTTTTTTAGCACGCCTCTGGCACCTAAGCTCGCGTGATTCGGGTTCCCGCGGTGGGGGCTTTGGTTCCGAGCCAACCGCCCGGACTTAGCACAAAAACCCTATTACCCTGTCCCAATGAGTTACCCTTGACTCACAAACTTCACAAGTCTCTACACTAGAGGAATAACATCGCACAGAGATTACTTCGGTAATAATCTGCCTGGTGCTTCCTGGGGGCGGTGGAGCAGTGCAATCCGCAGGTCTTTTGGGAACAGAGTCGATACAACTGTTTGCCCTGGTGGGCGGGTCCGAGCTCTTCGGTTATAGACTGGGGCAAGACTGACAACAATGCCCTTCGCACGGTACCAGCTCACCACTGGTATGCATGAGGGCTATGGGTGGCTTCGAACACACCAATGGCCGCAGATTAAACTATTGTATGAGTCGTTCGCCTTGATAATATCCTAAATTCTGCCCGAACGACTCTGCGTCCGGCCGTAAACGCGACTCAACCGCCAAAAAGTGATTCGGATATCACGGCTCGGGGTTGTGCATCTGTAAAGTAGTGTCGCCCCACGGATTACGCGAGCAGTGGCAGCTATCAGTCCATAGGCCGCGCTTCTCACGTGTAA

'TTGATCGAATTTGTGTAACTCGGCGGGAAAGCTTACTCTGCCTCCTGATGCCGGGTGAACCTAC'

In [179]:

s = """>Rosalind_0023
AGCCGGGTTTGCGCCTTCATCGGCGTGCGCGTTGAGTCCTTTGCATCGGGACTCAAAGAG
GTCACGTAACTGGACCAAGTACTTGATCGAATTTGTGTAACTCGGCGGGAAAGCTTACTC
TGCCTCCTGATGCCGGGTGAACCTACGCATACAAATGTTTGCACCAACCAAGTGTTTTCT
ATGCGCCTTCACAACGCATAGTCGCGACTAATGCGTATTGCTCGACCAGTTAAAAGGTCT
GGGTCAATGACCTAAAGAATCTGGCGGCCGCGAATACATGCTCGGACCTGTACCCCGGAA
TGCGACTATCTACACTGAGTGGCAGGCTCAAGGAATCTGCGACACGCTTACCGACGAAGA
CCAACCTCCTTAGTATTTCAGTAATCCTTTTTTAGCACGCCTCTGGCACCTAAGCTCGCG
TGATTCGGGTTCCCGCGGTGGGGGCTTTGGTTCCGAGCCAACCGCCCGGACTTAGCACAA
AAACCCTATTACCCTGTCCCAATGAGTTACCCTTGACTCACAAACTTCACAAGTCTCTAC
ACTAGAGGAATAACATCGCACAGAGATTACTTCGGTAATAATCTGCCTGGTGCTTCCTGG
GGGCGGTGGAGCAGTGCAATCCGCAGGTCTTTTGGGAACAGAGTCGATACAACTGTTTGC
CCTGGTGGGCGGGTCCGAGCTCTTCGGTTATAGACTGGGGCAAGACTGACAACAATGCCC
TTCGCACGGTACCAGCTCACCACTGGTATGCATGAGGGCTATGGGTGGCTTCGAACACAC
CAATGGCCGCAGATTAAACTATTGTATGAGTCGTTCGCCTTGATAATATCCTAAATTCTG
CCCGAACGACTCTGCGTCCGGCCGTAAACGCGACTCAACCGCCAAAAAGTGATTCGGATA
TCACGGCTCGGGGTTGTGCATCTGTAAAGTAGTGTCGCCCCACGGATTACGCGAGCAGTG
GCAGCTATCAGTCCATAGGCCGCGCTTCTCACGTGTAATC
>Rosalind_2819
CGCTGCCGTTCTATAGAATGGGTCGATTCATCAAAACCGCTGCATGTCTAATCTGCTCGT
TTTGGTTCCGCACGCAAAACATTACTAGGTCGAAATCTGCACGTGGACATACGTGCCAAT
AGTCATGGCCTCAACCATGTGTGCTTGGTAAGGTACCCTTGGTGAATGCGAGATATGAAT
GCGGCCCCATATCCGGCTTAGCTCTTCACAAGTGTGTCAGTAAGGCGTAATGGGCTTATA
CATGATTAACGACTTGCGTCTCCCTAGGTTGGCCGATAGCAACGGGCTTGAAGATAGGAT
CATGGTGGACTGAAACGAAACACCCCAACTTCTAGGCTACTATTATAGCTGGGGCCGGGT
CCCTTGCGCACTAGCGCGATCCATTGAGCGACTCGGATGGCACTGCATCATGCCCTGAGT
GTTTCAGAGCCTATTCCATTTTGAGACTAGAAAAGAATTTGCGCGTGGTAGACTGCCAGA
CCGAAATGGAGATCTCATGTCCCCTTGGAACTTTTTCTGGCTCTGATCAAAGCTCCGTCG
TCGCGGGAGTGAATAGGTAACGCAAAGGTGGTCAACGAATATTTACTTTACGCTACTACG
CTTCACACGACTTTTTGCGCGTCCGAGAAGATTGAATCGTAGGTGACCGCTCGCCGGTAT
TCGTCCCTGCGACAGATAACATTCGACCAACCTGCTCGAGTAGATATACAAGGTAACGTT
GGTGTGGGCATGGTGTTATATTAAGACGATCTAAAACACAGCATGGCTTCGGGGTCTCGT
GCAACCCAAAGCTATAAGAACCTCTATGGTTGATCGAATTTGTGTAACTCGGCGGGAAAG
CTTACTCTGCCTCCTGATGCCGGGTGAACCTACCTCAAGCTTGCTACCCCTGAAGTACAC
CATATGAGGCCTAAAATACTGGGTCGCTTACTATTTCATGTTTGAACTTTAGTCATCTTA
GAGCATAGTCGGGGACGATGACGGAACAAACCACTCCGGA
""".split(">")[1:]
for i in range(len(s)):
    s[i] = s[i].replace("\n", '')
    while s[i][0] not in "ACGT":
        s[i] = s[i][1:]
# ^^^^^^^^^^^^^ all of that to format in FAST in array

#Get shortest of DNA strings
index = s.index(min(s, key=len))
print(s)

motif = 'A'
shortest = s[index]

#cycle over the DNA string letters
for i in range(len(shortest)):
    n = 0
    present = True
    while present:
            #cycle inside over all other DNA strings and if it's present in there considered a motif and length gets increased by 1
        for each in s:
            if shortest[i:i+n] not in each or n>1000:
                present = False
                break
        if present:
            motif = max(shortest[i:i+n], motif, key=len)
        n += 1
print(motif)

['AGCCGGGTTTGCGCCTTCATCGGCGTGCGCGTTGAGTCCTTTGCATCGGGACTCAAAGAGGTCACGTAACTGGACCAAGTACTTGATCGAATTTGTGTAACTCGGCGGGAAAGCTTACTCTGCCTCCTGATGCCGGGTGAACCTACGCATACAAATGTTTGCACCAACCAAGTGTTTTCTATGCGCCTTCACAACGCATAGTCGCGACTAATGCGTATTGCTCGACCAGTTAAAAGGTCTGGGTCAATGACCTAAAGAATCTGGCGGCCGCGAATACATGCTCGGACCTGTACCCCGGAATGCGACTATCTACACTGAGTGGCAGGCTCAAGGAATCTGCGACACGCTTACCGACGAAGACCAACCTCCTTAGTATTTCAGTAATCCTTTTTTAGCACGCCTCTGGCACCTAAGCTCGCGTGATTCGGGTTCCCGCGGTGGGGGCTTTGGTTCCGAGCCAACCGCCCGGACTTAGCACAAAAACCCTATTACCCTGTCCCAATGAGTTACCCTTGACTCACAAACTTCACAAGTCTCTACACTAGAGGAATAACATCGCACAGAGATTACTTCGGTAATAATCTGCCTGGTGCTTCCTGGGGGCGGTGGAGCAGTGCAATCCGCAGGTCTTTTGGGAACAGAGTCGATACAACTGTTTGCCCTGGTGGGCGGGTCCGAGCTCTTCGGTTATAGACTGGGGCAAGACTGACAACAATGCCCTTCGCACGGTACCAGCTCACCACTGGTATGCATGAGGGCTATGGGTGGCTTCGAACACACCAATGGCCGCAGATTAAACTATTGTATGAGTCGTTCGCCTTGATAATATCCTAAATTCTGCCCGAACGACTCTGCGTCCGGCCGTAAACGCGACTCAACCGCCAAAAAGTGATTCGGATATCACGGCTCGGGGTTGTGCATCTGTAAAGTAGTGTCGCCCCACGGATTACGCGAGCAGTGGCAGCTATCAGTCCATAGGCCGCGCTTCTCACGTGTAA

In [190]:
fasta6 = """>Rosalind_7543
AGTTTCTGCCGGCTCTCACATTGGTGGCGCGTTACTTCAGGAACGCTGGTTGTGATCAAA
AGCTAAAACCATCAGTCATACCTCTCGGAACTGT
>Rosalind_8012
AGGAATGAGAGTGGGGTTGTGCTGTACGCTTAGAACCTCCTCTGGTAATCAAGACAATTC
AGGTTACTTGAAACACCTTGTAGGTAAACATCGTCC
>Rosalind_4254
GATACAATGCTGCTACGCGCCCCCGCAGGCAACGTCGGTAGCTCGTGAATTCAAGCCCCG
CGCATCCCCAAAGGTTTGACACAACACC
>Rosalind_8629
AGTCTAGAGCAGGATAGGTAAATGCCAACAAAGGGTACACGTTTATGCGTTAATGGTAAA
GTGCCACTCTCACCTCGACGAC
>Rosalind_3137
TGGTCCCAAGTAGCGTTCTCCATTCCTTATCACGGTGGCGAGATAAGGCGACGCGTGTGT
ACTCGTCAATCGATGTGACGGATCTCCCGCTT
>Rosalind_2840
GAATCATTTTACTCGTGAGGACAGAGAGCTCACACTGCTGGCATATCCGACCGGCTGTGG
CGAGTGAAATAAACAGGGACCGGCGTCCAGGAGA
>Rosalind_2432
CCAAAAATAAGGTGACAGCGTTAAACTCGATGTCGCTAATGTAACCCACATCGTCCATTC
CTGTGCGTGGACCGAGACTGTAA
>Rosalind_1961
CTGGTCTAACCCTATTGACAGACCACATCGACAACAAGGCCCACACGCTCGTCCAGTTAC
ACTCATAGGTCAGCCATGCTATGTTGGTCG
>Rosalind_5487
TGACGAATACAGGAACCATCGCATGTAACATGTGCGTGGTGCCGAAAGTGGCAGGGTACT
ATTGATTTGATAGAATACAGTCTGGAGGAAGCGC
>Rosalind_3208
TCGCGAAAGGTTGAAATGGATAGGACGTGGTTCAATTAGGCCTAAAGAGTATAAGAGACA
GAAAGGGAGGGCCGATTAGCACTGTTGACAATGTACTA
>Rosalind_2046
CGTTGTACTTTCTGATTAATGTGTCTCCCGACTCTGGGTTGGCAAAGAAACAGTGTTCTT
GGGCTTTTGCCTTCTACGAAACTT
>Rosalind_7767
TTTGTAAGTTCATAGCGTGTAAGTGAGATAAGTCAGGCGATAAGACTCTTGATGTCTCCT
TTGGCATGACACGACGTAGTTAGTAGAT
>Rosalind_4496
ACATACGTAATTCACAATCATCTGATTTAAGTTAGATAAATATAGGCCCTCGTGCTATTG
GTATGTGACCCGATTGCCTTGCAGTTGTCTGGGAGCGACA
>Rosalind_1958
TAGTATGGTCTCCAGTCGCCGGATCTGGATAATTGAGGTGCATAGTCGTAACAAACTTAC
TATTGGTCAAGTTTATCAGATCTCACGGAGACCCCA
>Rosalind_4894
GCACCCTCCCGCTCAGCTATCATGTTTCGTAAACACATATGATTGGCACGTATACGCGTG
GGAGCGTACGGCTAGATTCGCAGATC
>Rosalind_6957
TTCAGAATGATCAGGGGATCTAACAGAGCTCGCGGGCGTACCTACAGTGGTTTGCTGCCT
TTTGACAATTTGTCAGTTACAA
>Rosalind_8758
TAAAGGTGCGTCTCGTCGTGTTATGCAGGGCTCTTATTTCCATTCCATACGCTGTACTCC
CCCAATACTCTGAGAGGGTGTGTCACTCATTCA
>Rosalind_3530
GATAACGATTAGGACATACCAGGGGCCATGATTGAATGAAATGAGAGACTCGAGGCAAAT
GTACCTCCGGTATCCTGCGCCCTTAAGTCTAATTCCAAGC
>Rosalind_3987
TTCGGGTGTCCTATGTGGGAAGGTGGGCGTAATGGCAGGCGCTAGGGTGGAGGTGTTATG
CCATGTATGTAAGGAAAGTGCGCCCCAATCATACC
>Rosalind_1739
AGCGGGTGCATATTAGAATCCACCCTGATCCGCCTCAGCGACTACTGACGCTGTACGATG
TTGTGTGTAATGTCGAAGAAAACAGAACCAACTTA
>Rosalind_8416
CGACTGTTTAAACTCGGTTCCATTAGCCGTGGCCTTTATTTTAGTGCATCTTAAGTTAAG
CTGAAAGTTTTATGACTTCGACGGACGCA
>Rosalind_2913
GGTTAATAACCGTAACGCTCCGGCTTCAGTGTAGCTAGTGTGCACGCCCTACTCTAAACG
CATGCTATTGAGGTTAGGGACATTCTTCCCCT
>Rosalind_2656
GTCAAGCCAGGCATCTGGAAAGGTGCCACTCTTCACACGTGGCTATGCGGGTTTTAAATC
CCTCTAGCAAACTGTGACTACGG
>Rosalind_5628
AATCAATAACCTTATGATCCCGGATATACTCGAGGGGATGCCGTTAACTCGATTTCTTCC
AGATACATGAGCTACTGCCATGT
>Rosalind_6574
AGTCTTAGCCATCTGAGACTAGTGACTTGCCAGGTCTGACCGCGCGCATAAGCGATATAA
ACACTACCGGGATCTCGCGACGGAT
>Rosalind_7014
ATTATTTAGAAAATGAGCAGGTACAGTGAGATGATTGATGGCAGCCGACACGATAGAAAA
GGTATTATAGGATTTACGGAACCA
>Rosalind_4377
CGCCTTCCTTTAAATCCGAGGTGACATACCAACCACCTGTATACAACGTTGATGAAACGG
TTCCCCTGGTAGGAGGTCCTCTTGGTGGAA
>Rosalind_0178
GTCGAAAATACACTATGGCTCGAAACGGTTCAGCACTGTTTAGCTCTAGGTTAATCTAAT
GCGCATCTGGAATCCGTGAA
>Rosalind_9495
CTCCCACTTTTTACCACGTATGCTAAGATCATTAAACTAACTTCTTATGTCTGTAAATCT
GGAAGTGTAGACTTCTATTAGATGCG
>Rosalind_7226
GTTCCATCCCTGCAGGCAAAATGTTTTTAGACAAGAACTTCGAGTTGAAAGCAGCGTCTT
GTGAAAGTTAAGAATCACACAAC
>Rosalind_7654
TCGGGGAATTACAGATAGTGTCTTCCGCACTTAATCATGATGATCGTATAGGACACTCTC
TCGTTAGCGCCCTCAATGAACACGATCTTGAACTTC
>Rosalind_9688
ATTTACAAAAACAATCTTCCGGCCTGGGATCTAGTTACAGATGTTCAGAAGGACTAGTTG
GCTGCCCGTTGAACATGTTTCGTCGG
>Rosalind_1183
TGGGCCCAAAGACTTATGGGTACGTTAGTACCTACTAGACGGCCTAATTGTACACTCATA
ATTGGCATCTGAAGCAGGTCTATATCCCGTAA
>Rosalind_0877
GGAGTTGCAACGGCATCGTCCACGAAAGAGGGGGGGCCGGTCATCAATACCTCTAAGGTA
CAGATTCATAACCGGTCTATCCGGC
>Rosalind_6536
TTTAAGGAAGTCGATGTGGCGGCTGTTAGCGGGACGCATAGTAACTCTTCGTGCACCACT
GCAGCGCCTCCCTTGGAAGTGTTCCCGTGCTAG
>Rosalind_9708
TTATGTTGCGCTGATTGCTGCCCCTGTGCAGCTAGAACAAAGGGTATGAAGACCAACTGT
TAAAAGAGATACGTCAGTAAATTACAACGGC
>Rosalind_1383
AGCTGTCCTACGCCAGGTCGCGAAACTTGAAGTCTTGACAACGCCTGTGATCAGTGGCAT
ACGACACTCAGTGCCCGGTAGTGGATTT
>Rosalind_3030
ACTACTGATAACGCCAATCCATTCTTTTTAACTAACCGTCAATAGAGTTCATTGGTCCAC
CACTTGGCCTAGTGGGACAGATCCGC
>Rosalind_1949
ACGAGGGGCCCTTACATTTGGGGGGTTGTCAGTTCGCCATATGGGACTCGCCTGAAAAAT
ATAAACGGGTCTGTAGAGATCTTTATC
>Rosalind_0240
TCAATAGCCCTACGACTTTTCTGCACGTCAACCCTATCGGGGACTGGCAGGCCCGTTATT
ACGCGTAGATTGAATCAATGGTTTCTAAACGACCC
>Rosalind_2852
TGGTTTATGGTAATGTAAAGGACGAAGTAACTCTTAGAAGGACATATGAGTACGGCTAAC
ACTTACTATCACCTTCCTACAC
>Rosalind_4869
CGTGCAGCGCAAGATATGACTGTGAATATAAACGTCTCGGTAACCTGCATCTAGCGGGTC
TCAGACCGCCATATTATAGCCGTCAGT
>Rosalind_8598
TAGAGCCCAATTCCTTCCACTTAAAGTCCGCGGATGTGTCTGTACGGCGGATGGAAATGT
CGAGGATAGCTTTACTGATG
>Rosalind_2528
ACTGTCGAAGACATACAGGCTTAGTGAGCTGCGACAATGTTCGTAACGCAAGATGATCGG
GTGGTTCAACGATTCAGCCACCT
>Rosalind_2022
GGTCCTCGGATTCTTGTTCTAACGCGGAAGGGATCTGTAGTAGACGATTTCGGTTGTATC
GCCGCCTCAGGATATATCGTTCATATCGACATAC
>Rosalind_7036
ATTTACTGGTGCGTTAATGGTCAAGTCCTAAAAAGACAATTTCGCCAACTCGCAGAACCA
TGTCCGGTAGACGTGCTGGTAGTAGTCGAATAGTAG
>Rosalind_8849
CGTCACTAATAAGGGTCTCGACCGTCCCTGTAACCACATGCCATCCCACCCGGTGTTGTA
CGTTGTAACAGGAGCGCTTAGAGGAGAGGAGCCGGC
>Rosalind_5548
CCAGACGCCTCAGCCGTATCGGTTCACCGCGTGGGGGCATAAGACTATTAGCGTAGGCAT
CAACGCACCCCTTCACTTGATTCAGAAGCGGTACGCGT
>Rosalind_3518
CAGCAGAAAGATAAGCCGGGTGATGCAGACAAAGCGCCGACATTCGAGAGGCTGCGCCCG
TGCTAGGGAGAAGCGTCCTATCCGCTGTGGAATTACAG
>Rosalind_0475
CAGTTGAAACGGTGCCTGTTTAGGTTGCGCGGCTTCATAACGTATTTATGCGCCTTCACC
AAATTCTTGGCTGGAACTGA
>Rosalind_4585
TAGGCTTTGGGTTCAGGTGTATTGGCCTGTCAAACCAAGGGCCGGTGGATATCACTGTCC
ATCAACAATTCTGAGTTTAGGGAGTGGCC
>Rosalind_5126
TGGTGCCGGATTACCGCGGCATCGCTGTTTCGGCTTCATAACGTTGATAATATGCGATCA
ATGCTGAAACCAGCCCCGTGGGTCACT
>Rosalind_7804
CAGCCAGCAATTCTGGGTTGGATGGTATATAGCTGAATCATGACGGATTATGCCTAGCGC
CTGGAACACTGTTAGGCCGAGACACTCCATA
>Rosalind_9805
AAACTACTCCGGGCGTCGTCATTCGCTGATCTTATGCTACTATAGCTTATAATGCGGCAA
CAATCGTGGAATCTGCCAACAGTAAATAGATTC
>Rosalind_5472
CCTATTGAAAAACGACGGCATACGCTTCTGGGAGTTTAGTCTCTCCTTGTTTCTTAAATC
TTTAGCGCCGAGGCAATAAGCCGTATGCTC
>Rosalind_3862
CACGCTGGTATTTGTTCGTCTGGGGGGCATTCCCGTATCTTATACCGACTAACCTTTCGT
GTGTAGTTAGTGTATCCGTCGTCCGATCAGGCAACG
>Rosalind_2262
CCTTCCGACCTAATAACACTTTGTGAACGAAGTGCGGGTGGGCGCACCCAAGTACGCCTT
CGATCGTTTCACCGGCGTCTCATATGTC
>Rosalind_7261
ATAGCCCTGGGAGCAACTCGGCGTAACCAAAGGCTCGTACTGACTTCGCATGTTGCATTC
ACTGGCGGCCTACATTCGGCCAATCCTACTCAGGATTGC
>Rosalind_2345
ATGCCTTGACCCGTAAAGGCGCACAGACCTGTCCTCCTGCTTGGACTCTTGACCAATGAA
ACCGTGTAGGTTGAACGCCG
>Rosalind_2849
TCCTGATCAGGCGGTACTACTGTTCAACTTTCATGTCCTGCCACCACATACTTAATGGTG
TCAAAGATCAGGGGAAGAGCA
>Rosalind_3092
GGTCGGTCATAGGCGGCTGAACTGAGATTATCAGGCGACTATCGCCACGAAAATAGAGCA
AGCTCCTCGGCCGCGGCACGCTGAATCCGGCG
>Rosalind_4268
CCGCAGAAAAGGCCGGTCACTTCTGAGAGACAGGTGACGGTATCCGACTAAAAGATCAAG
TCGGATTTACAAATTGCGACAAAGGGTGACT
>Rosalind_3231
ACCAAGGCAATCACGACATGAAGCCATCTGCTATGTATTGTAACCTTCTATGCCTGTCTC
ATCGACCATACCACCGGTAGACGGACGTACG
>Rosalind_0076
CGTAGACTCTAAGAGACGAGGTATTGAACGGGGGGCCTTTGGAGGCGTTGCCCCACATAC
CCTGACCATTTGGCAATGCTC
>Rosalind_4624
GAAGTTAAGGAAGGGGTTCTTCAAGGACGGTAGCCTGAGACTTTCGACAGAGTACACGTT
CGCCCTCCGACCAAGTAACTTT
>Rosalind_2782
TGACTAAATATACTCAATACTATTCACCTTGGACACCGGCTAACAGCAGGGGCCGGTTGT
CGGCACGCTCTTGCGCCGCGCTCCCCTAGTGCG
>Rosalind_6335
TATCAAAACCACATAAATAACCGGATGGTTCTATTATTTTCCCTACGCGTTTCACATGAG
CCTAACCCACAAAGTCTGGATCTCACGAGTTGAGT
>Rosalind_6502
TATCTTAGTATACCCCATCTCCATAGCCTCCCCGACGTATACTATTTCGTGCACCTTCCA
CGAAGGAGGTTTGATACGTAT
>Rosalind_0832
TACTAGATGGCGAGGTCCACCCTCGCCTTTGGTAATTGCATATAGTTACCCGTTAGAAAT
CCGCGTAGGCAACCACGAGTACCGCCAAG
>Rosalind_7594
ATGCGCACATAGTTTCTAAATTGAGCGATAATGCATACAGAGTACATTTGTATCCCAACG
GCGCAGCGGAAAAGTGCTTTAGAAAGGTAGTGTTTA
>Rosalind_1870
TTGCAGTGTATGCATCTACATAGCTGTTCGATAAGTTGTCCTGCGGCTGCGCAGACCGCT
GAATGCGCCAGTAATGATGTTGCGATGCGGATAACTTTTC
>Rosalind_9082
CTGATCGACCGTCCACTATGTGCACACACGCGCTCGGAGCCGCGTAAGTACACCTTCGAC
GCAATTGAGATTCGACAGATGCCGATGGATTGGCGG
>Rosalind_7634
TGAAAGGTCCCATGTCCGCACCCATGCCTTGTGACAAGTTAAATATTAGAAAGATCTTAG
GATGAAACGCTTCGGAAAGCCGGACCTTTAGGTCGGTG
>Rosalind_0738
CGATCTCGGTACACGGAGCAAGTGTTGTAGCGATTTTATCTTTACACGTGCCATTGAGTT
TGTGGTAAGTGGAGAGAACCACCCCCTT
>Rosalind_5416
CATGTTAAGCACAATGACCTACTCTGCGCCTCTATATAGGCGCTAGGCATGACCCTTCCC
AGTATAATCACAATTAGTCTTCA
>Rosalind_4266
AGCCTCCGGCTCCAATTGCTTATTACCTGTAAAGAGTCATCGCCGATAGTAAAGATGGCT
TTGGGCCGGATTGGAAAGTTTCGTTTAC
>Rosalind_7274
GTTTTCTCAGGTCTGGCCTGCACACCGAATATAAGGATTTCTTCATGGAGTCGGCTCGTA
CATATGTTGCCCGACAGCATTTAGTACC
>Rosalind_5146
TAAATCCAACTAAGAGGGTAACATGCGGCATGGATACTATAGAAAGTAACGGGTCTCATC
GGACAGAAGCGCACGTCATAGGT
>Rosalind_2941
CACTTTGAGTGGGCGCCATCAGCCGCAACCTTAACCGTAAGTGTTGCGCGATCGGAGAAC
ACCTTTCCCGAGCACACGGCTCATAG
>Rosalind_2025
GCCACCTTTGACCAGATGAGCAGTAAGACTACCCCCTGGACACCGACAGCGTTGGCCCAT
CGCCCATTAAGGCTGGGGGAGCGA
>Rosalind_6422
GACGTGTTGTAGAACAAGGGCTAGTACATACATCATCTGCCTAAGGACCAACTAAGTCGT
CTATAGCAATCCAGTCTGGATTGCTGCGCAAA
>Rosalind_4431
TTTCAAGGCTCTTCCTAGACAAGACAGTCCACCATGGAATGATAAGGACCAGCGCTATTT
TATGATCTAGTTACAGGGTTGC
>Rosalind_7045
AGCACAGTATGGGGCGTCGAAATCTTACCTGAACAATTATTTTTGAGTCGGGACCCACCG
TTTGGGTTCCGCCGTAGGGGAACATA
>Rosalind_5001
GAGCAGCCAAGATCCTCGCGCTTGCTGGGTTCTCGCTTGCAACTGTCGATTCCTTGGGCT
ATCTCGGTACCACAAATCGTTAAC
>Rosalind_4076
CTTCTTATCAAACGATTCGCTAGGTACTTACGCCGGCTCGGGGGGTGAAAGATGTTTACT
ATCCTCTCGGGCATATTGTTGCTCTAA
>Rosalind_7100
AATATTGTAGGCACATTCGCAATGTGGATTGCTGTATGTTGCTGGCATCAGAATACAGAG
TTTGGCCACATGCCTAGCGCCGTACCTTTGCGGTCGTCA
>Rosalind_6708
TAGGGCGATCGGATTCTCCGACGAGCTGCGACGAGCGATTTTGCTGGACCGTCGCCCCCG
ATAACAAATCGGCGCGGAGGGGTTAAGACTTTTTCT
>Rosalind_1194
TCAGAGGACTTCTTATTTAAAGTCCTGTCGTTATAACCTTAACACACTCGCAAACCGGAA
CACGATAAACGACGCGAGCAGATCCGGGCCCAT
>Rosalind_6409
AGTAGGTGTACTTGAATGGTGCCCGTGACTAGCTGTCTCCATTATGTCATCGGCTGCTGA
AGTACTACCGAATCTACCGCGTGT
>Rosalind_4489
ACTAGCTCAATGGACCGGGGACAAGAAGTATTCGCTTCGCATTCTGAAGGCTTCCCGCTG
CAAGCATCCACAATAATTTGGTAGTGAACGA
>Rosalind_5989
TCAGGATGTAGTATACAGCGACCCAGTCGTCACGATCGACCCTTTCAAGGATCTGATCTA
TTAAGTGAGTATTCCGTCCAAA
>Rosalind_4712
GACAGACCCTCTGCACCATTGGTGACTGAGTAAACGCAAGCCGCGCACTGAGATGATACG
GTGTACCCCAGGGCCTTAGTAACG
>Rosalind_2436
GCCTTAACAATTTATTTTAGGAGAGACAGCTAATCGATTTTAACAGGACCACGTCTAATC
AGTTTCCTAGCATGCATGGCAGCTT
>Rosalind_0099
CCAAATCATAACATGTTACCTGACGTACGTTCTCGGCATTAGAGGGCCCCGAAGATGACG
TCGTCACACCCTACCCTCCTACAGATATT
>Rosalind_9647
ACTCGAACTAGGCAGGTGATAATGTGTACTTGATGACCAAGGGAAAAGCCAGTTCTCCAC
GGCCATGGATTATTACGATTACGCC
>Rosalind_4170
TGCGCAGACAAGTTCTGCCACTTTAGCGGCATCCTAGTCCTGTCGTCGGAAACTGCTTAA
CGCTTGAAGTAAAGCAATCGGGCACAGAACATAATC
>Rosalind_1556
GCCGCTAACTTGTTACCCATCTGACAGCTGGCGAAGTAGCATATTTTCCTTCACCGGTAA
GGGTGTCTTAGAGCGAGTCGGGCACGTGAATGCGGT
>Rosalind_5676
TTTAGGGGAGGACGTGCTAGCACAGCTACTGGACCCCGTTTCGATCCGGTTGTCTACTGT
GGAATATTCATTTATGGGATGCCTCGGGCCACGCCACGCT
>Rosalind_9899
AATTTAACACGGGGCGAGCAGGGTTCGACTAGGTCGAGTGATTTGCATCTTCAGCCTTCC
CACGACTTGTAATTGTGTGGGAATCCAATGAAGTG
>Rosalind_2494
TGACTCTCATACTTTATACACGATGCGAGGACGAGCCGCGTATTAGTGGCTCCCCCGTAT
CCCGAGCAGCGCTGCACCCCGCTCTGGGGTGTG
"""
def overlap_graphs(fasta, n = 3):
    parsed = parse_fasta(fasta)
    results = []
    parsed_list = [(k, parsed[k]) for k in parsed]
   
    #parsed_list = parsed_list = list(parsed.values())
    for k1, v1 in parsed_list:
        for k2, v2 in parsed_list:  
            if k1 != k2 and v1.endswith(v2[:n]):
                results.append((k1, k2))
    return results

for edge in overlap_graphs(fasta6, n = 5):
        print(edge[0], edge[1])


Rosalind_7543 Rosalind_2528
Rosalind_1961 Rosalind_3092
Rosalind_4268 Rosalind_2782
Rosalind_4268 Rosalind_2494
Rosalind_2849 Rosalind_5001
Rosalind_3208 Rosalind_0832
Rosalind_5989 Rosalind_2432
Rosalind_5989 Rosalind_0099
Rosalind_7100 Rosalind_8849
Rosalind_5628 Rosalind_5416
Rosalind_7045 Rosalind_4496
Rosalind_7036 Rosalind_6409
Rosalind_9495 Rosalind_7594


In [None]:
class sequence(string):
    def __init__(self):
        

In [251]:
for k in codon_table.keys():
    k = k.replace("U", "T")
    

In [252]:
codon_table

{'AAA': 'K',
 'AAC': 'N',
 'AAG': 'K',
 'AAU': 'N',
 'ACA': 'T',
 'ACC': 'T',
 'ACG': 'T',
 'ACU': 'T',
 'AGA': 'R',
 'AGC': 'S',
 'AGG': 'R',
 'AGU': 'S',
 'AUA': 'I',
 'AUC': 'I',
 'AUG': 'M',
 'AUU': 'I',
 'CAA': 'Q',
 'CAC': 'H',
 'CAG': 'Q',
 'CAU': 'H',
 'CCA': 'P',
 'CCC': 'P',
 'CCG': 'P',
 'CCU': 'P',
 'CGA': 'R',
 'CGC': 'R',
 'CGG': 'R',
 'CGU': 'R',
 'CUA': 'L',
 'CUC': 'L',
 'CUG': 'L',
 'CUU': 'L',
 'GAA': 'E',
 'GAC': 'D',
 'GAG': 'E',
 'GAU': 'D',
 'GCA': 'A',
 'GCC': 'A',
 'GCG': 'A',
 'GCU': 'A',
 'GGA': 'G',
 'GGC': 'G',
 'GGG': 'G',
 'GGU': 'G',
 'GUA': 'V',
 'GUC': 'V',
 'GUG': 'V',
 'GUU': 'V',
 'UAA': 'STOP',
 'UAC': 'Y',
 'UAG': 'STOP',
 'UAU': 'Y',
 'UCA': 'S',
 'UCC': 'S',
 'UCG': 'S',
 'UCU': 'S',
 'UGA': 'STOP',
 'UGC': 'C',
 'UGG': 'W',
 'UGU': 'C',
 'UUA': 'L',
 'UUC': 'F',
 'UUG': 'L',
 'UUU': 'F'}