# File handling – The Hidden Message

- The file "genome.fa" contains a 1 million bp piece of a bacterial genome

- Find all open reading frames >= 450 nucleotides / 150 AA
    - Remember an ORF can also be on the complementary strand!
    - An ORF starts with "ATG"
    - An ORF stops with "TAA", "TAG" or "TGA"

- Translate each ORF into a single-letter amino acid sequence
    - ATG --> M

- Sort the ORFs on length (large to small)

- From each ORF, in sorted order, take the 25th AA

- What is the hidden message?

In [1]:
# Build the codon -> single-letter amino acid lookup table.
# Codons are enumerated with the first base varying slowest and the third
# base varying fastest, each position running through T, C, A, G, which is
# the order the amino acid string below is written in ("*" marks a stop).
bases = list("TCAG")

codons = []
for first in bases:
    for second in bases:
        for third in bases:
            codons.append(first + second + third)

amino_acids = (
    "FFLLSSSSYY**CC*W"
    "LLLLPPPPHHQQRRRR"
    "IIIMTTTTNNKKSSRR"
    "VVVVAAAADDEEGGGG"
)

# Pair the i-th codon with the i-th amino acid letter.
codon_table = {codon: amino_acid for codon, amino_acid in zip(codons, amino_acids)}

In [2]:
# Load the genome, build its reverse complement and glue both strands together.
#
# Results left in the notebook namespace:
#   code    -- forward strand as one string (FASTA header lines dropped)
#   rvcomp  -- reverse complement of the whole forward strand
#   merged  -- code followed by rvcomp; this is what the later cells search
import re  # not needed in this cell; kept so cells relying on `re` still work

try:
    from string import maketrans  # Python 2
except ImportError:
    maketrans = str.maketrans  # Python 3

fileb = "CGGAGATAGACGGAGATATGA"  # short sequence for manual experiments

# Read the file exactly once.  A file object can only be iterated a single
# time, so a second loop over the same handle would see no lines at all and
# leave the reverse complement empty.
with open("genome.fa", "r") as genome_file:
    code = "".join(
        line.rstrip() for line in genome_file if not line.startswith(">")
    )

# Complement every base (both cases; other characters stay untouched) and
# reverse the sequence as a whole.  Reversing line by line would keep the
# lines themselves in forward order and not give a true reverse complement.
complement = maketrans("acgtACGT", "tgcaTGCA")
rvcomp = code.translate(complement)[::-1]

# NOTE: the two strands are simply concatenated, so a pattern search over
# `merged` can match across the junction between them.
merged = code + rvcomp

# Open the genome file, read the first line, and concatenate the sequence
# Make the reverse complement of the sequence and append it at the end

In [7]:
# Explore where start and stop codons occur in `merged` (any reading frame).
import re

# Small sample sequence for manual testing; it is not used below.
line = 'TAGGATATTAAATGGGACGAGAGACGATAGACCGAGATTAAGAACAGAGT'

# All start codon occurrences.
find_start = re.findall("ATG", merged)
print(find_start)

# Position of the first start codon; re.search returns None without a match,
# so guard before asking for the position.
first_start = re.search("ATG", merged)
if first_start is not None:
    print(first_start.start())
else:
    print("no start codon found")

# All stop codon occurrences.
find_stop = re.findall("TAA|TAG|TGA", merged)
print(find_stop)

# Position of the first stop codon, stored under its own name so the
# start-codon results above are not overwritten.
first_stop = re.search("TAA|TAG|TGA", merged)
if first_stop is not None:
    print(first_stop.start())
else:
    print("no stop codon found")


# Find all start codons
# Find all stop codons

['ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG', 'ATG'

In [None]:
import re  # not needed in this cell; kept so cells relying on `re` still work

STOP_CODONS = ("TAA", "TAG", "TGA")


def find_orfs(sequence, min_length=450):
    """Return every open reading frame of at least `min_length` nucleotides.

    An ORF runs from an "ATG" to the first stop codon in the same reading
    frame.  Its length is counted from the A of the start codon up to and
    including the last base of the stop codon, and the returned strings
    include both codons.  Every "ATG" is treated as a start, so several ORFs
    may share one stop codon.  A start without a downstream in-frame stop
    yields nothing.

    The result is ordered by start position in `sequence`.  Each of the three
    reading frames is scanned once, so the run time is linear in the length
    of `sequence`.
    """
    spans = []  # (start, end) index pairs, `end` exclusive
    for frame in range(3):
        pending_starts = []  # ATGs seen since the previous stop in this frame
        for pos in range(frame, len(sequence) - 2, 3):
            codon = sequence[pos:pos + 3]
            if codon == "ATG":
                pending_starts.append(pos)
            elif codon in STOP_CODONS:
                end = pos + 3
                for start in pending_starts:
                    if end - start >= min_length:
                        spans.append((start, end))
                # This stop terminates every ORF opened so far in this frame.
                pending_starts = []
    spans.sort()
    return [sequence[start:end] for start, end in spans]


# NOTE(review): `merged` is the forward strand directly followed by the
# reverse strand, so an ORF crossing that junction would be an artefact.
startstop = find_orfs(merged)

print(len(startstop))

# Find the first stop codon in frame after every start and check if length >= 450

In [None]:

# Replace the start codon of every ORF by its amino acid letter "M".
# `startstop` is a list, so each entry is handled on its own (re.sub only
# accepts a single string), and only the leading "ATG" is touched: an "ATG"
# further down may be out of frame and must not be rewritten.
sub = [
    ("M" + orf[3:]) if orf.startswith("ATG") else orf
    for orf in startstop
]

In [None]:
import dna_tools

# Longest ORF first, as the exercise asks for ("large to small").
sub.sort(key=len, reverse=True)

# The 25th codon of an ORF covers nucleotides 73-75, i.e. the 0-based slice
# 72:75.  This assumes every entry of `sub` has its leading "ATG" collapsed
# into a single "M", which moves everything behind it two positions to the
# left -> slice 70:73.
CODON_25 = slice(70, 73)

trans = []

for orf in sub:
    codon = orf[CODON_25]
    if len(codon) < 3:
        # Entry too short to contain a complete 25th codon.
        continue
    # NOTE(review): presumably translate_dna returns the amino acid letter
    # for the given nucleotides -- confirm against dna_tools.
    trans += dna_tools.translate_dna(codon)

print(trans)

# Get all lengths of the ORFs to sort on later
# On the sorted ORFs translate the 25th AA, 25th AA = pos 75:77, minus 2 because ATG is now 1M instead of 3nt