In [83]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [85]:
# function for finding reverse complement

def reverse_complement(sequence):
    """Return the reverse complement of a DNA sequence.

    Each base is swapped for its pairing partner (A<->T, C<->G) and the
    result is then read back to front.

    Parameters
    ----------
    sequence : iterable of single-character strings (typically a str)
        Upper-case DNA bases drawn from "A", "C", "G", "T".

    Returns
    -------
    str
        The reverse-complemented sequence.

    Raises
    ------
    KeyError
        If the sequence contains anything other than A, C, G or T.
    """
    pairing = {"A": "T", "T": "A", "C": "G", "G": "C"}

    # Complement in reading order first, then flip the collected bases.
    complemented = []
    for base in sequence:
        complemented.append(pairing[base])
    complemented.reverse()

    return "".join(complemented)

In [86]:
# function to transcribe DNA to RNA

def transcribe(dna_sequence):
    """Convert a DNA sequence to RNA by replacing every T with U.

    Parameters
    ----------
    dna_sequence : iterable of single-character strings (typically a str)
        Upper-case DNA bases drawn from "A", "C", "G", "T".

    Returns
    -------
    str
        The RNA sequence, the same length as the input.

    Raises
    ------
    KeyError
        If the sequence contains anything other than A, C, G or T.
    """
    dna_to_rna = {"A": "A", "C": "C", "G": "G", "T": "U"}
    # Dictionary lookup (rather than str.replace) keeps the strict check
    # that every character is a recognised base.
    rna_bases = map(dna_to_rna.__getitem__, dna_sequence)
    return "".join(rna_bases)

In [87]:
# function for translating sequence

def translate(rna_sequence, codons, a = 0):
    """Translate an RNA string into an amino-acid string.

    Reading starts at offset ``a`` and proceeds in non-overlapping triplets;
    trailing bases that do not form a full triplet are ignored.

    Parameters
    ----------
    rna_sequence : str
        RNA sequence to translate.
    codons : mapping of str to str
        Maps each three-base codon to the string appended for it.
    a : int, optional
        Index of the first base to read (the reading-frame offset).

    Returns
    -------
    str
        Concatenation of the mapped values, one per complete codon.

    Raises
    ------
    KeyError
        If a triplet is not a key of ``codons``.
    """
    # A codon starting at index s needs s + 2 to be a valid index, hence the
    # exclusive upper bound of len - 2 on the start positions.
    stop_before = len(rna_sequence) - 2
    residues = [
        codons[rna_sequence[start:start + 3]]
        for start in range(a, stop_before, 3)
    ]
    return "".join(residues)

In [88]:
# Import the RNA codon table

# Build the codon -> amino-acid lookup from a whitespace-separated text file
# with one codon per line.
codons = {}
with open("RNA_codon_table_1.txt", "r") as myfile:
    for line in myfile:
        # split() with no argument tolerates tabs, repeated spaces and the
        # trailing newline, so the amino-acid field is never an empty string.
        code = line.split()
        if not code:
            # Blank lines carry no codon; skip them rather than storing an
            # empty-string key in the table.
            continue
        # A line holding only a codon (presumably a stop codon - confirm
        # against the table file) is recorded as a single space so that
        # translate() still emits exactly one character per codon.
        codons[code[0]] = code[1] if len(code) > 1 else " "

In [110]:
def peptide_encoding(dna, peptide, codons):
    """Find every substring of ``dna`` that encodes ``peptide`` on either strand.

    The forward strand and its reverse complement are each searched with
    ``find_peptide``. Matches found on the reverse complement are
    reverse-complemented again so that every returned string is a substring
    of the original ``dna``.

    Parameters
    ----------
    dna : str
        DNA sequence to search.
    peptide : str
        Amino-acid string to look for.
    codons : mapping of str to str
        RNA codon table, passed through to ``find_peptide``.

    Returns
    -------
    list of str
        Forward-strand matches first, followed by reverse-strand matches.
    """
    opposite_strand = reverse_complement(dna)
    forward_hits = find_peptide(dna, peptide, codons)
    reverse_hits = find_peptide(opposite_strand, peptide, codons)

    # Map reverse-strand hits back onto the coordinates of the input strand.
    return forward_hits + [reverse_complement(hit) for hit in reverse_hits]
    
    
def find_peptide(dna, peptide, codons):
    """Return the substrings of ``dna`` that translate to ``peptide``.

    Only the given strand is examined; all three reading frames are
    translated and every occurrence of ``peptide`` (including overlapping
    ones) is mapped back to its DNA coordinates.

    Parameters
    ----------
    dna : str
        DNA sequence to search.
    peptide : str
        Amino-acid string to look for.
    codons : mapping of str to str
        RNA codon table used by ``translate``.

    Returns
    -------
    list of str
        Matching DNA substrings, grouped by reading frame (0, 1, 2) and in
        left-to-right order within each frame.
    """
    rna = transcribe(dna)
    span = 3 * len(peptide)  # number of DNA bases that encode the peptide
    hits = []

    for frame in range(3):
        protein = translate(rna, codons, a=frame)
        pos = protein.find(peptide)
        while pos != -1:
            # Residue index -> DNA index: three bases per residue, shifted
            # by the reading-frame offset.
            start = 3 * pos + frame
            hits.append(dna[start:start + span])
            # Resume one residue further on so overlapping matches are kept.
            pos = protein.find(peptide, pos + 1)

    return hits
        

In [111]:
# Sanity check on the small sample input - it works!
# The cell's displayed value is the list of encoding substrings.

test_peptide = "MA"
test_dna = "ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA"

peptide_encoding(dna=test_dna, peptide=test_peptide, codons=codons)

['ATGGCC', 'ATGGCC', 'GGCCAT']

In [117]:
# run extra data set - yes it works!
# Results are written to a text file, one encoding substring per line.

extra_pep = "KEVFEPHYY"

# Strip line endings while reading: a newline left inside the sequence is not
# a valid base and raises KeyError in transcribe() / reverse_complement().
with open("dna-for-peptide-encoding.txt", "r") as myfile:
    extra_dna = "".join(line.rstrip() for line in myfile)

encode_strings = peptide_encoding(extra_dna, extra_pep, codons)

# The context manager closes the output file even if the write fails.
with open("peptide-encode-test-results.txt", "w") as f:
    f.write("\n".join(encode_strings))

In [118]:
# run the problem random dataset
# Results are written to a text file, one encoding substring per line.

random_pep = "YHMKPWKWKG"

# Strip line endings while reading: a newline left inside the sequence is not
# a valid base and raises KeyError in transcribe() / reverse_complement().
with open("dataset_96_7.txt", "r") as myfile:
    random_dna = "".join(line.rstrip() for line in myfile)

encode_strings = peptide_encoding(random_dna, random_pep, codons)

# The context manager closes the output file even if the write fails.
with open("peptide-encode-probem-results.txt", "w") as f:
    f.write("\n".join(encode_strings))

In [120]:
# Bacillus brevis genome problem - genome does not contain peptide!!
# Results (if any) are written to a text file, one substring per line.

tyro = "VKLFPWFNQY"

# Join the stripped lines in one pass instead of growing the string with
# repeated concatenation, which re-copies the accumulated genome each time.
with open("bacillus-brevis-genome.txt", "r") as myfile:
    bac_genome = "".join(line.rstrip() for line in myfile)

encode_strings = peptide_encoding(bac_genome, tyro, codons)

# The context manager closes the output file even if the write fails.
with open("bac-genome-encode-results.txt", "w") as f:
    f.write("\n".join(encode_strings))