In [8]:
from Bio.Seq import Seq
from Bio import SeqIO
import random as r
import matplotlib.pyplot as plt


In [9]:
# Read the gene sequence from a FASTA file into the string "sequence".
# - Header lines (those starting with '>') carry no sequence data and are skipped.
# - Every other line has ALL of its whitespace removed (spaces, tabs, newlines),
#   so blank lines contribute nothing instead of raising an IndexError and no
#   part of a line is lost if it happens to contain internal spaces.
# - The "with" block guarantees the file is closed, even if reading fails.
with open('HBB_FASTA.txt') as f:
    sequence = ''.join(
        ''.join(line.split())  # split() with no argument splits on any whitespace
        for line in f
        if not line.startswith('>')
    )

In [10]:
# Transcription
def transcription(mutated_sequence):
    """Transcribe a DNA coding-strand sequence into its mRNA sequence.

    The sequence is first trimmed so that it begins at the first occurrence of
    'ATG' (the coding-strand form of the mRNA start codon 'AUG'); everything
    before it is discarded. The trimmed coding strand is then complemented to
    obtain the template strand, and the template strand is complemented again
    with RNA bases to obtain the mRNA.

    Parameters
    ----------
    mutated_sequence : str
        DNA coding-strand sequence made of the upper-case letters A, T, G, C.
        Any other character is silently dropped, which matches the behaviour
        of the original if/elif chain (it had no fallback branch).

    Returns
    -------
    str
        The mRNA sequence (letters A, U, G, C) starting with 'AUG', or an
        empty string when the input contains no 'ATG' at all.
    """
    start = mutated_sequence.find('ATG')
    if start == -1:
        # str.find() returns -1 when 'ATG' is absent. Slicing with [-1:] would
        # keep the last nucleotide and yield a bogus one-base mRNA, so return
        # an empty transcript instead: without a start codon nothing is made.
        return ''
    coding_sequence = mutated_sequence[start:]

    # Transcription reads the template strand, which is the DNA complement of
    # the coding strand.
    dna_complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    template_sequence = ''.join(
        dna_complement.get(base, '') for base in coding_sequence
    )

    # The mRNA is the RNA complement of the template strand (A pairs with U).
    # template_sequence contains only A/T/G/C here, so direct indexing is safe.
    rna_complement = {'A': 'U', 'T': 'A', 'G': 'C', 'C': 'G'}
    return ''.join(rna_complement[base] for base in template_sequence)

In [11]:
# Translation
# Standard genetic code: mRNA codon -> one-letter amino acid code.
# The three stop codons are kept separately in _STOP_CODONS.
_CODON_TO_RESIDUE = {
    'UUU': 'F', 'UUC': 'F', 'UUA': 'L', 'UUG': 'L',
    'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S',
    'UAU': 'Y', 'UAC': 'Y',
    'UGU': 'C', 'UGC': 'C', 'UGG': 'W',
    'CUU': 'L', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L',
    'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAU': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'AUU': 'I', 'AUC': 'I', 'AUA': 'I', 'AUG': 'M',
    'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'AAU': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGU': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V',
    'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAU': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
}
_STOP_CODONS = frozenset(('UAA', 'UAG', 'UGA'))


def translation(transcription_sequence):
    """Translate an mRNA sequence into a string of one-letter amino acids.

    The mRNA is cut into consecutive, non-overlapping groups of three
    nucleotides (codons) starting at position 0; an incomplete trailing group
    is ignored. If one of those codons is 'AUG', every codon before the first
    'AUG' is discarded; if none is, translation starts at the first codon.
    Codons are then translated one by one until a stop codon
    ('UAA', 'UAG' or 'UGA') or the end of the sequence is reached.

    A codon that is not part of the standard genetic code prints 'error' and
    is skipped; translation continues with the next codon.

    Parameters
    ----------
    transcription_sequence : str
        mRNA sequence made of the upper-case letters A, U, G, C.

    Returns
    -------
    str
        The amino acid chain, without any marker for the stop codon.
    """
    # Split into whole codons (groups of three nucleotides).
    whole_length = 3 * (len(transcription_sequence) // 3)
    codons = [
        transcription_sequence[pos:pos + 3]
        for pos in range(0, whole_length, 3)
    ]

    # Drop any in-frame codons that precede the first start codon.
    try:
        first_start = codons.index('AUG')
    except ValueError:
        first_start = 0
    codons = codons[first_start:]

    residues = []
    for codon in codons:
        if codon in _STOP_CODONS:
            # A stop codon ends the protein.
            break
        residue = _CODON_TO_RESIDUE.get(codon)
        if residue is None:
            print('error')
            continue
        residues.append(residue)
    return ''.join(residues)

In [12]:
# Printing the Amino Acid Sequence of Human Hemoglobin Beta

# transcription() converts the DNA string "sequence" (read from the FASTA file)
# into mRNA, and translation() converts that mRNA into the amino acid chain.
# The heading is printed first, so any 'error' lines that translation() prints
# for unrecognised codons appear between the heading and the protein.
print("Amino acid sequence of human hemoglobin beta:")
print(translation(transcription(sequence)))

Amino acid sequence of human hemoglobin beta:
MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH
