# Project Objective:  

To fetch genetic sequences of proteins from NCBI GenBank public repository and use that data to simulate the central dogma of molecular biology (the flow of genetic information within a biological system: DNA -> RNA -> Protein).

Step 1: fetch the genetic sequence of a randomly chosen protein with an id = XM_025720182.1

In [3]:
# import Entrez and SeqIO from the Biopython library
from Bio import Entrez, SeqIO
Entrez.email = "A.N.Other@example.com"  # Always tell NCBI who you are
# fetch the information about the protein with the id = XM_025720182.1
aspergillus_histones = "XM_025720182.1"
handle = Entrez.efetch(db="nucleotide", id=aspergillus_histones, rettype="gb", retmode="text")
# read the the sequence
record = SeqIO.read(handle, "genbank")
handle.close()
# Fetch the DNA sequence of the current protein and store it as a fasta file
SeqIO.write(record, "aspergillus_histones_dna.fa", "fasta")

Step 2: Locally Store the fetched data as a .txt file with an appropriate file name. 

In [1]:
# Vist the following website https://www.ncbi.nlm.nih.gov/nuccore/XM_025720182.1 
# and save the translation (sequence of amino acids) of the aspergillus_histones DNA sequence as 
# txt file (e.g.aspergillus_histones_rna.txt)
# The tranlated sequence will be used to verify our translation of the original DNA sequence 
# (aspergillus_histones_dna.fa)
file = open("aspergillus_histones_rna.txt", "a")
file.write("MARTKQTARKSTGGKAPRKQLASKAARKAAPSTGGVKKPHRYKPGTVALREIRRYQKSTELLIRKLPFQRLVREIAQDFKSDLRFQSSAIGALQESVEAYLVSLFEDTNLCAIHAKRVTIQSKDIQLARRLRGERS")

136

Step 3: Implement an Object Oriented Program. 

In [41]:
# For regular expression pattern matching
import re 
# Implement a nucleotide class with several functions to manipulate any strand of DNA sequence.
class Nucleotide:
   # A constructor that takes a DNA sequence as a String of characters (G, A, C, T).
    def __init__(self, file):
        # DNA Sequence of the provided protein
        self.dna = None
        # Name of the provided species
        self.species = None
        # Protein type of the provided protein
        self.protein_type = None
        # NCBI GenBank id value of the provided protein
        self.ID = None
        # Call parse function to parse the file and store the appropriate values in the 
        # instance variables above.
        self.parse_file(file)
    
    def parse_file(self, file):
        """Parse the file containing the DNA sequence and assign the 
        instance variables appropriately."""
        
        # Regular Expression to parse the needed information
        regex = '>([A-Z][A-Z]\_\d+\.[0-9]) ([A-Z][a-z]+ [a-z]+) [A-Z]+ \d+ ([a-zA-Z0-9-\s]+)\(' 
        result = re.search(regex, file)
        # Store the ID
        self.ID = result[1]
        # Store the species name
        self.species = result[2]
        # Store the protein type
        self.protein_type = result[3]
        
        # Iterate over the fasta file line by line and concatenate the DNA sequences into one.
        result = ""
        # Note: the first line is skipped because it does not contain DNA sequence.
        for line in file.splitlines()[1:]:            
            result = result + line
        # Store the DNA seq result 
        self.dna = result
    
    def transcribe_dna(self, dna):
        """The first phase of the Central Dogma of molecular biology. Transcribe the 
        current DNA sequence into mRNA. Returns an RNA sequences as a String containing 
        the characters (G, A, C, U)."""
        
        rna_seq = ""
        
        # Transcription process: dna base to rna base:
        # G -> C 
        # C -> G
        # A -> U (Notice: not T)
        # T -> A
        
        for dna_base in dna:

            if dna_base == 'G':
                rna_seq += 'C'
                
            elif dna_base == 'C':
                rna_seq += 'G'
                
            elif dna_base == 'A':
                rna_seq += 'U'
            
            else:
                rna_seq += 'A'
                
        return rna_seq
  
    def find_start_codon(self, rna):
        """Find the starting postion in RNA sequence and return index in string."""

        for index in range(len(rna)):
                
            if rna[index] == 'A':
                
                if rna[index+1] == 'U':
                    
                    if rna[index+2] == 'G':
                        
                        return index
                    
    def translate_rna(self, rna):
        """The second phase of the Central Dogma of molecular biology.
        Translate the RNA sequence into protein.
        Returns a sequence of amino acids as a String."""

        # find the start codon: ATG and translate each codon up until the 
        # stop codon (UAA, UAG, UGA).
        
        codon = ""
        amino_acid_seq = ""
        x = range(self.find_start_codon(rna),len(rna),3)
        
        for index in x:
            
            codon = codon + rna[index] + rna[index+1] + rna[index+2]
        
            if codon == "UAA" or codon == "UAG" or codon == "UGA":
                break
            
            # translate 
            amino_acid_seq = amino_acid_seq + self.codon_table(codon)
            
            codon = ""
            
        return amino_acid_seq
    
    def complementary_dna(dna):
        """Returns the complementary strand of the current DNA sequence"""
        print("complementing...")
    
    def verify_protein_seq(protein):
        """Returns a boolean based on whether a provided protein sequence is result of the 
        current DNA sequence. True if equal, false otherwise."""
        print("verifying...")
    
    # Takes a three letter codon and returns the corresponding Amino Acid 
    # using the table dictionary.
    def codon_table(self, codon):
        # There are 64 total codons and they each represent a particular amino acid.
        table = {
            "UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
            "UCU":"S", "UCC":"s", "UCA":"S", "UCG":"S",
            "UAU":"Y", "UAC":"Y", "UAA":"STOP", "UAG":"STOP",
            "UGU":"C", "UGC":"C", "UGA":"STOP", "UGG":"W",
            "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
            "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
            "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
            "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
            "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
            "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
            "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
            "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
            "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
            "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
            "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
            "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",
        }
        
        return table[codon]   # Return the corresponding value to the codon key

Step 4: Verify that the genetic sequence was correctly transcribed and translated. 

In [50]:
# Objective: Verify if the fetched DNA sequence, aspergillus_histones_dna, from the NCBI GenBank repository
# is transcribed and translated correctly. 
# Open the file containing the DNA sequence
file = open("aspergillus_histones_dna.fa")
dnaSeq = file.read()

# Instantiate a new Nucleotide object.
aspergillus_histone_dna_seq = Nucleotide(dnaSeq)
rna = aspergillus_histone_dna_seq.transcribe_dna(aspergillus_histone_dna_seq.dna)
protein = aspergillus_histone_dna_seq.translate_rna(rna)

file = open("aspergillus_histones_protein_seq.txt")
asp_protein =  file.read()
print(asp_protein)
print(protein)
print(rna)

"MARTKQTARKSTGGKAPRKQLASKAARKAAPSTGGVKKPHRYKPGTVALREIRRYQKSTELLIRKLPFQRLVREIAQDFKSDLRFQSSAIGALQESVEAYLVSLFEDTNLCAIHAKRVTIQSKDIQLARRLRGERS"
MIRLTGVQVTTVPGSIRRAKVPTGVPAGEVAsTVLRSGNVRAMAARSTLGSNGLLVTRGLGVRREGRRPSTLASPEVQPGGEGQKAVATRGPQATPDGAERETPVVGHAVGAVRTVVGQVPVGRAGGRGTTRENLIKDYSAKTKTNREYPsAIAPKKKKSTDDRYVLPKRLKLPKAVTQKQ
AAAGUAGAGAUUUUGUUGAUUUAGGUAGUUUGGAUUAGCCUAUUUUACCGAGCAUGAUUCGUCUGACGGGCGUUCAGGUGACCACCGUUCCGGGGAGCAUUCGUCGAGCGAAGGUUCCGACGGGCGUUCCGGCGGGGGAGGUGGCCUCCACAGUUCUUCGGAGUGGCAAUGUUCGGGCCAUGGCAGCGAGAAGCACUCUAGGCAGCAAUGGUCUUCUCGUGACUCGAGGACUAGGCGUUCGACGGGAAGGUCGCAGACCAAGCACUUUAGCGAGUCCUGAAGUUCAGCCUGGAGGCGAAGGUCAGAAGGCGGUAGCCACGAGAGGUCCUCAGGCAACUCCGGAUGGAGCAGAGAGAGAAACUCCUGUGGUUGGACACGCGGUAGGUGCGGUUCGCACAGUGGUAGGUCAGGUUCCUGUAGGUCGAGCGGGCGGCAGAGGCACCACUCGCGAGAAUCUAAUUAAAGAUUACUCAGCUAAGACAAAGACCAACCGAGAAUACCCUUCCGCUAUUGCCCCAAAGAAAAAGAAAAGUACUGAUGACCGCUAUGUACUACCCAAACGCUUAAAGUUACCCAAAGCCGUGACCCAAAAGCAAUAAAGAACGACGCGUUGUUUAUCAUCGACGGCUAAGUCGCCGACCCACCUUAACUAGUACCCCUACGUAGGCUUAUACAUGUA