In [188]:
# import Entrez and SeqIO from the Biopython library
from Bio import Entrez, SeqIO
Entrez.email = "A.N.Other@example.com"  # Always tell NCBI who you are
# Accession of the record to download from the NCBI "nucleotide" database.
aspergillus_histones = "XM_025720182.1"
# Request the record as GenBank flat-file text.
handle = Entrez.efetch(db="nucleotide", id=aspergillus_histones, rettype="gb", retmode="text")
try:
    # Parse the single GenBank record carried by the response.
    record = SeqIO.read(handle, "genbank")
finally:
    # Release the connection even when parsing fails.
    handle.close()
# Store the record's sequence as a FASTA file. The call is left as the last
# expression so the cell displays its return value.
SeqIO.write(record, "aspergillus_histones_dna.fa", "fasta")

1

In [189]:
# Visit https://www.ncbi.nlm.nih.gov/nuccore/XM_025720182.1 and copy the translation
# (sequence of amino acids) listed for the aspergillus_histones DNA sequence.
# It is saved below as a text file and will be used to verify our own translation
# of the original DNA sequence (aspergillus_histones_dna.fa).
# NOTE: despite the "_rna" in the file name, the content is the amino-acid sequence.
# The file is opened in "w" mode inside a `with` block so that re-running this cell
# rewrites the reference instead of appending a second copy, and so that the data
# is flushed and the file closed as soon as the write finishes.
with open("aspergillus_histones_rna.txt", "w") as file:
    file.write("MARTKQTARKSTGGKAPRKQLASKAARKAAPSTGGVKKPHRYKPGTVALREIRRYQKSTELLIRKLPFQRLVREIAQDFKSDLRFQSSAIGALQESVEAYLVSLFEDTNLCAIHAKRVTIQSKDIQLARRLRGERS")

136

In [190]:
# For regular expression pattern matching
import re 
# Implement a nucleotide class with several functions to manipulate any strand of DNA sequence.
class Nucleotide:
    """A DNA record parsed from the text of a single-record FASTA file.

    The constructor receives the *contents* of the FASTA file (one string,
    header line first) and fills these attributes via ``parse_file``:

    * ``ID`` -- accession captured from the header (e.g. ``XM_025720182.1``).
    * ``species`` -- two-word species name captured from the header.
    * ``protein_type`` -- description captured between the strain number and
      the first ``(`` of the header (kept exactly as matched, so it may end
      with whitespace).
    * ``dna`` -- every line after the first, joined into one string.
    """

    # Header layout this class understands, e.g.
    #   >XM_025720182.1 Aspergillus niger CBS 101883 histone-like protein (...
    # A raw string keeps the regex escapes (\d, \., \s, \() from being read as
    # invalid Python string escapes.
    _HEADER_PATTERN = re.compile(
        r'>([A-Z][A-Z]_\d+\.[0-9]) ([A-Z][a-z]+ [a-z]+) [A-Z]+ \d+ ([a-zA-Z0-9\s-]+)\('
    )

    # The 64 DNA codons and the one-letter amino acid each one encodes.
    # The three stop codons (TAA, TAG, TGA) map to '_'.
    # Built once for the class instead of on every codon_table() call.
    _CODON_TABLE = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
    }

    def __init__(self, file):
        """Build the object from the text of a FASTA file.

        Args:
            file: Contents of the FASTA file as one string (not a path and
                not an open file object).

        Raises:
            ValueError: If the header line does not have the expected layout
                (raised by ``parse_file``).
        """
        # DNA sequence of the record
        self.dna = None
        # Species name taken from the header
        self.species = None
        # Protein description taken from the header
        self.protein_type = None
        # NCBI GenBank accession taken from the header
        self.ID = None
        # Parse the text and fill in the attributes above.
        self.parse_file(file)

    def parse_file(self, file):
        """Extract the header fields and the DNA sequence from FASTA text.

        Args:
            file: Contents of the FASTA file as one string.

        Raises:
            ValueError: If no header with the expected layout is found.
        """
        header = self._HEADER_PATTERN.search(file)
        if header is None:
            # Fail with an explanation instead of the opaque
            # "'NoneType' object is not subscriptable".
            raise ValueError(
                "FASTA header does not match the expected layout "
                "'>ACCESSION Genus species STRAIN NUMBER description (...'"
            )
        self.ID, self.species, self.protein_type = header.groups()

        # The first line is the header; every following line is sequence data.
        self.dna = "".join(file.splitlines()[1:])

    def transcribe_dna(self):
        """Transcribe the DNA sequence into mRNA (first phase of the Central Dogma).

        Intended to return an RNA string made of the characters G, A, C, U.
        Not implemented yet: currently does nothing and returns None.
        """
        # TODO: implement the transcription.
        pass

    def translate_dna(self, rna=None):
        """Translate an RNA sequence into protein (second phase of the Central Dogma).

        Intended to return the amino-acid sequence as a string.
        Not implemented yet: currently only prints a progress message.

        Args:
            rna: RNA sequence to translate. Optional so that the existing
                no-argument call style keeps working.
        """
        # TODO: implement the translation.
        print("translating...")

    def complementary_dna(self, dna=None):
        """Return the complementary strand of a DNA sequence.

        Not implemented yet: currently only prints a progress message.

        Args:
            dna: DNA sequence to complement. Optional so that the existing
                no-argument call style keeps working.
        """
        # TODO: implement the complement.
        print("complementing...")

    def verify_protein_seq(self, protein=None):
        """Check whether ``protein`` is the translation of the current DNA.

        Intended to return True when they are equal and False otherwise.
        Not implemented yet: currently only prints a progress message.

        Args:
            protein: Amino-acid sequence to compare against. Optional so that
                the existing no-argument call style keeps working.
        """
        # TODO: implement the comparison.
        print("verifying...")

    def codon_table(self, codon):
        """Return the one-letter amino acid encoded by a three-letter DNA codon.

        Args:
            codon: Three-letter DNA codon; upper or lower case is accepted.

        Returns:
            The amino-acid letter, or '_' for a stop codon.

        Raises:
            KeyError: If ``codon`` is not one of the 64 DNA codons.
        """
        return self._CODON_TABLE[codon.upper()]

In [192]:
# Objective: Verify that the DNA sequence fetched from the NCBI GenBank repository
# (aspergillus_histones_dna.fa) is transcribed and translated correctly.
# Read the FASTA text; the `with` block closes the file as soon as it has been read.
with open("aspergillus_histones_dna.fa") as file:
    dnaSeq = file.read()

# Instantiate a new Nucleotide object from the FASTA text.
aspergillus_histone_dna_seq = Nucleotide(dnaSeq)
# Look up a single codon as a quick check of the codon table.
x = aspergillus_histone_dna_seq.codon_table('ATA')