<h3>Obtaining DNA Sequences Using Various Methods</h3>

In [1]:
from Bio import SeqIO
from Bio import Entrez
# Contact e-mail set on Biopython's Entrez module; replace it with your own address before running.
Entrez.email = "bryan@gmail.com"

#### Method 1: Simple Text (String)

Create a nucleotide sequence by randomly generating a string of the four nucleotide bases.

In [5]:
import random
# The four DNA bases that each position is sampled from
Nucleotide = ["G", "T", "A", "C"]

# Draw 50 bases one at a time with random.choice and concatenate them into
# a single string; change the range argument to get a different length
DNAString = "".join(random.choice(Nucleotide) for _ in range(50))

DNAString

'GGGCATTCGATGCCATTCCCTGGGAGCGGACGTCAGGGTCCGCTGATTTA'

#### Method 2: Parsing a URL

In [14]:
#Define the functions
from urllib.parse import urlparse

def extract_accession_number(url):
    """Return the accession number found at the end of an NCBI record URL.

    The accession is taken to be the last non-empty segment of the URL path.
    Query strings and fragments are not part of the path and are therefore
    ignored, and a trailing slash (``.../nuccore/L08050.1/``) does not
    produce an empty result.

    Parameters
    ----------
    url : str
        URL such as ``https://www.ncbi.nlm.nih.gov/nuccore/L08050.1``.

    Returns
    -------
    str
        The last path segment, or an empty string when the URL has no path.
    """
    parsed_url = urlparse(url)
    # Remove surrounding whitespace first, then trailing slashes, so that
    # "…/L08050.1/" resolves to "L08050.1" instead of "".
    path = parsed_url.path.strip().rstrip("/")
    return path.rsplit("/", 1)[-1]

def getnucleotide_from_ncbi(url):
    """Fetch the GenBank record referenced by an NCBI nucleotide URL.

    Parameters
    ----------
    url : str
        NCBI nucleotide URL; the accession number is extracted from it with
        ``extract_accession_number``.

    Returns
    -------
    The record parsed by ``SeqIO.read`` from the fetched GenBank text, or
    ``None`` (after printing a message) when no accession number could be
    extracted from ``url``.
    """
    accession_number = extract_accession_number(url)
    if not accession_number:
        print("Invalid URL. Please provide a valid NCBI nucleotide sequence URL.")
        return None
    handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="gb", retmode="text")
    try:
        return SeqIO.read(handle, "genbank")
    finally:
        # Release the handle even when parsing raises, so it is never leaked.
        handle.close()

def save_fasta_to_file(record, output_file="output_seq.fasta"):
    """Write a single sequence record to a FASTA file.

    The file contains one header line, ``>{id} {description}``, followed by
    the sequence on one line and a terminating newline.

    Parameters
    ----------
    record
        Object exposing ``id``, ``description`` and ``seq`` attributes.
    output_file : str, optional
        Destination path; defaults to ``"output_seq.fasta"``. An existing
        file at that path is overwritten.

    Raises
    ------
    AttributeError
        If ``record`` lacks one of the required attributes (e.g. it is
        ``None``). Nothing is written in that case.
    """
    # Build the text before opening the file so that a bad record fails
    # without truncating a file that already exists.
    fasta_format = f">{record.id} {record.description}\n{record.seq}\n"
    with open(output_file, "w") as f:
        f.write(fasta_format)




Example Usage

Let's try fetching the amoA gene of Nitrosomonas europaea (an ammonia-oxidizing bacterium that converts ammonia into nitrite in the nitrogen cycle). It is commonly studied in biological wastewater treatment systems.

In [18]:
ncbi_url = "https://www.ncbi.nlm.nih.gov/nuccore/L08050.1"

# Download and parse the GenBank record that the URL points to.
record = getnucleotide_from_ncbi(ncbi_url)

# getnucleotide_from_ncbi returns None (after printing a message) when no
# accession number can be extracted, so only save and show a real record.
if record is not None:
    save_fasta_to_file(record)
    print(record)

ID: L08050.1
Name: NTOAMOAB
Description: Nitrosomonas europaea ammonia monooxygenase acetylene-binding protein (amoA) gene, complete cds; ammonia monooxygenase (amoB) gene, complete cds
Number of features: 6
/molecule_type=DNA
/topology=linear
/data_file_division=BCT
/date=11-MAY-1995
/accessions=['L08050']
/sequence_version=1
/keywords=['acetylene-binding protein', 'ammonia monooxygenase', 'amoA gene', 'amoB gene']
/source=Nitrosomonas europaea
/organism=Nitrosomonas europaea
/taxonomy=['Bacteria', 'Pseudomonadota', 'Betaproteobacteria', 'Nitrosomonadales', 'Nitrosomonadaceae', 'Nitrosomonas']
/references=[Reference(title='Sequence of the gene coding for ammonia monooxygenase in Nitrosomonas europaea', ...), Reference(title='Sequence of the gene, amoB, for the 43-kDa polypeptide of ammonia monoxygenase of Nitrosomonas europaea', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/comment=On May 11, 1995 this sequence version replaced gi:791182.

#### Method 3: Reading a FASTA Format File 

In [None]:
from Bio import SeqIO
import pandas as pd


# Read every record in the FASTA file into four parallel lists, one entry
# per record: identifier, description line, sequence text, sequence length.
# NOTE: `id` shadows the Python builtin of the same name; the name is kept
# because the DataFrame cell that follows reads these lists by name.
with open("methanogens.fasta") as fasta_f:
    id = []
    description = []
    sequence = []
    lengths = []
    for seq_record in SeqIO.parse(fasta_f, "fasta"):
        id.append(str(seq_record.id))
        description.append(str(seq_record.description))
        sequence.append(str(seq_record.seq))
        lengths.append(len(seq_record.seq))

In [None]:
# Combine the parallel lists into one table with a row per FASTA record;
# the column order matches the order of the labels below.
df = pd.DataFrame(
    dict(
        zip(
            ["Accession ID", "Name", "Sequence", "Length"],
            [id, description, sequence, lengths],
        )
    )
)

# Preview the first ten records
df.head(10)