Sequence Retrieval

In [1]:
from Bio import SeqIO
from Bio import Entrez
# NOTE(review): this is a placeholder address on example.com — presumably it
# should be replaced with a real contact email before querying NCBI; confirm.
Entrez.email = 'A.N.Other@example.com'

#### Method 1: From a text string
Create a random string of DNA

In [2]:
import random
# The four DNA bases that every position of the sequence is drawn from
Nucleotide = ["G", "T", "A", "C"]

# Pick one base at a time with random.choice and join the picks into a single
# string. The sequence is 50 bases long; change the range bound for a longer
# or shorter one.
randDNAstr = "".join(
    random.choice(Nucleotide)
    for _position in range(50)
)

DNAString = randDNAstr
DNAString

'TCGCTGGTGGGCGGTATCTTAGGTAACGCTGCCACTTACCTGTAATCCAA'

#### Method 2: Parsing a URL

In [3]:
from Bio import SeqIO
from urllib.parse import urlparse

def extract_accession_number(url):
    """Return the accession number at the end of a record URL's path.

    Only the path component of the URL is inspected, so query strings and
    fragments are ignored. Trailing slashes are dropped before the path is
    split, so ".../nuccore/L08050.1/" yields the same result as
    ".../nuccore/L08050.1".

    Args:
        url: Record URL, e.g. "https://www.ncbi.nlm.nih.gov/nuccore/L08050.1".

    Returns:
        The last non-empty path segment as a string, or "" when the URL has
        no path segments.
    """
    parsed_url = urlparse(url)
    # Without removing trailing slashes, the final segment of a URL ending in
    # "/" would be the empty string and the accession would be lost.
    trimmed_path = parsed_url.path.strip().rstrip("/")
    segments = trimmed_path.split("/")
    accession_number = segments[-1]
    return accession_number

def getnucleotide_from_ncbi(url, output_file="output_seq.fasta"):
    """Fetch the GenBank record referenced by an NCBI nucleotide URL.

    Args:
        url: NCBI nucleotide record URL. The accession number is read from it
            with extract_accession_number.
        output_file: Not used by this function; nothing is written to disk
            here. The parameter is kept (now with a default) so existing
            two-argument calls keep working. Pass the returned record to
            save_fasta_to_file to write it out.

    Returns:
        The record parsed by SeqIO.read from the fetched GenBank text, or
        None when no accession number could be extracted from the URL.
    """
    accession_number = extract_accession_number(url)
    if not accession_number:
        print("Invalid URL. Please provide a valid NCBI nucleotide sequence URL.")
        return None
    handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="gb", retmode="text")
    try:
        record = SeqIO.read(handle, "genbank")
    finally:
        # Close the handle even when parsing the response raises.
        handle.close()
    return record

def save_fasta_to_file(record, output_file):
    """Write one sequence record to ``output_file`` in FASTA format.

    The file contains a ">" header line followed by the whole sequence on a
    single line, and ends with a newline so that further records can be
    appended or files concatenated safely.

    Args:
        record: Object exposing ``id``, ``description`` and ``seq``
            attributes (for example the record returned by
            getnucleotide_from_ncbi).
        output_file: Path of the file to create or overwrite.
    """
    record_id = str(record.id)
    description = str(record.description).strip()

    # Some records already carry the ID as the first word of the description;
    # repeating it would produce a header such as ">X1 X1 organism".
    description_words = description.split(None, 1)
    if description_words and description_words[0] == record_id:
        header = description
    elif description:
        header = f"{record_id} {description}"
    else:
        header = record_id

    with open(output_file, "w") as f:
        f.write(f">{header}\n{record.seq}\n")




Example Usage

Let's try the amoA gene of *Nitrosomonas europaea*, a microorganism capable of converting ammonia into nitrite in the nitrogen cycle. It is just one of the many microorganisms that are commonly studied in aquatic environments.

In [53]:
ncbi_url = "https://www.ncbi.nlm.nih.gov/nuccore/L08050.1"
# Destination of the FASTA copy. It must be defined here: both calls below
# receive it, and no earlier cell creates it.
output_file = "output_seq.fasta"

gb_record = getnucleotide_from_ncbi(ncbi_url, output_file)
# Save the record that was just fetched.
save_fasta_to_file(gb_record, output_file)
print(gb_record)

ID: L08050.1
Name: NTOAMOAB
Description: Nitrosomonas europaea ammonia monooxygenase acetylene-binding protein (amoA) gene, complete cds; ammonia monooxygenase (amoB) gene, complete cds
Number of features: 6
/molecule_type=DNA
/topology=linear
/data_file_division=BCT
/date=11-MAY-1995
/accessions=['L08050']
/sequence_version=1
/keywords=['acetylene-binding protein', 'ammonia monooxygenase', 'amoA gene', 'amoB gene']
/source=Nitrosomonas europaea
/organism=Nitrosomonas europaea
/taxonomy=['Bacteria', 'Pseudomonadota', 'Betaproteobacteria', 'Nitrosomonadales', 'Nitrosomonadaceae', 'Nitrosomonas']
/references=[Reference(title='Sequence of the gene coding for ammonia monooxygenase in Nitrosomonas europaea', ...), Reference(title='Sequence of the gene, amoB, for the 43-kDa polypeptide of ammonia monoxygenase of Nitrosomonas europaea', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/comment=On May 11, 1995 this sequence version replaced gi:791182.

#### Method 3: Reading a FASTA Format File

In [94]:
from Bio import SeqIO
import pandas as pd


# Read every record of the multi-FASTA file and collect, per record, its ID,
# description line, sequence text and sequence length in four parallel lists.
# NOTE: `id` shadows the Python builtin of the same name; the name is kept
# because the cell that builds the DataFrame reads it.
with open("Msarcina thermophila.fasta") as fasta_f:
    id = []
    description = []
    sequence = []
    lengths = []
    for seq_record in SeqIO.parse(fasta_f, 'fasta'):
        record_seq = seq_record.seq
        id.append(str(seq_record.id))
        description.append(str(seq_record.description))
        sequence.append(str(record_seq))
        lengths.append(len(record_seq))

In [95]:

# Column label -> per-record values gathered while parsing the FASTA file
gene_columns = {
    'Accession ID': id,
    'Name': description,
    'Sequence': sequence,
    'Length': lengths,
}
df = pd.DataFrame(gene_columns)

# Preview the first ten rows
df.head(10)

Unnamed: 0,Accession ID,Name,Sequence,Length
0,FPAO01000001.1,FPAO01000001.1 Methanosarcina thermophila stra...,AAGGATTCCGGTAAAAGACCGGTTTGATAGAAACGGGATGTAAGCA...,518281
1,FPAO01000002.1,FPAO01000002.1 Methanosarcina thermophila stra...,CAGACTGAAAGCGAATGATTATCTTGAATTAATAGGACTTCAGAAT...,402334
2,FPAO01000003.1,FPAO01000003.1 Methanosarcina thermophila stra...,TGGAACCAGGAGTAATAGTAAAGCGAGAAACCCATATCAGGATGCA...,370502
3,FPAO01000004.1,FPAO01000004.1 Methanosarcina thermophila stra...,TGTGAAAGTTCCTTTACATCCACATTTCTTGCAGGCTCTGGGGAAC...,315083
4,FPAO01000005.1,FPAO01000005.1 Methanosarcina thermophila stra...,GGGCAGGTTATCCACGTGTTACTGAGCAGTACGCCATGTTCACGAA...,229870
5,FPAO01000006.1,FPAO01000006.1 Methanosarcina thermophila stra...,AACACTAATTATTACCCACTAATTAATTACTAATTAGTGGGATCCA...,200928
6,FPAO01000007.1,FPAO01000007.1 Methanosarcina thermophila stra...,ATTCGAGAGCAAGATCCATTAAAACAAGGATTGAAACCTGTCCTGC...,194248
7,FPAO01000008.1,FPAO01000008.1 Methanosarcina thermophila stra...,CAGCTCAAAGCAAAATCAACCCTGACTTCAAATCTGTCCTCAGGTG...,192318
8,FPAO01000009.1,FPAO01000009.1 Methanosarcina thermophila stra...,TAATTAGTGTTTGATAGCTACCACTATCAATGGATCCCACTAATTA...,178270
9,FPAO01000010.1,FPAO01000010.1 Methanosarcina thermophila stra...,TTATCCCCAGTTTCCCGGGGTTATCCCGGTCCCAAGGGCAGGTTAT...,122392


In [96]:
# Write the table to "Gene Data.csv"; index=False leaves the DataFrame's row
# index out of the file.
df.to_csv("Gene Data.csv", index=False)