In [1]:
import os
import requests
import json

import pyensembl

from Bio import Entrez
from Bio.Seq import Seq
from Bio import SeqIO

import wormbase_parasite

## Download sequences from Ensembl (pyensembl)

In [2]:
# Ensembl release 100 annotation for mouse; the lookups in the following cells go through this object.
genome = pyensembl.EnsemblRelease(release=100, species="mus_musculus")

In [None]:
# transcripts
# Look up each transcript by its Ensembl transcript ID, written without a
# version suffix (i.e. 'ENSMUST00000071555' rather than 'ENSMUST00000071555.12').
Actg1_transcript = genome.transcript_by_id('ENSMUST00000071555')
Fermt2_transcript = genome.transcript_by_id('ENSMUST00000045905')
Actb_transcript = genome.transcript_by_id('ENSMUST00000100497')

# Sequence of each transcript, read from the transcript object's `.sequence` attribute.
Actg1_mRNA_sequence = Actg1_transcript.sequence
Fermt2_mRNA_sequence = Fermt2_transcript.sequence
Actb_mRNA_sequence = Actb_transcript.sequence

In [37]:
def save_seq_as_file(gene, folder, sequence=None):
    """Write one sequence to ``<folder>/<gene>.fasta`` as a single FASTA record.

    Parameters
    ----------
    gene : str
        Record name. It is used as the output file stem and as the FASTA
        header. When ``sequence`` is not given it is also the name of the
        module-level (notebook) variable holding the sequence, e.g.
        ``'Actg1_mRNA_sequence'``.
    folder : str
        Target directory, with or without a trailing path separator.
    sequence : str, optional
        Sequence to write. When omitted, the global variable called ``gene``
        is used.

    Raises
    ------
    KeyError
        If ``sequence`` is omitted and no global variable named ``gene`` exists.
    ValueError
        If the resolved sequence is ``None`` or empty.
    """
    if sequence is None:
        # Resolve the sequence by name so that every gene gets its own
        # sequence instead of one hard-coded variable.
        sequence = globals()[gene]
    if not sequence:
        raise ValueError(f"no sequence available for {gene!r}")

    filename = os.path.join(folder, gene + ".fasta")
    # The context manager closes the file even if the write fails.
    with open(filename, 'w') as handle:
        # A FASTA record needs a '>' header line before the sequence.
        handle.write(f">{gene}\n{sequence}\n")

# Output directory for the .fasta files (absolute, machine-specific path).
folder = "/home/ylee/blast/blastdb/"
# Each entry is the *name* (a string) of one of the *_mRNA_sequence variables
# assigned in the transcripts cell, not the sequence itself.
gene_list = ['Actg1_mRNA_sequence', 'Fermt2_mRNA_sequence', 'Actb_mRNA_sequence']

# One save_seq_as_file call per name; `gene` becomes the output file stem.
for gene in gene_list:
    save_seq_as_file(gene, folder)

In [11]:
# exons in transcripts
# Exon IDs of each transcript, fetched in one pass and unpacked in the same order.
Actg1_exon, Ferm2_exon, Actb_exon = [
    genome.exon_ids_of_transcript_id(transcript_id)
    for transcript_id in (
        'ENSMUST00000071555',     # Actg1: 6 exons
        'ENSMUST00000045905',     # Fermt2: 15 exons
        'ENSMUST00000100497',     # Actb: 6 exons
    )
]

print(Actg1_exon, Ferm2_exon, Actb_exon)

['ENSMUSE00000800321', 'ENSMUSE00001254186', 'ENSMUSE00001293379', 'ENSMUSE00001279808', 'ENSMUSE00001284007', 'ENSMUSE00000488442'] ['ENSMUSE00000706602', 'ENSMUSE00000408319', 'ENSMUSE00000315015', 'ENSMUSE00000314890', 'ENSMUSE00000314881', 'ENSMUSE00000315007', 'ENSMUSE00000314872', 'ENSMUSE00000314864', 'ENSMUSE00000314991', 'ENSMUSE00000314985', 'ENSMUSE00000314978', 'ENSMUSE00000314969', 'ENSMUSE00000314959', 'ENSMUSE00000314950', 'ENSMUSE00000336558'] ['ENSMUSE00000877175', 'ENSMUSE00001309702', 'ENSMUSE00001272130', 'ENSMUSE00000517504', 'ENSMUSE00000534432', 'ENSMUSE00000879080']


## Download sequences from NCBI (Bio.Entrez)

In [15]:
# Actg1, Fermt2, Actb mRNA identifiers
# Ensembl transcript IDs, listed in the gene order given above; these are the
# same IDs used for the pyensembl lookups earlier in the notebook.
ensembl_transcript_ids = [
    'ENSMUST00000071555',
    'ENSMUST00000045905',
    'ENSMUST00000100497'
]
# Will hold the NCBI RefSeq IDs collected by the Ensembl cross-reference lookup.
NCBI_transcript_ids = []

In [16]:
# Map every Ensembl transcript ID to its NCBI RefSeq mRNA ID(s) through the
# Ensembl REST xrefs endpoint.
# Start from an empty list so that re-running this cell does not append the
# same IDs a second time.
NCBI_transcript_ids = []

for ensembl_id in ensembl_transcript_ids:
    # API URL to look up the corresponding NCBI identifiers
    url = f"https://rest.ensembl.org/xrefs/id/{ensembl_id}?external_db=RefSeq_mRNA"

    # Make the API request; the timeout keeps a stalled server from hanging the cell.
    response = requests.get(url, headers={"Content-Type": "application/json"}, timeout=30)

    # Failures are only reported. Error text is deliberately NOT stored in
    # NCBI_transcript_ids, because that list is later handed to
    # ncbi_downloader, which treats every entry as an ID to fetch.
    if response.status_code != 200:
        print(f"Failed to retrieve data: {response.status_code}")
        continue

    data = response.json()
    if not data:
        print(f"No NCBI RefSeq IDs found for {ensembl_id}.")
        continue

    print(f"NCBI RefSeq IDs for {ensembl_id}:")
    for entry in data:
        NCBI_transcript_ids.append(entry['primary_id'])
        print(f"RefSeq ID: {entry['primary_id']}")

print(NCBI_transcript_ids)

NCBI RefSeq IDs for ENSMUST00000071555:
RefSeq ID: NM_009609
NCBI RefSeq IDs for ENSMUST00000045905:
RefSeq ID: NM_146054
RefSeq ID: NM_001360526
RefSeq ID: NM_001360525
NCBI RefSeq IDs for ENSMUST00000100497:
RefSeq ID: NM_007393
['NM_009609', 'NM_146054', 'NM_001360526', 'NM_001360525', 'NM_007393']


In [21]:
### Download the genbank files by NM_ transcript IDs to target folder

def ncbi_downloader(query_ls, folder, email="yjbulee@gmail.com"):
    """Download GenBank records from the NCBI nucleotide database into ``folder``.

    Each ID in ``query_ls`` is saved as ``<folder>/<ID>.gb``. IDs whose file is
    already present in ``folder`` are skipped. A failure on one ID is reported
    on stdout and does not stop the remaining downloads.

    Parameters
    ----------
    query_ls : iterable of str
        NM_ transcript IDs to download.
    folder : str
        Existing target directory, with or without a trailing separator.
    email : str, optional
        Contact address assigned to ``Entrez.email``.

    Raises
    ------
    OSError
        If ``folder`` cannot be listed (for example, it does not exist).
    """
    Entrez.email = email  # Always tell NCBI who you are
    # Listing up front keeps the "folder must exist" failure immediate.
    existing = set(os.listdir(folder))

    for NM_ in query_ls:
        file = NM_ + '.gb'
        filename = os.path.join(folder, file)
        print(filename)
        if file in existing:
            continue  # already downloaded (also covers repeated IDs in query_ls)
        try:
            input_handle = Entrez.efetch(db="nucleotide", id=NM_, rettype="gb", retmode="text")
            try:
                record_text = input_handle.read()
            finally:
                input_handle.close()
            # Open the output only after the download succeeded, so a failed
            # fetch cannot leave an empty .gb file that later runs would skip.
            with open(filename, "w") as out_handle:
                out_handle.write(record_text)
            existing.add(file)
        except Exception as err:
            # Best effort: report and move on to the next ID. `Exception`
            # (rather than a bare except) lets Ctrl-C still interrupt the loop.
            print("Interrupted!!There are some problems when handling " + NM_ + ": " + repr(err))

In [22]:
# Download one GenBank file for every RefSeq ID gathered by the Ensembl
# cross-reference lookup.
query_ls = NCBI_transcript_ids
folder = '/home/ylee/blast/blastdb/'     ## machine-specific absolute path: change it to your own download directory

ncbi_downloader(query_ls, folder)

/home/ylee/blast/blastdb/NM_009609.fasta
/home/ylee/blast/blastdb/NM_146054.fasta
/home/ylee/blast/blastdb/NM_001360526.fasta
/home/ylee/blast/blastdb/NM_001360525.fasta
/home/ylee/blast/blastdb/NM_007393.fasta


In [10]:
# Exploratory: print the built-in documentation of the Bio.Entrez package.
# The result is not assigned to anything, so no other cell uses it.
help(Entrez)

Help on package Bio.Entrez in Bio:

NAME
    Bio.Entrez - Provides code to access NCBI over the WWW.

DESCRIPTION
    The main Entrez web page is available at:
    http://www.ncbi.nlm.nih.gov/Entrez/
    
    Entrez Programming Utilities web page is available at:
    http://www.ncbi.nlm.nih.gov/books/NBK25501/
    
    This module provides a number of functions like ``efetch`` (short for
    Entrez Fetch) which will return the data as a handle object. This is
    a standard interface used in Python for reading data from a file, or
    in this case a remote network connection, and provides methods like
    ``.read()`` or offers iteration over the contents line by line. See
    also "What the heck is a handle?" in the Biopython Tutorial and
    Cookbook: http://biopython.org/DIST/docs/tutorial/Tutorial.html
    http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
    The handle returned by these functions can be either in text mode or
    in binary mode, depending on the data requested a

In [23]:
from Bio import SeqIO

# Path to your .gb file
file_path = "/home/ylee/blast/blastdb/NM_009609.gb"

# Parse the GenBank file into `record`
with open(file_path) as file:
    record = SeqIO.read(file, "genbank")

# Display basic information, one field per output line
print(
    f"ID: {record.id}",
    f"Name: {record.name}",
    f"Description: {record.description}",
    f"Annotations: {record.annotations}",
    f"Sequence: {record.seq}",
    sep="\n",
)

ID: NM_009609.3
Name: NM_009609
Description: Mus musculus actin, gamma, cytoplasmic 1 (Actg1), transcript variant 1, mRNA
Annotations: {'molecule_type': 'mRNA', 'topology': 'linear', 'data_file_division': 'ROD', 'date': '30-APR-2024', 'accessions': ['NM_009609', 'NM_013798', 'XM_193346'], 'sequence_version': 3, 'keywords': ['RefSeq', 'RefSeq Select'], 'source': 'Mus musculus (house mouse)', 'organism': 'Mus musculus', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Glires', 'Rodentia', 'Myomorpha', 'Muroidea', 'Muridae', 'Murinae', 'Mus', 'Mus'], 'references': [Reference(title='Genetic determinants of micronucleus formation in vivo', ...), Reference(title='Spatial transcriptomics reveals novel genes during the remodelling of the embryonic human arterial valves', ...), Reference(title='Transgenerational epigenetic effects imposed by neonicotinoid thiacloprid exposure', ...), Reference(title='Genome-w

## Download sequences from UCSC (TogoWS API)

In [None]:
### Download a nucleotide sequence from the UCSC Genome Browser via the TogoWS API, in FASTA format

def ucsc_downloader(chr_id, start, end, specie='mouse'):
    """Fetch a genomic region as a plain sequence string via the TogoWS UCSC API.

    Parameters
    ----------
    chr_id : str or int
        Chromosome name without the ``chr`` prefix (the prefix is added here).
    start, end : int
        Region bounds, inserted into the URL as ``start-end``.
        NOTE(review): the coordinate convention is defined by the remote API;
        confirm it before relying on exact boundaries.
    specie : str
        One of ``'human'``, ``'mouse'``, ``'worm'``, ``'fish'``; selects the
        UCSC assembly name.

    Returns
    -------
    str
        The sequence with the FASTA header and line breaks removed by
        ``fasta_reader``.

    Raises
    ------
    KeyError
        If ``specie`` is not one of the supported keys.
    requests.HTTPError
        If the server answers with an error status.
    """
    species_dict = {'human':'hg38', 'mouse':'mm10', 'worm':'ce6',
                    'fish':'danRer11'}
    if specie not in species_dict:
        # Same exception type as the plain dict lookup, but it names the valid choices.
        raise KeyError(f"unknown specie {specie!r}; expected one of {sorted(species_dict)}")
    spec = species_dict[specie]
    url = f'http://togows.org/api/ucsc/{spec}/chr{chr_id}:{start}-{end}.fasta'
    # The timeout keeps a stalled server from blocking the notebook indefinitely.
    response = requests.get(url, timeout=30)
    # Without this check an HTTP error page would be parsed as if it were sequence.
    response.raise_for_status()
    sequence = fasta_reader(response.text)
    return sequence

In [None]:
# NOTE(review): chr_id, start and end are not assigned in any cell visible
# above, and ucsc_downloader calls fasta_reader, which is only defined in a
# later cell. On a fresh kernel this call presumably fails with NameError
# unless those names are defined first — confirm the intended cell order.
ucsc_downloader(chr_id, start, end, specie='mouse')

## Sequence processing

In [None]:
### process FASTA and extract sequence, removing the formatting lines

def fasta_reader(text):     # input string containing FASTA formatted sequence
    """Return the sequence of a single-record FASTA string as one line.

    The first line is treated as the header and dropped. The remaining lines
    are stripped of surrounding whitespace (including ``\\r`` from CRLF line
    endings) and concatenated.

    Parameters
    ----------
    text : str
        FASTA-formatted text holding one record.

    Returns
    -------
    str
        The concatenated sequence; ``''`` when ``text`` is empty or consists
        of a header line only.
    """
    # splitlines() copes with "\n" and "\r\n", and a header-only input yields
    # no sequence lines instead of leaking the last header character.
    sequence_lines = text.splitlines()[1:]
    return ''.join(line.strip() for line in sequence_lines)

In [None]:
### retrieve protein-coding transcripts for a given gene from a genomic dataset

def get_protein_coding_transcripts(genome, gene_id):
    """Return the transcripts of ``gene_id`` whose biotype is ``'protein_coding'``.

    ``genome`` must provide ``gene_by_id``; the returned gene's ``transcripts``
    are filtered in their original order.
    """
    return [
        candidate
        for candidate in genome.gene_by_id(gene_id).transcripts
        if candidate.biotype == 'protein_coding'
    ]

In [None]:
def get_first_intron_length(genome, gene_id):
    """Measure two 5'-side distances over a gene's protein-coding transcripts.

    For every protein-coding transcript of ``gene_id`` two values are computed
    relative to the gene's 5' end (``gene.end`` on the '-' strand, otherwise
    ``gene.start``):

    * the offset of the transcript's 5' end plus the distance between the
      first and second entries of ``exon_intervals`` (first exon plus first
      intron, assuming ``exon_intervals`` is in transcript order — TODO confirm);
    * the distance to the start codon (``start_codon_positions``).

    The maximum of each value over all transcripts is returned.

    Parameters
    ----------
    genome
        Object providing ``gene_by_id`` (as used by
        ``get_protein_coding_transcripts``).
    gene_id : str
        Gene identifier understood by ``genome``.

    Returns
    -------
    tuple of (int, int)
        ``(length_to_first_intron_end, length_of_five_prime_utr)``; each is 0
        when it could not be computed for any transcript.
    """
    gene = genome.gene_by_id(gene_id)
    # The helper needs the genome as well as the gene ID.
    transcripts = get_protein_coding_transcripts(genome, gene_id)
    on_minus_strand = gene.strand == '-'

    length_to_first_intron_end = 0
    length_of_five_prime_utr = 0
    for transcript in transcripts:
        # First exon + first intron. Transcripts for which this cannot be
        # computed (e.g. fewer than two exons) are skipped, so values from a
        # previous transcript are never reused.
        try:
            exons = transcript.exon_intervals
            if on_minus_strand:
                distance_to_gene_start_intron = gene.end - transcript.end
                first_exon_intron = exons[0][1] - exons[1][1]
            else:
                distance_to_gene_start_intron = transcript.start - gene.start
                first_exon_intron = exons[1][0] - exons[0][0]
        except Exception:
            pass  # best effort: this transcript does not contribute
        else:
            length_to_first_intron_end = max(
                length_to_first_intron_end,
                distance_to_gene_start_intron + first_exon_intron,
            )

        # Distance from the gene's 5' end to the start codon; skipped when the
        # transcript exposes no start codon positions.
        try:
            if on_minus_strand:
                distance_to_gene_start = gene.end - transcript.start_codon_positions[-1]
            else:
                distance_to_gene_start = transcript.start_codon_positions[0] - gene.start
        except Exception:
            pass  # best effort: this transcript does not contribute
        else:
            length_of_five_prime_utr = max(length_of_five_prime_utr, distance_to_gene_start)

    return length_to_first_intron_end, length_of_five_prime_utr

## Download sequences from WormBase (wormbase_parasite)

In [None]:
def worm_transcript_seq(transcript_id, spliced=False, annotation=False):
    """Fetch a transcript sequence from the WormBase REST ``sequences`` widget.

    Parameters
    ----------
    transcript_id : str
        WormBase transcript identifier, inserted into the request URL.
    spliced : bool
        If true, read the ``spliced_sequence_context`` field, otherwise
        ``unspliced_sequence_context``.
    annotation : bool
        If true, return the whole per-strand entry of the response instead of
        only its ``'sequence'`` value.

    Returns
    -------
    str or dict
        The sequence, or the per-strand entry when ``annotation`` is true.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    which = 'spliced_sequence_context' if spliced else 'unspliced_sequence_context'

    # The timeout keeps a stalled server from blocking the notebook; the status
    # check surfaces HTTP errors instead of a confusing JSON decode failure.
    response = requests.get(
        f"http://rest.wormbase.org/rest/widget/transcript/{transcript_id}/sequences",
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()

    # The response stores the sequence under a strand-specific key.
    if data['fields']['strand']['data'] == '-':
        strand = 'negative_strand'
    else:
        strand = 'positive_strand'

    context = data['fields'][which]['data'][strand]
    if annotation:
        return context
    return context['sequence']

In [None]:
def import_worm_genome(filename='data/caenorhabditis_elegans.PRJNA13758.WBPS17.genomic.fa'):
    """Load a multi-record FASTA file into a ``{header: sequence}`` dictionary.

    The key of each record is its full header line without the leading ``>``
    and surrounding whitespace. Sequence lines are stripped and concatenated.
    Lines before the first header are ignored. A file without any header
    yields an empty dictionary.

    Parameters
    ----------
    filename : str
        Path to the FASTA file.

    Returns
    -------
    dict of str to str
        One entry per header; if a header is repeated, the last record wins.
    """
    chromosome_dict = {}
    chr_ = None     # header of the record being read; None until the first '>' line
    chunks = []     # sequence lines of the current record, joined once per record
    with open(filename, 'r') as f:
        # Iterate lazily instead of loading the whole genome with readlines().
        for line in f:
            if line.startswith('>'):
                if chr_ is not None:
                    chromosome_dict[chr_] = ''.join(chunks)
                chr_ = line[1:].strip()
                chunks = []
            elif chr_ is not None:
                chunks.append(line.strip())
    # Store the final record; skipped for files that contain no header at all.
    if chr_ is not None:
        chromosome_dict[chr_] = ''.join(chunks)
    return chromosome_dict

In [None]:
def worm_genome_seq(chr_, start, end, strand):
    """Return the sequence of a C. elegans genomic region.

    The region is first requested from the WormBase ParaSite client. If that
    fails for any reason, ``'local'`` is printed and the region is cut from a
    local ``.fa`` file in ``input/`` instead.

    Parameters
    ----------
    chr_ : str
        Chromosome name (a key of the local genome dictionary in the fallback).
    start, end : int
        Region bounds.
    strand : str or int
        ``'-1'`` / ``-1`` for the reverse strand; anything else is treated as
        forward.

    Returns
    -------
    str
        The region's sequence, reverse-complemented for the reverse strand in
        the local fallback.

    Raises
    ------
    FileNotFoundError
        If the fallback is needed but ``input/`` is missing or holds no
        ``.fa`` file.
    """
    try:
        api = wormbase_parasite.WormbaseClient()
        inquiry = f'{chr_}:{start}:{end}:{strand}'
        seq = api.get_sequence_for_region(inquiry, 'c.elegans')['seq']
    except Exception:
        # Any remote failure falls back to the local genome. `Exception`
        # (rather than a bare except) lets Ctrl-C still interrupt.
        print('local')
        # Sorted so the chosen file does not depend on directory listing order.
        fasta_files = sorted(i for i in os.listdir('input/') if i.endswith('.fa'))
        if not fasta_files:
            raise FileNotFoundError("no '.fa' genome file found in 'input/' for the local fallback")
        chromosome_dict = import_worm_genome('input/' + fasta_files[0])
        # NOTE(review): the +2/+3 offsets are kept exactly as they were; confirm
        # they match the coordinate convention of the remote API.
        region = chromosome_dict[chr_][start+2:end+3]
        # str() makes an integer -1 behave like the string '-1'.
        if str(strand) != '-1':
            seq = region
        else:
            seq = str(Seq(region).reverse_complement())
    return seq

In [None]:
def _wormbase_widget(object_type, object_id, widget):
    """Fetch one WormBase REST widget and return its decoded JSON payload."""
    response = requests.get(
        f"http://rest.wormbase.org/rest/widget/{object_type}/{object_id}/{widget}",
        timeout=30,  # keep a stalled server from blocking the notebook
    )
    # Surface HTTP errors here instead of failing later on a missing key.
    response.raise_for_status()
    return response.json()


def worm_get_DNA_seq(gene_id, upstream=0, downstream=0):
    """Return the genomic sequence of a WormBase gene, optionally with flanks.

    The gene's first listed transcript model determines the strand. The gene's
    genomic position label (``chr:start..end``) determines the region, which
    is then widened and passed to ``worm_genome_seq``.

    Parameters
    ----------
    gene_id : str
        WormBase gene identifier, inserted into the request URLs.
    upstream : int
        Number of bases to add on the 5' side of the gene (the high-coordinate
        side for the '-' strand).
    downstream : int
        Number of bases to add on the 3' side of the gene.

    Returns
    -------
    str
        Whatever ``worm_genome_seq`` returns for the widened region.

    Raises
    ------
    requests.HTTPError
        If any WormBase request answers with an error status.
    """
    gene_models = _wormbase_widget('gene', gene_id, 'sequences')
    model = gene_models['fields']['gene_models']['data']['table'][0]['model']
    # 'model' is either a list of models or a single mapping; use the first one.
    transcript_id = model[0]['id'] if isinstance(model, list) else model['id']

    transcript = _wormbase_widget('transcript', transcript_id, 'sequences')
    strand = '-1' if transcript['fields']['strand']['data'] == '-' else '1'

    position = _wormbase_widget('gene', gene_id, 'location')
    location = position['fields']['genomic_position']['data'][0]['label']
    # Label format: "<chromosome>:<low>..<high>"
    location_parts = location.split(':')
    chr_ = location_parts[0]
    bounds = location_parts[1].split('..')
    low, high = int(bounds[0]), int(bounds[1])

    # Upstream is relative to the gene's orientation, so on the '-' strand it
    # extends the high-coordinate end.
    if strand == '-1':
        start = low - downstream
        end = high + upstream
    else:
        start = low - upstream
        end = high + downstream

    seq = worm_genome_seq(chr_, start, end, strand)
    return seq