### Install utilities

In [None]:
#https://www.ensembl.org/Homo_sapiens/Transcript/Summary?db=core;g=ENSG00000010404;r=X:149503471-149521096;t=ENST00000521702
#https://curatedpython.com/p/pyensembl-is-openvax-pyensembl/index.html#site_related

In [4]:
# install PyEnsembl
!pip install PyEnsembl

Collecting PyEnsembl
  Using cached pyensembl-2.0.0.tar.gz (60 kB)
Collecting typechecks>=0.0.2
  Using cached typechecks-0.1.0.tar.gz (3.4 kB)
Collecting datacache>=1.1.4
  Using cached datacache-1.1.5.tar.gz (13 kB)
Collecting memoized-property>=1.0.2
  Using cached memoized-property-1.0.3.tar.gz (5.0 kB)
Collecting gtfparse>=1.1.0
  Using cached gtfparse-1.2.1.tar.gz (12 kB)
Collecting serializable
  Using cached serializable-0.2.1.tar.gz (8.4 kB)
Collecting tinytimer
  Using cached tinytimer-0.0.0.tar.gz (2.1 kB)
Collecting appdirs>=1.4.0
  Using cached appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting progressbar33>=2.4
  Using cached progressbar33-2.4.tar.gz (10 kB)
Collecting mock
  Using cached mock-4.0.3-py3-none-any.whl (28 kB)
Collecting simplejson
  Downloading simplejson-3.17.6-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (130 kB)
[K     |████████████████████████████████| 130 kB 953 kB/s eta 0:00:01
[?25hBuilding w

In [2]:
# download the human reference data
!pyensembl install --release 106 --species homo_sapiens

2022-08-02 22:58:57,256 - pyensembl.shell - INFO - Running 'install' for EnsemblRelease(release=106, species='homo_sapiens')
2022-08-02 22:58:58,402 - pyensembl.sequence_data - INFO - Loaded sequence dictionary from /home/cyrillemesue/.cache/pyensembl/GRCh38/ensembl106/Homo_sapiens.GRCh38.cdna.all.fa.gz.pickle
2022-08-02 22:58:58,883 - pyensembl.sequence_data - INFO - Loaded sequence dictionary from /home/cyrillemesue/.cache/pyensembl/GRCh38/ensembl106/Homo_sapiens.GRCh38.ncrna.fa.gz.pickle
2022-08-02 22:58:59,117 - pyensembl.sequence_data - INFO - Loaded sequence dictionary from /home/cyrillemesue/.cache/pyensembl/GRCh38/ensembl106/Homo_sapiens.GRCh38.pep.all.fa.gz.pickle


### Import Utilities

In [2]:
from pyensembl import EnsemblRelease

### Load Database

In [3]:
# EnsemblRelease object enables you to access the local database for various information
data = EnsemblRelease(release=106)

In [4]:
# Construct a Transcript object for given Ensembl transcript ID (e.g. "ENST00000369985")
obj = data.transcript_by_id(transcript_id = "ENST00000369985")
obj

Transcript(transcript_id='ENST00000369985', transcript_name='MYO6-203', gene_id='ENSG00000196586', biotype='protein_coding', contig='6', start=75749239, end=75919537, strand='+', genome='GRCh38')

In [5]:
# get coordinates
chromosome = obj.contig
start = obj.start
end = obj.end
strand = obj.strand
name = obj.transcript_name
coordinates = f"Chromosome {chromosome}{strand}: {start}:{end}"
coordinates

'Chromosome 6+: 75749239:75919537'

In [6]:
# get all transcript ids from database
transcripts = data.transcripts() # get all transcripts for all ids in the database
transcript_ids = []
for transcript in transcripts:
    transcript_ids.append(transcript.transcript_id)

In [7]:
# this function maps a list of transcript ids to genomic coordinates
def map_transcript_ids(transcript_ids):
    """
    map a list of transcipt ids to their genomic coordinates
    return (raw_coordinates, arranged_coordinates)
        raw coordinates contain all the transcript entities: name, start, end, contig, strand, etc
        arranged_coordinates coordinates contain the chromosome number, strand, start and end positions 
        as such:'Chromosome 6+: 75749239:75919537' 
    """
    
    raw_coordinates = {}
    arranged_coordinates = {}
    
    # load the ensembl release data
    data = EnsemblRelease(release=106)
    
    def extract_info(Transcript_object):
        """
        extract the essential coordinate info from a given transcript object and returns
        """
        
        extract = {}
        
        extract["contig"] = Transcript_object.contig # chromosome type
        extract["start"] = Transcript_object.start
        extract["end"] = Transcript_object.end
        extract["strand"] = Transcript_object.strand # forward strand(+) reverse strand (-)
        extract["transcript_name"] = Transcript_object.name
        extract["gene_id"] = Transcript_object.gene_id
        extract["biotype"] = Transcript_object.biotype # coding or non-coding
        
        return extract

    for transcript_id in transcript_ids:
        # create a transcript object for the given transcript id
        try:
            Transcript_object = data.transcript_by_id(transcript_id = transcript_id)
        except:
            continue
        
        # extract coordinates from transcript object
        extract = extract_info(Transcript_object)
        
        # record raw coordinates
        raw_coordinates[transcript_id] = extract
        
        # arrange coordinates into a single string 
        coordinates = f'Chromosome {extract["contig"]}{extract["strand"]}: {extract["start"]}:{extract["end"]}'
        
        # record arranged coordinates
        arranged_coordinates[transcript_id] = coordinates
    return raw_coordinates, arranged_coordinates

In [8]:
mapped_transcripts = map_transcript_ids(transcript_ids)

In [9]:
mapped_transcripts[0]

{'ENST00000000233': {'contig': '7',
  'start': 127588411,
  'end': 127591700,
  'strand': '+',
  'transcript_name': 'ARF5-201',
  'gene_id': 'ENSG00000004059',
  'biotype': 'protein_coding'},
 'ENST00000000412': {'contig': '12',
  'start': 8940361,
  'end': 8949645,
  'strand': '-',
  'transcript_name': 'M6PR-201',
  'gene_id': 'ENSG00000003056',
  'biotype': 'protein_coding'},
 'ENST00000000442': {'contig': '11',
  'start': 64305524,
  'end': 64316743,
  'strand': '+',
  'transcript_name': 'ESRRA-201',
  'gene_id': 'ENSG00000173153',
  'biotype': 'protein_coding'},
 'ENST00000001008': {'contig': '12',
  'start': 2794970,
  'end': 2805423,
  'strand': '+',
  'transcript_name': 'FKBP4-201',
  'gene_id': 'ENSG00000004478',
  'biotype': 'protein_coding'},
 'ENST00000001146': {'contig': '2',
  'start': 72129238,
  'end': 72147862,
  'strand': '-',
  'transcript_name': 'CYP26B1-201',
  'gene_id': 'ENSG00000003137',
  'biotype': 'protein_coding'},
 'ENST00000002125': {'contig': '2',
  'start

In [10]:
chrom = {}
for d in mapped_transcripts[0]:
    if mapped_transcripts[0][d]['contig'] not in chrom:
        chrom[mapped_transcripts[0][d]['contig']] = 1
    else:
        chrom[mapped_transcripts[0][d]['contig']] += 1

In [11]:
chrom

{'7': 11738,
 '12': 13531,
 '11': 14569,
 '2': 18395,
 '6': 11496,
 '16': 11509,
 '4': 10095,
 '3': 15595,
 '1': 21800,
 '8': 10095,
 '19': 14106,
 '17': 14195,
 '22': 5270,
 '5': 11716,
 '14': 8896,
 'X': 7861,
 '10': 8868,
 '18': 4746,
 '20': 5918,
 '13': 4591,
 '15': 8942,
 'Y': 841,
 '9': 8500,
 '21': 3142,
 'MT': 37,
 'GL000194.1': 2,
 'KI270727.1': 8,
 'KI270713.1': 4,
 'GL000216.2': 1,
 'KI270728.1': 7,
 'GL000220.1': 4,
 'KI270442.1': 2,
 'GL000195.1': 2,
 'GL000218.1': 1,
 'KI270711.1': 2,
 'KI270721.1': 4,
 'GL000219.1': 1,
 'KI270750.1': 1,
 'KI270734.1': 5,
 'KI270733.1': 4,
 'GL000213.1': 3,
 'KI270744.1': 1,
 'KI270726.1': 2,
 'GL000009.2': 1,
 'KI270731.1': 2,
 'GL000205.2': 1,
 'GL000225.1': 1}

### Get transcript sequences

In [32]:
transcripts[0]

Transcript(transcript_id='ENST00000000233', transcript_name='ARF5-201', gene_id='ENSG00000004059', biotype='protein_coding', contig='7', start=127588411, end=127591700, strand='+', genome='GRCh38')

In [33]:
data.transcripts_by_name(transcript_name = 'ARF5-201')

[Transcript(transcript_id='ENST00000000233', transcript_name='ARF5-201', gene_id='ENSG00000004059', biotype='protein_coding', contig='7', start=127588411, end=127591700, strand='+', genome='GRCh38')]

In [34]:
data.gene_by_id(gene_id = 'ENSG00000004059')

Gene(gene_id='ENSG00000004059', gene_name='ARF5', biotype='protein_coding', contig='7', start=127588386, end=127591700, strand='+', genome='GRCh38')

In [11]:
len(transcripts)

246511