In [1]:
import pandas as pd
import numpy as np
import os
import gffutils
import pybedtools
from collections import defaultdict

# AS structure mirrors gtf/gff file, except the last attributes column contains premrna_length and mrna_length
- one line per gene
- 2nd column contains "AS_STRUCTURE"
- mrna_lengths = (longest transcript) exons regions
- premrna_lengths = gene (longest transcript) start - stop

In [2]:
db_file = '/projects/ps-yeolab/genomes/hg19/gencode_v17/gencode.v17.annotation.gtf.db'
db = gffutils.FeatureDB(db_file)

In [3]:
def get_keys(species):
    """
    Describes the language for each kind of gtf file 
    (usually different between sources).

    :param species: string
        either one of: 'hg19','mm10','ce11','mm9','hg38'
    :return:
    """

    if species == 'ce10':
        print("species is {}".format(species))
        cds_key = 'CDS'
        utr3_key = 'three_prime_UTR'
        utr5_key = 'five_prime_UTR'
        utr_key = None
        gene_key = 'gene'
        gene_name_key = 'gene_name'
        transcript_id_key = 'transcript_id'
        type_key = 'transcript_biotype'
        exon_key = 'exon'
        gene_id_key = 'gene_id'
        gene_type_key = 'gene_biotype'
    elif species == 'ce11':
        print("species is {}".format(species))
        cds_key = 'CDS'
        utr3_key = 'three_prime_utr'
        utr5_key = 'five_prime_utr'
        utr_key = None
        gene_key = 'gene'
        gene_name_key = 'gene_name'
        transcript_id_key = 'transcript_id'
        type_key = 'transcript_biotype'
        exon_key = 'exon'
        gene_id_key = 'gene_id'
        gene_type_key = 'gene_biotype'
    else:
        print("species is {}".format(species))
        cds_key = 'CDS'
        utr3_key = None  #
        utr5_key = None  # in human/mice, this key doesn't exist
        utr_key = 'UTR'
        gene_key = 'gene'
        gene_name_key = 'gene_name'
        transcript_id_key = 'transcript_id'
        type_key = 'transcript_type'
        exon_key = 'exon'
        gene_id_key = 'gene_id'
        gene_type_key = 'gene_type'

    keys = {
        'cds': cds_key,
        'utr3': utr3_key,
        'utr5': utr5_key,
        'utr': utr_key,
        'gene': gene_key,
        'gene_name': gene_name_key,
        'transcript_id': transcript_id_key,
        'transcript_type': type_key,
        'exon': exon_key,
        'gene_id': gene_id_key,
        'gene_type': gene_type_key
    }
    return keys


In [4]:
def get_gene_to_transcript_dict(db, gene_id_key, transcript_id_key):
    genes_dict = defaultdict(list)
    for gene_feature in db.features_of_type('transcript'):
        for gene_id in gene_feature.attributes[gene_id_key]:
            for transcript_id in gene_feature.attributes[transcript_id_key]:
                genes_dict[gene_id].append(transcript_id)
    return genes_dict
    
def get_all_exons_dict(db, transcript_id_key):
    """
    Returns dictionary of exons as transcript_id:{
        [
            {'start':START, 'end':END},
            {'start':START, 'end':END},
            ...
        ]
    }.

    :param db: gffutils.FeatureDB
    :param transcript_id_key: string
    :return: 
    """

    exons_dict = defaultdict(list)
    for exon_feature in db.features_of_type('exon'):
        for transcript_id in exon_feature.attributes[transcript_id_key]:
            exons_dict[transcript_id].append(
                {
                    'chrom': exon_feature.seqid,
                    'start': exon_feature.start,
                    'end': exon_feature.end,
                    'strand': exon_feature.strand,
                }
            )
    return exons_dict


def get_all_transcripts_dict(db, transcript_id_key):
    """
    Returns dictionary of transcript_id:{'start':START, 'end':END}.

    :param db: gffutils.FeatureDB
    :param transcript_id_key: string
    :return: transcripts_dict: defaultdict(dict)
        hash of transcripts and their start/end coordinates
    """
    transcripts_dict = defaultdict(dict)
    for transcript_feature in db.features_of_type('transcript'):
        for transcript_id in transcript_feature.attributes[transcript_id_key]:
            transcripts_dict[transcript_id] = {
                'start': transcript_feature.start,
                'end': transcript_feature.end
            }
    return transcripts_dict

In [5]:
keys = get_keys('hg19')
keys

species is hg19


{'cds': 'CDS',
 'exon': 'exon',
 'gene': 'gene',
 'gene_id': 'gene_id',
 'gene_name': 'gene_name',
 'gene_type': 'gene_type',
 'transcript_id': 'transcript_id',
 'transcript_type': 'transcript_type',
 'utr': 'UTR',
 'utr3': None,
 'utr5': None}

In [6]:
genes = get_gene_to_transcript_dict(db, keys['gene_id'], keys['transcript_id'])
exons = get_all_exons_dict(db, keys['transcript_id'])
transcripts = get_all_transcripts_dict(db, keys['transcript_id'])

# The following code gives me the length of the longest transcript
- expected behavior?

In [7]:
def get_longest_transcripts(genes_dict, transcripts_dict):
    """
    Returns a dictionary of genes : longest_transcript
    """
    longest_genes_dict = defaultdict(dict)
    for gene, transcripts in genes_dict.iteritems():
        max_transcript_len = -1
        max_transcript = ""
        for transcript in transcripts:
            transcript_len = transcripts_dict[transcript]['end'] - transcripts_dict[transcript]['start']
            if transcript_len > max_transcript_len:
                max_transcript_len = transcript_len
                max_transcript = transcript
        longest_genes_dict[gene] = max_transcript
    return longest_genes_dict

In [8]:
longest_transcripts = get_longest_transcripts(genes, transcripts)

In [9]:
def get_premrna_lengths(gene_id, longest_transcripts, transcripts_dict):
    pos = transcripts_dict[longest_transcripts[gene_id]]
    print(pos['start'], pos['end'], longest_transcripts[gene_id])
    return pos['end'] - pos['start'] + 1  # (coords are 1-based inclusive still)

get_premrna_lengths('ENSG00000269416.1', longest_transcripts, transcripts)

(23582035, 23598873, 'ENST00000600643.1')


16839

# The following code gives me the length of the (most upstream - most downstream) transcript position
- expected behavior?

In [10]:
def most_upstream_downstream_positions(genes_dict, transcripts_dict):
    """
    Assumes the most downstream start < most downstream end (+, flipped for -) foreach transcript.
    Returns a dictionary of genes : 
    """
    d = defaultdict(dict)
    for gene, transcripts in genes_dict.iteritems():
        min_transcript_pos = 1000000000  # as long as we don't have any chromosomes larger than 1 billion
        max_transcript_pos = -1
        for transcript in transcripts:
            if transcripts_dict[transcript]['end'] > max_transcript_pos:
                max_transcript_pos = transcripts_dict[transcript]['end']
            if transcripts_dict[transcript]['start'] < min_transcript_pos:
                min_transcript_pos = transcripts_dict[transcript]['start']
                
        d[gene] = {'start':min_transcript_pos, 'end':max_transcript_pos}
    return d

In [24]:
upstream_downstream = most_upstream_downstream_positions(genes, transcripts)

In [12]:
def get_premrna_lengths(gene_id, upstream_downstream, transcripts_dict):
    start = upstream_downstream[gene_id]['start']
    end = upstream_downstream[gene_id]['end'] 
    return end - start + 1  # (coords are 1-based inclusive still)

get_premrna_lengths('ENSG00000095587.8', upstream_downstream, transcripts)

149313

In [48]:
def get_mrna_lengths(gene_id, exons_dict, genes_dict):
    exons_list = pybedtools.BedTool()
    total_mrna_length = 0
    for transcript in genes_dict[gene_id]:
        for exon in exons_dict[transcript]:
            exons_list.append(pybedtools.create_interval_from_list(
                [exon['chrom'],
                str(exon['start'] - 1),  # since we're converting to bedtool, use 0-based
                str(exon['end']),
                transcript,
                '0',
                exon['strand']]
            ))
    exons_list = exons_list.sort().merge()  # not strand specific
    total_mrna_length = [(e.end - e.start) for e in exons_list]
    return total_mrna_length
    # return exons_list.sort().total_coverage()

In [49]:
%%timeit
get_mrna_lengths('ENSG00000095587.8', exons, genes)

AttributeError: 'BedTool' object has no attribute 'append'

In [26]:
%%timeit
get_premrna_lengths('ENSG00000095587.8', upstream_downstream, transcripts)

The slowest run took 29.90 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 335 ns per loop


In [27]:
from tqdm import trange, tqdm_notebook, tnrange
out_file= '/home/bay001/projects/codebase/annotator/tests/create_as_structure/outputs/test/hg19.AS.STRUCTURE.COMPILED.gff'
progress = tnrange(db.count_features_of_type(keys['gene']))
with open(out_file, 'w') as f:
    for gene in db.features_of_type('gene'):
        gene_id = gene.attributes[keys['gene_id']][0]  # just take first gene_id
        f.write(
            '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                gene.seqid,
                "AS_STRUCTURE",
                "gene",
                upstream_downstream[gene_id]['start'],
                upstream_downstream[gene_id]['end'],
                '.',
                gene.strand,
                '.',
                "gene_id={};mrna_length={};premrna_length={}".format(
                    gene_id,
                    get_mrna_lengths(gene_id, exons, genes),
                    get_premrna_lengths(gene_id, upstream_downstream, transcripts)
                )
            )
        )
        progress.update(1)




Exception in thread Thread-5:
Traceback (most recent call last):
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



# create gencode v17 db for comparison against old

In [17]:
gtf_file = '/projects/ps-yeolab/genomes/hg19/gencode_v17/gencode.v17.annotation.gtf'
db_file = '/projects/ps-yeolab/genomes/hg19/gencode_v17/gencode.v17.annotation.gtf.db'

In [18]:
def build_db(annotation_file, db_file, force=True, disable_infer_genes=True, disable_infer_transcripts=True):
    db = gffutils.create_db(
        annotation_file, dbfn=db_file, force=force, # change to True if we need to create a new db
        keep_order=True, merge_strategy='merge', sort_attribute_values=True,
        disable_infer_genes=disable_infer_genes,
        disable_infer_transcripts=disable_infer_transcripts
    )
    
# build_db(gtf_file, db_file, disable_infer_genes=True, disable_infer_transcripts=True)

In [21]:

db.count_features_of_type('gene')

57281