# Collection of examples for annotation using the GTF database and gffutils

In [1]:
import pandas as pd
import gffutils
import os
from collections import defaultdict

In [2]:
# gffutils database file (the path suggests the GENCODE v19 / hg19 annotation).
db_file = '/projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
# Database handle that is passed as `db` to the helper functions in this file.
DATABASE = gffutils.FeatureDB(db_file)

In [3]:
def gene_id_to_name(db):
    '''
    Returns a dictionary containing a gene_id:name translation, built
    from every 'gene' feature in ``db``.

    Attribute values may be stored either as a list (first element is
    used) or as a bare value (used as-is); both forms are accepted for
    'gene_id' and for 'gene_name'.

    Note: may be different if the 'gene_id' or 'gene_name'
    keys are not in the source GTF file
    (taken from gscripts.region_helpers)

    Parameters
    ----------
    db : object exposing ``features_of_type`` (e.g. gffutils.FeatureDB)

    Returns
    -------
    dict mapping gene_id -> gene_name, or the integer 1 (after printing
    a warning) as soon as a gene without a 'gene_name' attribute is met.
    A gene without 'gene_id' raises KeyError.
    '''
    def first_value(value):
        # a list-valued attribute contributes its first entry; a bare
        # value (e.g. a plain string) must not be indexed, or only its
        # first character would be kept
        return value[0] if isinstance(value, list) else value

    gene_name_dict = {}
    for gene in db.features_of_type('gene'):
        gene_id = first_value(gene.attributes['gene_id'])
        try:
            gene_name_dict[gene_id] = first_value(gene.attributes['gene_name'])
        except KeyError:
            print(gene.attributes.keys())
            print("Warning. Key not found for {}".format(gene))
            return 1
    return gene_name_dict

# Build the gene_id -> gene_name lookup once, then translate a single gene ID.
gene_id_to_name_dictionary = gene_id_to_name(DATABASE)
gene_id_to_name_dictionary['ENSG00000100320.18']

'RBFOX2'

In [4]:
def gene_name_to_id(db):
    '''
    Build a gene_name -> list of gene IDs lookup (one-to-many) from the
    'gene' features of ``db``.

    Returns a defaultdict(list). If a feature lacks one of the two
    attributes, a warning is printed and 1 is returned instead.
    '''
    ids_by_name = defaultdict(list)
    for feature in db.features_of_type('gene'):
        try:
            name = feature.attributes['gene_name'][0]
            ids_by_name[name].append(feature.attributes['gene_id'][0])
        except KeyError:
            print("Warning. Key not found for {}".format(feature))
            return 1
    return ids_by_name

# Build the gene_name -> gene IDs lookup, then list the IDs recorded for RBFOX2.
gene_name_to_id_dictionary = gene_name_to_id(DATABASE)
gene_name_to_id_dictionary['RBFOX2']

['ENSG00000100320.18']

In [5]:
def gene_name_to_transcript(db):
    '''
    Build a gene_name -> list of transcript IDs lookup (one-to-many)
    from the 'transcript' features of ``db``.

    Returns a defaultdict(list). If a feature lacks one of the two
    attributes, a warning is printed and 1 is returned instead.
    '''
    transcripts_by_name = defaultdict(list)
    for transcript in db.features_of_type('transcript'):
        try:
            name = transcript.attributes['gene_name'][0]
            transcripts_by_name[name].append(transcript.attributes['transcript_id'][0])
        except KeyError:
            print("Warning. Key not found for {}".format(transcript))
            return 1
    return transcripts_by_name

# Build the gene_name -> transcript IDs lookup, then list the transcripts recorded for RBFOX2.
# NOTE(review): this rebinds gene_name_to_id_dictionary, so the name now holds transcript IDs rather than gene IDs.
gene_name_to_id_dictionary = gene_name_to_transcript(DATABASE)
gene_name_to_id_dictionary['RBFOX2']

['ENST00000405409.2',
 'ENST00000414461.2',
 'ENST00000449924.2',
 'ENST00000262829.7',
 'ENST00000397303.2',
 'ENST00000359369.4',
 'ENST00000463509.1',
 'ENST00000416721.2',
 'ENST00000495377.2',
 'ENST00000438146.2',
 'ENST00000473487.2',
 'ENST00000408983.2',
 'ENST00000491982.1',
 'ENST00000397305.3']

In [6]:
def id_to_exons(db, identifier):
    '''
    Collect the 'exon' children of a gene or transcript id.

    Returns a list of whatever ``db.children`` yields for that id,
    requested with featuretype='exon' and ordered by 'start'.
    '''
    return list(db.children(identifier, featuretype='exon', order_by='start'))

# Example: exons of a single transcript ID, ordered by start coordinate.
id_to_exons(DATABASE,'ENST00000473487.2')

[<Feature exon (chr22:36156057-36156067[-]) at 0x2ab85e08da10>,
 <Feature exon (chr22:36157249-36157341[-]) at 0x2ab85e08db90>,
 <Feature exon (chr22:36157462-36157515[-]) at 0x2ab85e0993d0>,
 <Feature exon (chr22:36161470-36161530[-]) at 0x2ab85e099650>,
 <Feature exon (chr22:36164304-36164396[-]) at 0x2ab85e0996d0>,
 <Feature exon (chr22:36174072-36174125[-]) at 0x2ab85e9a3590>,
 <Feature exon (chr22:36177647-36177790[-]) at 0x2ab85e9a3910>,
 <Feature exon (chr22:36205827-36206051[-]) at 0x2ab85e9a3c10>,
 <Feature exon (chr22:36220367-36220420[-]) at 0x2ab860aa3090>]

In [87]:
def position_to_features(db, chrom, start, end, strand='', completely_within=True):
    '''
    Return, as a list, the features that ``db.region`` reports for the
    coordinate (chrom, start, end).

    ``completely_within`` is forwarded unchanged (features must be fully
    contained when True, may partially overlap when False). ``strand``
    is forwarded only when it is '+' or '-'; any other value leaves the
    strand argument out of the query.
    '''
    query = {
        'region': (chrom, start, end),
        'completely_within': completely_within,
    }
    if strand in ('+', '-'):
        query['strand'] = strand
    return list(db.region(**query))
# Get all features for the genomic coordinates chr19:1000000-1001000.
# completely_within=True keeps only features entirely contained within the region.
features = position_to_features(DATABASE,'chr19', 1000000, 1001000, completely_within=True)

# Print the gene_name attribute (a list per feature) of every returned feature.
print([f.attributes['gene_name'] for f in features])

[['GRIN3B'], ['GRIN3B'], ['GRIN3B'], ['GRIN3B']]


In [88]:
# Feature objects expose their GTF attributes through a dictionary-like `attributes` member.
# See: http://pythonhosted.org/gffutils/attributes.html

for f in features:
    print('{}, {}, {}, {}'.format(
        f.attributes['gene_name'], # list of associated gene names
        f.start, # start of feature
        f.end, # end of feature
        f.attributes['transcript_type'])) # list of transcript types, e.g. ['protein_coding']

['GRIN3B'], 1000418, 1000862, ['protein_coding']
['GRIN3B'], 1000437, 1000862, ['protein_coding']
['GRIN3B'], 1000437, 1000439, ['protein_coding']
['GRIN3B'], 1000418, 1000436, ['protein_coding']


# Intersect a list of genomic coordinates with features
- For each region, return a list of the features that are completely contained within that region.
- Regions that do not contain any features get an empty list.

In [89]:
# Load the regions to annotate: columns are named chrom/start/end and column 0 is used as the row index.
df = pd.read_table('/projects/ps-yeolab3/bay001/annotations/small_bed.bed3', names=['chrom','start','end'], index_col=0)
df

Unnamed: 0,chrom,start,end
region1,chr1,10001000,10002000
region2,chr1,10010000,10011000
region3,chr1,10200000,10202000
region4,chr1,13000000,13003000
region5,chr1,40000000,40004000


In [90]:
# Map each region name (row index of df) to the list of features that are
# completely contained within that region.
features = {}
for ix, row in df.iterrows():
    # completely_within must be passed by keyword: the fifth positional
    # parameter of position_to_features is `strand`, so a positional True
    # would be taken as the strand and only work thanks to the default.
    features[ix] = position_to_features(
        DATABASE, row['chrom'], row['start'], row['end'], completely_within=True
    )

In [95]:
# For every region, print one line per contained feature: region name,
# feature type, transcript type and gene name.
# .items() is used instead of the Python-2-only .iteritems() so the loop
# runs on both Python 2 and Python 3.
for name, region_list in features.items():
    for region in region_list:
        print(name, region.featuretype, region.attributes['transcript_type'][0], region.attributes['gene_name'][0])


('region5', 'exon', 'antisense', 'RP11-69E11.4')
('region4', 'exon', 'protein_coding', 'PRAMEF6')
('region4', 'CDS', 'protein_coding', 'PRAMEF6')
('region4', 'start_codon', 'protein_coding', 'PRAMEF6')
('region4', 'exon', 'protein_coding', 'PRAMEF6')
('region4', 'CDS', 'protein_coding', 'PRAMEF6')
('region4', 'UTR', 'protein_coding', 'PRAMEF6')
('region4', 'exon', 'protein_coding', 'PRAMEF6')


# Using with pybedtools

In [12]:
import pybedtools

In [66]:
# Build one minus-strand interval (chr1, 13000000-13003000, named 'some_interval') from BED-style fields.
interval = pybedtools.create_interval_from_list(['chr1','13000000','13003000','some_interval','0','-'])


In [67]:
def bedtool_to_features(db, interval, completely_within):
    """
    Return the features that fall in the region described by a
    pybedtools interval (either completely contained in it or partially
    overlapping it). The interval's chrom, start, end and strand are
    handed to ``position_to_features``.

    Parameters
    ----------
    db : gffutils.FeatureDB
        annotation database to query
    interval : pybedtools.Interval
        interval object
    completely_within : bool
        True if the features returned must be completely contained
        within the region. False if the features need only to be
        partially overlapping the region.

    Returns
    -------
    region_list: list
        list of Features corresponding to overlapping/contained
        features intersecting a region.
    """
    # NOTE(review): interval.start is passed through unchanged; if pybedtools
    # reports 0-based starts while the database expects 1-based coordinates,
    # a +1 adjustment may be needed - confirm against both libraries.
    # The helper defined in this file is position_to_features (plural);
    # the previous singular spelling was an undefined name.
    return position_to_features(
        db,
        interval.chrom,
        interval.start,
        interval.end,
        interval.strand,
        completely_within
    )

# Example: features completely contained within the interval built above.
bedtool_to_features(DATABASE, interval, True)

[<Feature exon (chr1:13002062-13002370[-]) at 0x2ab85eaef7d0>,
 <Feature CDS (chr1:13002062-13002348[-]) at 0x2ab85eaef950>,
 <Feature start_codon (chr1:13002346-13002348[-]) at 0x2ab85eaefed0>,
 <Feature exon (chr1:13000814-13001395[-]) at 0x2ab85f56ff90>,
 <Feature CDS (chr1:13000814-13001395[-]) at 0x2ab85eaeff50>,
 <Feature UTR (chr1:13002349-13002370[-]) at 0x2ab85f3c70d0>,
 <Feature exon (chr1:13002062-13002348[-]) at 0x2ab85f3c71d0>]

# Working with raw GTF files (no database)

In [119]:
gtf_file = '/projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf'
# gtf_file = '/projects/ps-yeolab3/bay001/annotations/c_elegans.PRJNA13758.WS257.canonical_geneset.gtf'
# Column names for the nine tab-separated GTF fields, used as `names=` when
# reading the file with pandas. Every name is unique (the two columns that
# were both called '.' are now 'score' and 'frame') because pandas rejects
# duplicate column names.
GTF_NAMES = ['chrom','source','feature_type','start','end','score','strand','frame','attributes']


In [114]:

def get_feature_type_set(gtf_file):
    """
    Collect the distinct feature types found in a GTF file
    (the values of the column named 'feature_type', i.e. the third
    entry of GTF_NAMES).
    This might be useful for figuring out the priority for annotation.

    Parameters
    ----------
    gtf_file
        file handed to pandas.read_table; lines are parsed with
        '#' treated as the comment character.

    Returns
    -------
    set
        the values of the 'feature_type' column
    """
    table = pd.read_table(gtf_file, comment='#', names=GTF_NAMES)
    feature_types = table['feature_type']
    return set(feature_types)


def get_attribute_type_set(gtf_file, attribute_type):
    """
    from a GTF file, extract the set of values recorded for one attribute
    (attributes are the key "value" fields contained within the 9th column)
    This might be useful for figuring out the priority for annotation.

    The attribute name is matched as a plain substring followed by a
    space and a double-quoted value, so e.g. 'biotype' also matches a key
    that merely ends in 'biotype'. Only values made of word characters,
    whitespace and '-' are captured.

    Parameters
    ----------
    gtf_file : basestring
        file handed to pandas.read_table ('#' starts a comment)
    attribute_type : basestring
        attribute key to look for; it is inserted into a regular
        expression as-is

    Returns
    -------
    set
        the distinct captured values; rows in which the attribute is not
        found are ignored rather than contributing NaN to the set.
    """

    gtf_df = pd.read_table(
        gtf_file,
        names=GTF_NAMES,
        comment='#'
    )
    # raw string: \w, \s and \d are regex escapes, not Python string escapes
    regex_filter = r'{} "([\w\s\d -]+)"'.format(attribute_type)
    values = gtf_df['attributes'].str.extract(regex_filter, expand=False)
    # unmatched rows come back as NaN; drop them so the set holds only real values
    return set(values.dropna())


In [115]:
# Feature types in the C. elegans GTF.
# NOTE(review): this output requires gtf_file to point at the C. elegans file
# (the commented-out path next to the gtf_file assignment).
get_feature_type_set(gtf_file)

{'CDS',
 'exon',
 'five_prime_utr',
 'gene',
 'start_codon',
 'stop_codon',
 'three_prime_utr',
 'transcript'}

In [118]:
# Values of the 'biotype' attribute in the C. elegans GTF.
# NOTE(review): this output requires gtf_file to point at the C. elegans file
# (the commented-out path next to the gtf_file assignment).
get_attribute_type_set(gtf_file, 'biotype')

{'antisense',
 'lincRNA',
 'miRNA',
 'ncRNA',
 'piRNA',
 'protein_coding',
 'pseudogene',
 'rRNA',
 'snRNA',
 'snoRNA',
 'tRNA'}

In [120]:
# Feature types in the human GENCODE GTF (gtf_file set to the gencode.v19 annotation path).
get_feature_type_set(gtf_file)

{'CDS',
 'Selenocysteine',
 'UTR',
 'exon',
 'gene',
 'start_codon',
 'stop_codon',
 'transcript'}

In [122]:
# Values of the 'transcript_type' attribute in the human GENCODE GTF
# (gtf_file set to the gencode.v19 annotation path).
get_attribute_type_set(gtf_file, 'transcript_type')

{'3prime_overlapping_ncrna',
 'IG_C_gene',
 'IG_C_pseudogene',
 'IG_D_gene',
 'IG_J_gene',
 'IG_J_pseudogene',
 'IG_V_gene',
 'IG_V_pseudogene',
 'Mt_rRNA',
 'Mt_tRNA',
 'TR_C_gene',
 'TR_D_gene',
 'TR_J_gene',
 'TR_J_pseudogene',
 'TR_V_gene',
 'TR_V_pseudogene',
 'antisense',
 'lincRNA',
 'miRNA',
 'misc_RNA',
 'non_stop_decay',
 'nonsense_mediated_decay',
 'polymorphic_pseudogene',
 'processed_pseudogene',
 'processed_transcript',
 'protein_coding',
 'pseudogene',
 'rRNA',
 'retained_intron',
 'sense_intronic',
 'sense_overlapping',
 'snRNA',
 'snoRNA',
 'transcribed_processed_pseudogene',
 'transcribed_unprocessed_pseudogene',
 'translated_processed_pseudogene',
 'unitary_pseudogene',
 'unprocessed_pseudogene'}