# Collection of examples for annotation using the GTF database and gffutils

In [1]:
import pandas as pd
import gffutils
import os
from collections import defaultdict

# Create the sqlite database file from a GTF file
- You might also be able to use this with a GFF file, but YMMV
- only need to build the database file once, then you can refer back to the .db file that was created

In [2]:
def build_db(annotation_file, db_file, force=True, disable_infer_genes=True, disable_infer_transcripts=True):
    """ 
    Creates a sqlite database containing features given a GTF or GFF file.
    """
    db = gffutils.create_db(
        annotation_file, dbfn=db_file, force=force, # change to True if we need to create a new db
        keep_order=True, merge_strategy='merge', sort_attribute_values=True,
        disable_infer_genes=disable_infer_genes,
        disable_infer_transcripts=disable_infer_transcripts
    )

annotation_file = '/projects/ps-yeolab/genomes/mm10/gencode/gencode.vM10.annotation.gtf'
db_file = '/projects/ps-yeolab/genomes/mm10/gencode/gencode.vM10.annotation.gtf.db'
# build_db(annotation_file, db_file, disable_infer_genes=False, disable_infer_transcripts=False)

# This returns all the featuretypes given a GTF (database) file.

In [3]:
def get_all_featuretypes(db):
    """
    Returns the possible featuretypes (3rd col of gtf file) from a db.
    """
    features = []
    for feature in db.featuretypes():
        features.append(feature)
    return features

db_file = '/projects/ps-yeolab4/genomes/dm6/dm6.db'
DATABASE = gffutils.FeatureDB(db_file)
get_all_featuretypes(DATABASE)

['CDS', 'exon', 'gene', 'start_codon', 'stop_codon', 'transcript']

# This returns a dictionary where keys are (Ensembl) gene IDs and the values are the gene names. 
- If you wanted the gencode-style gene IDs instead, replace:

    
    ```
    gene_name_dict[gene_id.split('.')[0]] = gene.attributes['gene_name'][0]
    ```
    
    
    with:
    
    
    ```
    gene_name_dict[gene_id.split] = gene.attributes['gene_name'][0]
    ```

In [4]:
def gene_id_to_name(db):
    """
    Returns a dictionary containing a gene_id:name translation
    Note: may be different if the 'gene_id' or 'gene_name' 
    keys are not in the source GTF file
    (taken from gscripts.region_helpers)
    """
    genes = db.features_of_type('gene')
    gene_name_dict = {}
    for gene in genes:
        gene_id = gene.attributes['gene_id'][0] if type(gene.attributes['gene_id']) == list else gene.attributes['gene_id']
        try:
            gene_name_dict[gene_id.split('.')[0]] = gene.attributes['gene_name'][0]  # this is for ENSEMBL-style IDs! 
        except KeyError:
            print(gene.attributes.keys())
            print("Warning. Key not found for {}".format(gene))
            return 1
    return gene_name_dict


db_file = '/projects/ps-yeolab4/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
DATABASE = gffutils.FeatureDB(db_file)
gene_id_to_name_dictionary = gene_id_to_name(DATABASE)
# gene_id_to_name_dictionary['ENSG00000100320.18']
gene_id_to_name_dictionary['ENSG00000100320']

  "method of this object." % self.version)


'RBFOX2'

# Here is an example of some expression data

In [5]:
csv = pd.read_table(
    '/projects/ps-yeolab3/iachaim/Cleber_Organoids/6_months/AGGREGATE-WT-H09-H10_expression.csv',
    sep=','
)
csv.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0.1,Unnamed: 0,AAACCTGAGATCTGAA-1,AAACCTGCATCCTTGC-1,AAACCTGGTCAGAGGT-1,AAACGGGCAATGTTGC-1,AAACGGGCAGTTCATG-1,AAACGGGGTGTGGTTT-1,AAACGGGGTTAAGAAC-1,AAACGGGTCAGTCCCT-1,AAAGATGAGCGCTCCA-1,...,TTTGGTTAGACCTAGG-2,TTTGGTTAGCCCAACC-2,TTTGGTTGTCCATCCT-2,TTTGGTTGTGTGGTTT-2,TTTGGTTTCACCCGAG-2,TTTGTCAAGACATAAC-2,TTTGTCACACACCGCA-2,TTTGTCACAGCTGTTA-2,TTTGTCAGTTGGGACA-2,TTTGTCATCCAGGGCT-2
0,ENSG00000243485,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ENSG00000237613,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ENSG00000186092,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ENSG00000238009,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ENSG00000239945,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Write a helper function that you can apply to all rows of the expression table

In [6]:
def geneid2name(row, d=gene_id_to_name_dictionary):
    try:
        return d[row['Unnamed: 0']]
    except KeyError:
        return row['Unnamed: 0']

csv['gene_name'] = csv.apply(geneid2name, axis=1)
csv.head()

Unnamed: 0.1,Unnamed: 0,AAACCTGAGATCTGAA-1,AAACCTGCATCCTTGC-1,AAACCTGGTCAGAGGT-1,AAACGGGCAATGTTGC-1,AAACGGGCAGTTCATG-1,AAACGGGGTGTGGTTT-1,AAACGGGGTTAAGAAC-1,AAACGGGTCAGTCCCT-1,AAAGATGAGCGCTCCA-1,...,TTTGGTTAGCCCAACC-2,TTTGGTTGTCCATCCT-2,TTTGGTTGTGTGGTTT-2,TTTGGTTTCACCCGAG-2,TTTGTCAAGACATAAC-2,TTTGTCACACACCGCA-2,TTTGTCACAGCTGTTA-2,TTTGTCAGTTGGGACA-2,TTTGTCATCCAGGGCT-2,gene_name
0,ENSG00000243485,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,MIR1302-11
1,ENSG00000237613,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,FAM138A
2,ENSG00000186092,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,OR4F5
3,ENSG00000238009,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,RP11-34P13.7
4,ENSG00000239945,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,RP11-34P13.8


# Other helpful functions that pull out other kinds of information (whether or not a gene is protein coding, etc.)

In [7]:
def find_protein_coding_genes_num(db):
    """
    Finds the number of protein coding genes in the database.
    """
    all_pc_count = 0
    pc_count = 0
    genes = db.features_of_type('gene')
    for gene in genes:
        pc = False
        for gene_type in gene.attributes['gene_type']:
            if gene_type == 'protein_coding':
                pc = True
                all_pc_count += 1
        if pc:
            pc_count += 1
    return pc_count

db_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
DATABASE = gffutils.FeatureDB(db_file)

find_protein_coding_genes_num(DATABASE)

  "method of this object." % self.version)


20345

In [8]:
def gene_id_to_protein_coding(db):
    """
    returns whether or not a gene is protein coding or not.
    """
    genes = db.features_of_type('gene')
    gene_name_dict = {}
    for gene in genes:
        gene_id = gene.attributes['gene_id'][0] if type(gene.attributes['gene_id']) == list else gene.attributes['gene_id']
        try:
            gene_name_dict[gene_id] = gene.attributes['gene_type'][0]
        except KeyError:
            print(gene.attributes.keys())
            print("Warning. Key not found for {}".format(gene))
            return 1
    return gene_name_dict

db_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
DATABASE = gffutils.FeatureDB(db_file)

gene_id_to_pc = gene_id_to_protein_coding(DATABASE)
gene_id_to_pc['ENSG00000100320.18']

'protein_coding'

In [9]:
def gene_name_to_id(db):
    '''
    given a gene name, returns a list of associated Gene IDs (one-to-many)
    '''
    genes = db.features_of_type('gene')
    gene_name_dict = defaultdict(list)
    for gene in genes:
        try:
            gene_name_dict[gene.attributes['gene_name'][0]].append(gene.attributes['gene_id'][0])
        except KeyError as e:
            print("Warning. Key not found for {}".format(gene))
            return 1
    return gene_name_dict

db_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
DATABASE = gffutils.FeatureDB(db_file)

gene_name_to_id_dictionary = gene_name_to_id(DATABASE)
gene_name_to_id_dictionary['RBFOX2']

['ENSG00000100320.18']

In [10]:
def gene_name_to_transcript(db):
    '''
    given a gene name, returns a list of associated transcript IDs (one-to-many)
    '''
    genes = db.features_of_type('transcript')
    gene_name_dict = defaultdict(list)
    for gene in genes:
        try:
            gene_name_dict[gene.attributes['gene_name'][0]].append(gene.attributes['transcript_id'][0])
        except KeyError as e:
            print("Warning. Key not found for {}".format(gene))
            return 1
    return gene_name_dict

db_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
DATABASE = gffutils.FeatureDB(db_file)

gene_name_to_id_dictionary = gene_name_to_transcript(DATABASE)
gene_name_to_id_dictionary['RBFOX2']

['ENST00000405409.2',
 'ENST00000414461.2',
 'ENST00000449924.2',
 'ENST00000262829.7',
 'ENST00000397303.2',
 'ENST00000359369.4',
 'ENST00000463509.1',
 'ENST00000416721.2',
 'ENST00000495377.2',
 'ENST00000438146.2',
 'ENST00000473487.2',
 'ENST00000408983.2',
 'ENST00000491982.1',
 'ENST00000397305.3']

In [11]:
def transcript_to_gene_id(db):
    '''
    given a transcript ID, return a gene ID (expect one to one)
    '''
    genes = db.features_of_type('transcript')
    transcript_gene_dict = defaultdict(list)
    for gene in genes:
        try:
            transcript_gene_dict[gene.attributes['transcript_id'][0]].append(gene.attributes['gene_id'][0])
        except KeyError as e:
            print("Warning. Key not found for {}".format(gene))
            return 1
    return transcript_gene_dict

db_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
DATABASE = gffutils.FeatureDB(db_file)

transcript_to_gene_dictionary = transcript_to_gene_id(DATABASE)
transcript_to_gene_dictionary['ENST00000416721.2']

['ENSG00000100320.18']

In [12]:
def id_to_exons(db, identifier):
    '''
    takes the gene or transcript id and returns exon positions
    '''
    exons = []
    for i in db.children(identifier, featuretype='exon', order_by='start'):
        exons.append(i)
    return exons

db_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
DATABASE = gffutils.FeatureDB(db_file)

id_to_exons(DATABASE,'ENST00000473487.2')

[<Feature exon (chr22:36156057-36156067[-]) at 0x2b0c91d05b70>,
 <Feature exon (chr22:36157249-36157341[-]) at 0x2b0caef78f98>,
 <Feature exon (chr22:36157462-36157515[-]) at 0x2b0cae296dd8>,
 <Feature exon (chr22:36161470-36161530[-]) at 0x2b0cb021f748>,
 <Feature exon (chr22:36164304-36164396[-]) at 0x2b0cb021f4e0>,
 <Feature exon (chr22:36174072-36174125[-]) at 0x2b0cb021f2e8>,
 <Feature exon (chr22:36177647-36177790[-]) at 0x2b0cae5daf60>,
 <Feature exon (chr22:36205827-36206051[-]) at 0x2b0cb0214c88>,
 <Feature exon (chr22:36220367-36220420[-]) at 0x2b0cb0214da0>]

In [13]:
def position_to_features(db, chrom, start, end, strand='', completely_within=True):
    '''
    takes a coordinate and returns all the features overlapping 
    (either completely contained or partially overlapping the region).
    '''
    if strand == '+' or strand == '-':
        return list(
            db.region(
                region=(chrom, start, end), strand=strand, completely_within=completely_within
            )
        )
    else:
        return list(
            db.region(
                region=(chrom, start, end), completely_within=completely_within
            )
        )
# get all features corresponding to the genomic coordinates (True if feature must be entirely contained within region)
features = position_to_features(DATABASE,'chr19', 1000000, 1000100, completely_within=True)
# print all gene names associated with these features
# print([f.attributes['gene_name'] for f in features])

# Example of how to get introns from a GTF file using gffutils

In [14]:
db_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
DATABASE = gffutils.FeatureDB(db_file)

exons = id_to_exons(DATABASE,'ENST00000473487.2')  # re-use a function above
exons

  "method of this object." % self.version)


[<Feature exon (chr22:36156057-36156067[-]) at 0x2b0ced15db70>,
 <Feature exon (chr22:36157249-36157341[-]) at 0x2b0ced15da90>,
 <Feature exon (chr22:36157462-36157515[-]) at 0x2b0ced15db38>,
 <Feature exon (chr22:36161470-36161530[-]) at 0x2b0ced15dda0>,
 <Feature exon (chr22:36164304-36164396[-]) at 0x2b0cedcde048>,
 <Feature exon (chr22:36174072-36174125[-]) at 0x2b0cedcde2b0>,
 <Feature exon (chr22:36177647-36177790[-]) at 0x2b0cedcde518>,
 <Feature exon (chr22:36205827-36206051[-]) at 0x2b0cedcde780>,
 <Feature exon (chr22:36220367-36220420[-]) at 0x2b0cedcde9e8>]

In [16]:
introns = DATABASE.interfeatures(
    exons, 
    new_featuretype=None, 
    merge_attributes=True, 
    dialect=None, 
    attribute_func=None, 
    update_attributes=None
)
for i in introns:
    print(i)

chr22	gffutils_derived	inter_exon_exon	36156068	36157248	.	-	.	gene_status "KNOWN"; exon_number "9,8"; level "2"; transcript_type "protein_coding"; tag "cds_end_NF,mRNA_end_NF"; protein_id "ENSP00000475142.1"; gene_id "ENSG00000100320.18"; exon_id "ENSE00003472559.1,ENSE00001576597.1"; transcript_id "ENST00000473487.2"; havana_transcript "OTTHUMT00000319341.2"; havana_gene "OTTHUMG00000150585.10"; transcript_name "RBFOX2-014"; gene_type "protein_coding"; transcript_status "PUTATIVE"; gene_name "RBFOX2";
chr22	gffutils_derived	inter_exon_exon	36157342	36157461	.	-	.	gene_status "KNOWN"; exon_number "9,7"; level "2"; transcript_type "protein_coding"; tag "cds_end_NF,mRNA_end_NF"; protein_id "ENSP00000475142.1"; gene_id "ENSG00000100320.18"; exon_id "ENSE00003472559.1,ENSE00001583934.1"; transcript_id "ENST00000473487.2"; havana_transcript "OTTHUMT00000319341.2"; havana_gene "OTTHUMG00000150585.10"; transcript_name "RBFOX2-014"; gene_type "protein_coding"; transcript_status "PUTATIVE"; ge

# Misc scratch space for showing how to hash elements based on position. 
- Generally puts into memory, speeding up position/gene lookup.
- I've implemented this into my annotator script and RBPmaps code, but it doesn't seem to work here...

In [14]:
from collections import defaultdict


def hash_features(db):
    '''
    hashes features by position.
    '''
    genes = defaultdict(list)
    for element in db.region(seqid=chrom):
        start = int(element.start / 1000000)
        end = int(element.end / 1000000)
        genes[chrom, start, end].append(element)
    return genes
# get all features corresponding to the genomic coordinates (True if feature must be entirely contained within region)
genes = chrom_to_features(DATABASE,'chr19')
# print all gene names associated with these features
# print([f.attributes['gene_name'] for f in features])

In [15]:
start = 1000400
end = 1000440

overlapped = []

start_key = int(start / 1000000)
end_key = int(end / 1000000)

for gene in genes[chrom, start_key, end_key]:
    if gene.start > start and gene.start < end:
        overlapped.append(gene)
    elif gene.end > start and gene.end < end:
        overlapped.append(gene)
        
overlapped

NameError: name 'genes' is not defined

In [None]:
ret = DATABASE.execute("SELECT seqid FROM features").fetchall()
all_chromosomes = [r['seqid'] for r in ret]

In [None]:
from tqdm import tnrange, tqdm_notebook

genes = DATABASE.features_of_type('gene')
progress = tnrange(48440)

ct = 0
newgenes = []
for gene in genes:
    if ct > 10000:
        break
    gene.attributes['transcript_id'] = gene.attributes['gene_id']
    newgenes.append(gene)
    progress.update(1)
    ct+=1

In [None]:
DATABASE.update((n for n in newgenes), )

In [None]:
# Feature objects embed all information as a dictionary
# See: http://pythonhosted.org/gffutils/attributes.html

DEFAULT_FEATURE_TYPE_PRIORITY = [
    'UTR','gene','transcript','exon','start_codon','stop_codon','Selenocysteine', 'CDS'
]

DEFAULT_TRANSCRIPT_TYPE_PRIORITY = [
    'retained_intron', 'protein_coding','pseudogene','rRNA', 'processed_transcript', 'antisense'
]

priority = DEFAULT_TRANSCRIPT_TYPE_PRIORITY
"""
for f in features:
    pass
    print(
        '{}, {}, {}, {}'.format(
            f.attributes['gene_name'], # list of associated gene names
            f.start, # start of feature
            f.end, # end of feature
            priority.index(f.attributes['transcript_type'][0])
        ) # type of feature
    )"""

In [None]:
f_priority = DEFAULT_FEATURE_TYPE_PRIORITY
t_priority = DEFAULT_TRANSCRIPT_TYPE_PRIORITY

features.sort(
    key=lambda x: t_priority.index(
        x.attributes['gene_type'][0]
    ), reverse=False
) # sort gene type
first_filter = [
    f for f in features if features[0].attributes['transcript_type'] == f.attributes['transcript_type']
]
first_filter.sort(
    key=lambda x: f_priority.index(
        x.featuretype
    ), reverse=False
)
second_filter = [
    f for f in first_filter if first_filter[0].featuretype == f.featuretype
]
# [f.attributes['transcript_type'] for f in features]
second_filter