In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import glob
import seaborn as sns
import gffutils
from pathway import GO  # look at bfx/

In [2]:
# Notebook-relative working directories: the annotated BED file is read from
# input_dir, and the gene lists generated below are written under output_dir.
input_dir, output_dir = 'inputs/', 'outputs/'

In [3]:
# Output files, both placed under output_dir:
#   BG.txt - background gene IDs (the protein-coding genes)
#   GO.txt - gene IDs taken from the annotated BED file
bg_list = os.path.join(output_dir, 'BG.txt')
gene_list = os.path.join(output_dir, 'GO.txt')

In [4]:
# Annotation file handed to the GO helper; judging by the file name it maps
# hg19 Ensembl gene IDs to GO terms (presumably -- verify against pathway.GO).
# NOTE(review): hard-coded absolute path; the notebook will not run on a
# machine where this location does not exist.
go_file = '/projects/ps-yeolab3/bay001/annotations/hg19_ensg_to_go.txt'
# GO object from the local `pathway` package; its enrichment() method is
# called at the end of the notebook.
hg19go = GO.GO(go_file)

In [5]:
# Open the gffutils annotation database that the background gene set
# (all protein-coding genes) is extracted from.
# NOTE(review): hard-coded absolute path (GENCODE v19 / hg19 per the file
# name); the notebook will not run where this location does not exist.
db_file = '/projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
DATABASE = gffutils.FeatureDB(db_file)

def find_protein_coding_genes(db):
    """
    Collects the IDs of all protein coding genes in the database.

    Parameters
    ----------
    db : object exposing ``features_of_type(featuretype)``, such as a
        ``gffutils.FeatureDB``. Each yielded feature must provide an
        ``attributes`` mapping whose ``'gene_type'`` and ``'gene_id'``
        entries are iterables of strings.

    Returns
    -------
    set of str
        The gene IDs of every 'gene' feature that has a ``gene_type`` of
        ``'protein_coding'``, with everything from the first '.' onwards
        removed (``ENSG00000015479.13`` -> ``ENSG00000015479``). The set is
        empty when no such gene exists.

    Notes
    -----
    The attribute lookups are not guarded, so a gene feature lacking either
    attribute propagates the lookup error to the caller.
    """
    protein_coding_ids = set()
    for gene in db.features_of_type('gene'):
        is_protein_coding = any(
            gene_type == 'protein_coding'
            for gene_type in gene.attributes['gene_type']
        )
        if not is_protein_coding:
            continue
        for gene_id in gene.attributes['gene_id']:
            # just get the ensembl-style IDs, not gencode (drop the version)
            protein_coding_ids.add(gene_id.split('.')[0])
    return protein_coding_ids

# Set of versionless IDs of every protein-coding gene in DATABASE; written to
# bg_list and used as the enrichment background further down.
protein_coding_genes = find_protein_coding_genes(DATABASE)

  "method of this object." % self.version)


In [6]:
print(len(protein_coding_genes)) # number of protein coding genes. make sure it's reasonable (~20,000)
# Write one background gene ID per line. The IDs are sorted because iterating
# a set of strings yields a run-dependent order, which would make the file
# differ between otherwise identical runs of the notebook.
with open(bg_list, 'w') as f:
    for geneid in sorted(protein_coding_genes):
        f.write('{}\n'.format(geneid))

20345


In [7]:
# Load the annotated BED file (tab-separated, no header row) and label its
# eight columns.
# NOTE(review): in the preview below the 'name' and 'score' columns hold
# floating-point values -- confirm that these two labels match the file format.
bed_columns = ['chrom', 'start', 'end', 'name', 'score', 'strand', 'region', 'geneid']
annotated_bed = os.path.join(input_dir, '764.01v02.IDR.out.0102merged.bed.annotated')
genes = pd.read_table(annotated_bed, names=bed_columns)
genes.head()

Unnamed: 0,chrom,start,end,name,score,strand,region,geneid
0,chr5,138665777,138665826,5.290644,3.686779,+,3utr|contained|ENSG00000015479.13,ENSG00000015479.13
1,chr20,30824074,30824134,3.887475,5.240236,+,3utr|contained|ENSG00000101346.7,ENSG00000101346.7
2,chr3,141667691,141667722,5.369356,4.12904,-,3utr|contained|ENSG00000114126.13,ENSG00000114126.13
3,chr1,113667717,113667786,8.662595,4.978746,+,3utr|contained|ENSG00000198799.7,ENSG00000198799.7
4,chr1,113667654,113667717,5.413653,4.076449,+,3utr|contained|ENSG00000198799.7,ENSG00000198799.7


In [8]:
def get_ensembl_id(row):
    """
    Return the versionless gene ID for one row of the annotation table.

    The value under ``row['geneid']`` is cut at its first '.', so
    ``ENSG00000015479.13`` becomes ``ENSG00000015479``. When that value has
    no ``split`` method (for example a missing value stored as a float NaN),
    the placeholder ``'intergenic'`` is returned instead.
    """
    try:
        versioned_id = row['geneid']
        stable_id = versioned_id.split('.')[0]
    except AttributeError:
        # not a string, so there is no gene annotation to parse
        stable_id = 'intergenic'
    return stable_id
    
# Apply get_ensembl_id to every row (axis=1) and store the result in a new
# 'ensembl' column: the versionless gene ID, or 'intergenic' where the
# 'geneid' value is not a string. This adds the column to `genes` in place.
genes['ensembl'] = genes.apply(get_ensembl_id, axis=1)
genes.head()  # preview the table with the added column

Unnamed: 0,chrom,start,end,name,score,strand,region,geneid,ensembl
0,chr5,138665777,138665826,5.290644,3.686779,+,3utr|contained|ENSG00000015479.13,ENSG00000015479.13,ENSG00000015479
1,chr20,30824074,30824134,3.887475,5.240236,+,3utr|contained|ENSG00000101346.7,ENSG00000101346.7,ENSG00000101346
2,chr3,141667691,141667722,5.369356,4.12904,-,3utr|contained|ENSG00000114126.13,ENSG00000114126.13,ENSG00000114126
3,chr1,113667717,113667786,8.662595,4.978746,+,3utr|contained|ENSG00000198799.7,ENSG00000198799.7,ENSG00000198799
4,chr1,113667654,113667717,5.413653,4.076449,+,3utr|contained|ENSG00000198799.7,ENSG00000198799.7,ENSG00000198799


In [9]:
# Write the unique gene IDs, one per line, leaving out the 'intergenic'
# placeholder that get_ensembl_id assigns to rows without a gene ID.
# The IDs are sorted because iterating a set of strings yields a
# run-dependent order, which would make the file differ between otherwise
# identical runs of the notebook.
with open(gene_list, 'w') as f:
    for geneid in sorted(set(genes['ensembl']) - {'intergenic'}):
        f.write('{}\n'.format(geneid))

In [10]:
# Run the GO enrichment of the unique gene IDs against the protein-coding
# background. 'intergenic' is the placeholder get_ensembl_id assigns to rows
# without a gene ID, not a gene, so it is removed here -- this keeps the
# tested gene set identical to the list written to gene_list.
df = hg19go.enrichment(
    list(set(genes['ensembl']) - {'intergenic'}),
    background=list(protein_coding_genes)
)

                                                      Gene stable ID  \
GO term accession                                                      
GO:0005829         {ENSG00000167693, ENSG00000171408, ENSG0000006...   
GO:0005515         {ENSG00000167693, ENSG00000064703, ENSG0000011...   
GO:0005634         {ENSG00000167693, ENSG00000064703, ENSG0000019...   
GO:0005654         {ENSG00000102054, ENSG00000198157, ENSG0000023...   
GO:0005737         {ENSG00000124209, ENSG00000261772, ENSG0000018...   

                                                           Gene name  \
GO term accession                                                      
GO:0005829         {HIF3A, PGM2L1, CEL, FHIT, NCBP2, HSPA8, AGL, ...   
GO:0005515         {RNF14, DUOXA1, RNF17, RNF10, RNF11, RNF13, UB...   
GO:0005634         {HIF3A, HIST1H4B, HIST1H4A, RNF17, HIST1H4G, R...   
GO:0005654         {HIF3A, HIST1H4B, HIST1H4A, C17orf70, HIST1H4F...   
GO:0005737         {HIF3A, RNF14, ELANE, RNF17, RNF10, FHIT, NC