In [1]:
import os
import sys
from importlib import reload

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.insert(0,module_path)

import pandas as pd
from IPython.display import display
pd.set_option('max_colwidth', 70)
pd.set_option('display.max_rows', 500)
import string
import numpy as np

from orangecontrib.bioinformatics.geneset.__init__ import list_all

import jupyter_functions as jf
reload(jf)
import jupyter_functions as jf
from networks.functionsDENet import loadPickle, savePickle
import enrichment_library as enr
reload(enr)
import enrichment_library as enr
from stages_DE.stages_library import PHENOTYPES

In [2]:
# Root of the project data. Set the TIME_TRAJECTORIES_DATA environment variable to run the
# notebook on another machine; when it is unset the original location is used.
DATA_ROOT=os.environ.get('TIME_TRAJECTORIES_DATA','/home/karin/Documents/timeTrajectories/data/')
#Path for expression data (mergedGenes_RPKUM.tsv)
# The trailing '' keeps the final path separator, because file names are appended with '+'.
dataPath=os.path.join(DATA_ROOT,'RPKUM','combined','')
# Path for the table of selected genes used as enrichment queries
path_abberant=os.path.join(DATA_ROOT,'regulons','selected_genes','')

In [34]:
# List every gene set available for organism '44689' and load them for enrichment.
# go_slims and set_sizes are forwarded unchanged to enr.get_gene_sets.
gene_sets=[*list_all(organism='44689')]
GENE_SETS_ONTOLOGY=enr.get_gene_sets(
    gene_set_names=gene_sets,
    go_slims=True,
    set_sizes=(5,500),
)

In [30]:
# Expression table with gene names in the index (first column of the file)
genes=pd.read_csv(dataPath+'mergedGenes_RPKUM.tsv', index_col=0, sep='\t')
# Names of genes that have at least one non-zero value across the columns
all_gene_names=genes.loc[genes.ne(0).any(axis=1)].index
# For conversion of gene names to EID (presumably keyed by gene name, as key_entrez=False)
NAME_EID=enr.name_genes_entrez(gene_names=all_gene_names, key_entrez=False)
# EIDs of all retained genes; used as the enrichment reference
ALL_GENE_NAMES_EID=enr.convert_EID(genes=all_gene_names, name_EID=NAME_EID)

In [31]:
def group_diff_enrichment(query_names,group:str,padj:float=0.25,min_overlap:int=None,
                          use_annotated_genes:bool=False):
    """
    Run gene set enrichment for one group of query genes, print a summary and display the
    resulting table.

    Displays only gene sets that have overlap with query greater or equal to min_overlap.
    For p value and padj calculation enr.gene_set_enrichment uses all gene sets that have
    overlap >=1 (per the original author's note; the helper is defined outside this notebook).

    Relies on the notebook-level NAME_EID, ALL_GENE_NAMES_EID and GENE_SETS_ONTOLOGY.

    :param query_names: Gene names of the query group; converted to EIDs with enr.convert_EID.
    :param group: Label of the group, used only in the printed header.
    :param padj: Passed to enr.gene_set_enrichment as padj_threshold and printed as the FDR.
    :param min_overlap: Passed to enr.gene_set_enrichment as min_overlap.
    :param use_annotated_genes: if True use for reference and query only genes that have at
        least one gene set annotation
    :return: DataFrame with columns 'Gene set', 'Ontology', 'FDR' (formatted as a string) and
        'N in query', ordered by ascending padj. None if no query gene remains or if
        enr.gene_set_enrichment returns nothing.
    """
    def _ratio(part:int,whole:int):
        # 'NA' instead of a ZeroDivisionError when there is nothing to compare against
        return round(part/whole,2) if whole>0 else 'NA'

    query_EID=enr.convert_EID(genes=query_names, name_EID=NAME_EID)
    print('***  '+group+' selected:',len(query_names),'with EID:',len(query_EID))

    reference_gene_eids=ALL_GENE_NAMES_EID.copy()
    query_eids=query_EID.copy()

    if use_annotated_genes:
        # Union of the genes of every gene set, across all entries of GENE_SETS_ONTOLOGY
        gene_sets_genes=set()
        for gene_sets in GENE_SETS_ONTOLOGY.values():
            for gene_set in gene_sets:
                gene_sets_genes.update(gene_set.genes)
        reference_gene_eids=set(reference_gene_eids) & gene_sets_genes
        query_eids=set(query_eids) & gene_sets_genes

        print('Ratio of genes annotated with a gene set in reference',
              _ratio(len(reference_gene_eids),len(ALL_GENE_NAMES_EID)),
             'and query',_ratio(len(query_eids),len(query_EID)))

    # Set view used only for intersections, so they also work when convert_EID returned a
    # non-set collection; query_eids itself is handed to the enrichment call unchanged.
    query_eids_set=set(query_eids)
    query_in_enriched=set()
    result=None
    if len(query_eids) > 0:
        enrichment=enr.gene_set_enrichment(query_eids, reference_EID=reference_gene_eids,
                                                padj_threshold=padj,min_overlap=min_overlap,
                                                gene_sets_ontology=GENE_SETS_ONTOLOGY)
        if len(enrichment)>0:
            enrichment_display=list()
            # Most significant gene sets first
            for enriched in sorted(enrichment, key=lambda data: data.padj):
                query_in_enriched.update(query_eids_set.intersection(enriched.gene_set.genes))
                enrichment_display.append({'Gene set':enriched.gene_set.name,'Ontology':enriched.ontology,
                                           'FDR':"{:.2e}".format(enriched.padj),'N in query':enriched.in_query})
            result=pd.DataFrame(enrichment_display)
    print('Enrichment at FDR: '+str(padj)+' and min query - gene set overlap',str(min_overlap))
    print('N query genes in displayed gene sets:',len(query_in_enriched),'out of', len(query_eids),
          'query genes used for enrichment calculation.')
    display(result)
    print('\n')
    return result

In [32]:
# Values of the 'Comparison' column to analyse, in the order their results are shown
comparison_order=(
    'agg-',
    'dis',
    'tag',
    'cud',
)

In [35]:
# Table of selected genes: one row per gene (index) with its 'Comparison' label,
# 'Difference mean' and 'FDR'
abberant=pd.read_table(
    path_abberant+'comparisonsAvgSimsSingle2STDAny0.2_lessComparisons2_AX4basedNeigh_u-less_newGenes_noAll-removeZeroRep_simsDict_scalemean0std1_logTrue_kN11_splitStrain.tsv',
    sep='\t',index_col=0)
# Keep rows with 'Difference mean' of at least 0.3 and FDR of at most 0.05
abberant=abberant.loc[(abberant['FDR']<=0.05) & (abberant['Difference mean']>=0.3)]
# One enrichment report per comparison, restricted to genes annotated with a gene set
for comparison in comparison_order:
    query_names=list(abberant.loc[abberant['Comparison']==comparison].index)
    enriched=group_diff_enrichment(query_names=query_names,group=comparison,
                                   min_overlap=2,use_annotated_genes=True)



***  agg- selected: 162 with EID: 160
Ratio of genes annotated with a gene set in reference 0.32 and query 0.28
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 30 out of 45 query genes used for enrichment calculation.


Unnamed: 0,Gene set,Ontology,FDR,N in query
0,cell differentiation,"(GO, biological_process)",5.47e-05,11
1,anatomical structure development,"(GO, biological_process)",0.00389,15
2,anatomical structure formation involved in morphogenesis,"(GO, biological_process)",0.00744,6
3,aberrant spore coat morphology,"(Dictybase, Phenotypes)",0.00837,3
4,cell wall,"(GO, cellular_component)",0.00837,3
5,external encapsulating structure,"(GO, cellular_component)",0.00837,3
6,cell wall organization or biogenesis,"(GO, biological_process)",0.0112,3
7,multiple tips in culminant,"(Dictybase, Phenotypes)",0.0213,2
8,aberrant stalk morphogenesis,"(Dictybase, Phenotypes)",0.0518,2
9,decreased spore viability,"(Dictybase, Phenotypes)",0.0584,4




***  dis selected: 550 with EID: 549
Ratio of genes annotated with a gene set in reference 0.32 and query 0.26
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 52 out of 142 query genes used for enrichment calculation.


Unnamed: 0,Gene set,Ontology,FDR,N in query
0,extracellular region,"(GO, cellular_component)",8.29e-07,23
1,cell wall,"(GO, cellular_component)",0.0568,4
2,external encapsulating structure,"(GO, cellular_component)",0.0568,4
3,anatomical structure development,"(GO, biological_process)",0.0742,28
4,Starch and sucrose metabolism,"(KEGG, Pathways)",0.149,5
5,anatomical structure formation involved in morphogenesis,"(GO, biological_process)",0.149,8
6,aberrant spore coat morphology,"(Dictybase, Phenotypes)",0.215,3




***  tag selected: 39 with EID: 39
Ratio of genes annotated with a gene set in reference 0.32 and query 0.21
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 3 out of 8 query genes used for enrichment calculation.


Unnamed: 0,Gene set,Ontology,FDR,N in query
0,extracellular region,"(GO, cellular_component)",0.0572,3




***  cud selected: 517 with EID: 510
Ratio of genes annotated with a gene set in reference 0.32 and query 0.21
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 39 out of 109 query genes used for enrichment calculation.


Unnamed: 0,Gene set,Ontology,FDR,N in query
0,extracellular region,"(GO, cellular_component)",2.67e-05,18
1,anatomical structure formation involved in morphogenesis,"(GO, biological_process)",0.000318,11
2,cell differentiation,"(GO, biological_process)",0.00176,14
3,hydrolase activity,"(GO, molecular_function)",0.14,5
4,secondary metabolic process,"(GO, biological_process)",0.14,2
5,cell adhesion,"(GO, biological_process)",0.14,7
6,decreased spore viability,"(Dictybase, Phenotypes)",0.14,6
7,decreased prespore cell differentiation,"(Dictybase, Phenotypes)",0.14,3




