In [22]:
# Standard library
import os
import sys
from importlib import reload

# Make the parent directory importable so that the project-local modules imported
# below can be found when the notebook is run from its own folder.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.insert(0,module_path)

import pandas as pd
from IPython.display import display
# Display options: wider text columns and up to 500 rows for the displayed tables
pd.set_option('max_colwidth', 70)
pd.set_option('display.max_rows', 500)
import string
import numpy as np

from orangecontrib.bioinformatics.geneset.__init__ import list_all

# Project-local modules; each reload() call re-executes the already imported module
# so that edits made to its source are picked up without restarting the kernel.
import jupyter_functions as jf
reload(jf)
import jupyter_functions as jf
from networks.functionsDENet import loadPickle, savePickle
import enrichment_library as enr
reload(enr)
import enrichment_library as enr
from stages_DE.stages_library import PHENOTYPES

In [23]:
#Path for expression data (mergedGenes_RPKUM.tsv)
# NOTE: both paths are combined with file names by plain string concatenation,
# so the trailing '/' is required.
# NOTE(review): these are absolute paths on one machine - adjust before running elsewhere.
dataPath='/home/karin/Documents/timeTrajectories/data/RPKUM/combined/'
# Directory with the comparison table that is loaded into `abberant`
path_abberant='/home/karin/Documents/timeTrajectories/data/regulons/selected_genes/'

In [24]:
# Lookup tables for converting gene names to Entrez IDs (EID)
genes=pd.read_csv(dataPath+'mergedGenes_RPKUM.tsv',index_col=0,sep='\t')
# Use only genes (rows) that have a non-zero value in at least one column
all_gene_names=genes.loc[(genes != 0).any(axis=1)].index
# Gene name -> EID mapping, then the EIDs of all retained genes (the enrichment reference)
NAME_EID=enr.name_genes_entrez(gene_names=all_gene_names,key_entrez=False)
ALL_GENE_NAMES_EID=enr.convert_EID(genes=all_gene_names,name_EID=NAME_EID)

In [30]:
def group_diff_enrichment(query_names,group:str,padj:float=0.25,min_overlap:int=None,
                          use_annotated_genes:bool=False):
    """
    Run gene set enrichment for one group of query genes, print a short report and
    display the table of enriched gene sets.

    Gene names are converted to Entrez IDs (EID) with the notebook-level NAME_EID
    mapping, ALL_GENE_NAMES_EID is used as the reference and the gene sets are taken
    from the notebook-level GENE_SETS_ONTOLOGY, which must be defined before the call.

    Only gene sets whose overlap with the query is greater or equal to min_overlap are
    displayed. Author's note: p value and padj calculation uses all gene sets that have
    overlap >=1 (behaviour of enr.gene_set_enrichment).

    :param query_names: Names of the query genes.
    :param group: Label of the gene group, used only in the printed report header.
    :param padj: FDR threshold passed to enr.gene_set_enrichment as padj_threshold.
    :param min_overlap: Minimal query - gene set overlap passed to enr.gene_set_enrichment.
    :param use_annotated_genes: if True use for reference and query only genes that have at
        least one gene set annotation
    :return: DataFrame with one row per enriched gene set, ordered by increasing padj,
        or None when no query gene is left for the calculation or nothing is enriched.
    """
    query_EID=enr.convert_EID(genes=query_names, name_EID=NAME_EID)
    print('***  '+group+' selected:',len(query_names),'with EID:',len(query_EID))

    # Work on fresh sets in both branches: this never mutates the notebook-level
    # collections and guarantees that the set intersection below is always valid.
    reference_gene_eids=set(ALL_GENE_NAMES_EID)
    query_eids=set(query_EID)

    if use_annotated_genes:
        # Union of the genes of all gene sets from all ontologies
        gene_sets_genes=set()
        for gene_sets in GENE_SETS_ONTOLOGY.values():
            for gene_set in gene_sets:
                gene_sets_genes.update(gene_set.genes)
        reference_gene_eids&=gene_sets_genes
        query_eids&=gene_sets_genes

        # Ratios are reported as 'NA' when the denominator is empty (avoids ZeroDivisionError)
        reference_annotated_ratio='NA'
        if len(ALL_GENE_NAMES_EID)>0:
            reference_annotated_ratio=round(len(reference_gene_eids)/len(ALL_GENE_NAMES_EID),2)
        query_annotated_ratio='NA'
        if len(query_EID)>0:
            query_annotated_ratio=round(len(query_eids)/len(query_EID),2)
        print('Ratio of genes annotated with a gene set in reference',
              reference_annotated_ratio,
              'and query',query_annotated_ratio)

    query_in_enriched=set()
    result=None
    # Enrichment is calculated only when at least one query gene remains
    if len(query_eids) > 0:
        enrichment=enr.gene_set_enrichment(query_eids, reference_EID=reference_gene_eids,
                                           padj_threshold=padj,min_overlap=min_overlap,
                                           gene_sets_ontology=GENE_SETS_ONTOLOGY)
        if len(enrichment)>0:
            enrichment_display=list()
            # Most significant gene sets first
            enrichment=sorted(enrichment, key=lambda data: data.padj)
            for enriched in enrichment:
                # Collect the query genes that are present in any reported gene set
                query_in_enriched.update(enriched.gene_set.genes & query_eids)
                enrichment_display.append({'Gene set':enriched.gene_set.name,
                                           'Ontology':enriched.ontology[0]+': '+enriched.ontology[1],
                                           'FDR':"{:.2e}".format(enriched.padj),
                                           'N in query':enriched.in_query,
                                           'Set size':len(enriched.gene_set.genes),
                                           'N in ref.':enriched.in_reference})
            result=pd.DataFrame(enrichment_display)
    print('Enrichment at FDR: '+str(padj)+' and min query - gene set overlap',str(min_overlap))
    print('N query genes in displayed gene sets:',len(query_in_enriched),'out of', len(query_eids),
          'query genes used for enrichment calculation.')
    display(result)
    print('\n')
    return result

In [31]:
# Comparison groups to report on, in the order in which the enrichment loops iterate;
# each value is matched against the 'Comparison' column of the `abberant` table.
comparison_order=('agg-','dis','tag','cud')

## Enrichment report

Results are filtered to keep only entries with FDR <= 0.05 and a difference between group means >= 0.3.

In [32]:
# Load the comparison table; the first column becomes the index, which is later used
# as the list of query gene names.
abberant=pd.read_table(
    path_abberant+'comparisonsAvgSimsSingle2STDAny0.2_lessComparisons2_AX4basedNeigh_u-less_newGenes_noAll-removeZeroRep_simsDict_scalemean0std1_logTrue_kN11_splitStrain.tsv',
    sep='\t',index_col=0)
# Keep only rows that pass both thresholds: FDR <= 0.05 and difference of means >= 0.3
abberant=abberant.loc[(abberant['FDR']<=0.05) & (abberant['Difference mean']>=0.3)]

### KEGG, DictyPhenotypes, GO slims

In [33]:
# Get all gene sets
# List every gene set collection available for organism '44689'
# (presumably the taxonomy ID of the studied organism - TODO confirm).
gene_sets=list(list_all(organism='44689'))
# go_slims=True: this section uses GO slims. set_sizes=(5,500) is presumably the
# (min, max) allowed gene set size - consistent with the sizes printed in the next cell.
GENE_SETS_ONTOLOGY=enr.get_gene_sets(gene_set_names=gene_sets, go_slims=True,set_sizes=(5,500))

In [34]:
# Report the smallest/largest gene set size and the number of gene sets per ontology
gene_set_sizes=[len(gene_set.genes)
                for ontology_sets in GENE_SETS_ONTOLOGY.values()
                for gene_set in ontology_sets]
# Defaults reproduce the starting values used when there are no gene sets at all
min_n=min(gene_set_sizes,default=np.inf)
max_n=max(gene_set_sizes,default=0)
print('Min size',min_n,'Max size',max_n)
print('N sets',{ontology:len(ontology_sets)
                for ontology,ontology_sets in GENE_SETS_ONTOLOGY.items()})

Min size 5 Max size 440
N sets {('Dictybase', 'Phenotypes'): 209, ('GO', 'molecular_function'): 35, ('GO', 'biological_process'): 52, ('KEGG', 'Pathways'): 96, ('GO', 'cellular_component'): 24}


In [35]:
# Enrichment report for each comparison group, in the configured order
for comparison in comparison_order:
    # Genes (index values) of the rows that belong to the current comparison
    query_names=list(abberant.query('Comparison == "{}"'.format(comparison)).index)
    enriched=group_diff_enrichment(
        query_names=query_names,
        group=comparison,
        min_overlap=2,
        use_annotated_genes=True)

***  agg- selected: 162 with EID: 160
Ratio of genes annotated with a gene set in reference 0.32 and query 0.28
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 22 out of 45 query genes used for enrichment calculation.


Unnamed: 0,Gene set,Ontology,FDR,N in query,Set size,N in ref.
0,cell differentiation,GO: biological_process,5.05e-05,11,145,145
1,anatomical structure development,GO: biological_process,0.0036,15,440,438
2,anatomical structure formation involved in morphogenesis,GO: biological_process,0.00683,6,74,74
3,aberrant spore coat morphology,Dictybase: Phenotypes,0.00766,3,12,12
4,cell wall,GO: cellular_component,0.00766,3,13,13
5,external encapsulating structure,GO: cellular_component,0.00766,3,13,13
6,cell wall organization or biogenesis,GO: biological_process,0.0103,3,15,15
7,multiple tips in culminant,Dictybase: Phenotypes,0.0194,2,5,5
8,aberrant stalk morphogenesis,Dictybase: Phenotypes,0.0473,2,8,8
9,decreased spore viability,Dictybase: Phenotypes,0.0535,4,60,59




***  dis selected: 550 with EID: 549
Ratio of genes annotated with a gene set in reference 0.32 and query 0.26
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 52 out of 142 query genes used for enrichment calculation.


Unnamed: 0,Gene set,Ontology,FDR,N in query,Set size,N in ref.
0,extracellular region,GO: cellular_component,7.51e-07,23,157,157
1,cell wall,GO: cellular_component,0.0504,4,13,13
2,external encapsulating structure,GO: cellular_component,0.0504,4,13,13
3,anatomical structure development,GO: biological_process,0.0668,28,440,438
4,anatomical structure formation involved in morphogenesis,GO: biological_process,0.133,8,74,74
5,Starch and sucrose metabolism,KEGG: Pathways,0.133,5,31,31
6,aberrant spore coat morphology,Dictybase: Phenotypes,0.191,3,12,12
7,abolished sporulation,Dictybase: Phenotypes,0.239,4,25,25




***  tag selected: 39 with EID: 39
Ratio of genes annotated with a gene set in reference 0.32 and query 0.21
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 3 out of 8 query genes used for enrichment calculation.


Unnamed: 0,Gene set,Ontology,FDR,N in query,Set size,N in ref.
0,extracellular region,GO: cellular_component,0.0544,3,157,157




***  cud selected: 517 with EID: 510
Ratio of genes annotated with a gene set in reference 0.32 and query 0.21
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 39 out of 109 query genes used for enrichment calculation.


Unnamed: 0,Gene set,Ontology,FDR,N in query,Set size,N in ref.
0,extracellular region,GO: cellular_component,2.5e-05,18,157,157
1,anatomical structure formation involved in morphogenesis,GO: biological_process,0.000296,11,74,74
2,cell differentiation,GO: biological_process,0.00164,14,145,145
3,decreased spore viability,Dictybase: Phenotypes,0.13,6,60,59
4,decreased prespore cell differentiation,Dictybase: Phenotypes,0.13,3,14,14
5,hydrolase activity,GO: molecular_function,0.13,5,46,45
6,secondary metabolic process,GO: biological_process,0.13,2,5,5
7,cell adhesion,GO: biological_process,0.13,7,84,83






### KEGG, DictyPhenotypes, GO 

In [36]:
# Get all gene sets
# Same call as in the GO slims section, but with go_slims=False, which replaces the
# notebook-level GENE_SETS_ONTOLOGY used by group_diff_enrichment.
gene_sets=list(list_all(organism='44689'))
GENE_SETS_ONTOLOGY=enr.get_gene_sets(gene_set_names=gene_sets, go_slims=False,set_sizes=(5,500))

In [37]:
# Report the smallest/largest gene set size and the number of gene sets per ontology
n_sets_per_ontology={}
gene_set_sizes=[]
for ontology,ontology_sets in GENE_SETS_ONTOLOGY.items():
    n_sets_per_ontology[ontology]=len(ontology_sets)
    gene_set_sizes.extend(len(gene_set.genes) for gene_set in ontology_sets)
# Defaults reproduce the starting values used when there are no gene sets at all
min_n=min(gene_set_sizes,default=np.inf)
max_n=max(gene_set_sizes,default=0)
print('Min size',min_n,'Max size',max_n)
print('N sets',n_sets_per_ontology)

Min size 5 Max size 481
N sets {('Dictybase', 'Phenotypes'): 209, ('GO', 'molecular_function'): 424, ('GO', 'biological_process'): 1379, ('KEGG', 'Pathways'): 96, ('GO', 'cellular_component'): 305}


In [38]:
# Gene sets loaded with go_slims=False (i.e. not limited to GO slims); they are loaded
# again here so that this cell does not depend on which gene sets were loaded last.
gene_sets=list(list_all(organism='44689'))
GENE_SETS_ONTOLOGY=enr.get_gene_sets(gene_set_names=gene_sets,set_sizes=(5,500),go_slims=False)
# Enrichment report for each comparison group, in the configured order
for comparison in comparison_order:
    # Genes (index values) of the rows that belong to the current comparison
    query_names=list(abberant.query('Comparison == "{}"'.format(comparison)).index)
    enriched=group_diff_enrichment(
        query_names=query_names,
        group=comparison,
        min_overlap=2,
        use_annotated_genes=True)

***  agg- selected: 162 with EID: 160
Ratio of genes annotated with a gene set in reference 0.33 and query 0.31
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 40 out of 50 query genes used for enrichment calculation.


Unnamed: 0,Gene set,Ontology,FDR,N in query,Set size,N in ref.
0,cellular developmental process,GO: biological_process,0.000261,11,145,145
1,cell differentiation,GO: biological_process,0.000261,11,145,145
2,cell wall biogenesis,GO: biological_process,0.00519,3,8,8
3,external encapsulating structure organization,GO: biological_process,0.00519,3,8,8
4,cell wall organization,GO: biological_process,0.00519,3,8,8
5,spore wall assembly,GO: biological_process,0.00519,3,8,8
6,cell wall assembly,GO: biological_process,0.00519,3,8,8
7,cellular component morphogenesis,GO: biological_process,0.00519,3,8,8
8,developmental process,GO: biological_process,0.00519,16,449,447
9,cellular component assembly involved in morphogenesis,GO: biological_process,0.00519,3,8,8




***  dis selected: 550 with EID: 549
Ratio of genes annotated with a gene set in reference 0.33 and query 0.28
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 38 out of 151 query genes used for enrichment calculation.


Unnamed: 0,Gene set,Ontology,FDR,N in query,Set size,N in ref.
0,extracellular region,GO: cellular_component,5.67e-06,23,157,157
1,cell surface receptor signaling pathway,GO: biological_process,0.0273,8,39,39
2,multicellular organism development,GO: biological_process,0.192,9,72,72
3,cell wall,GO: cellular_component,0.192,4,13,13
4,external encapsulating structure,GO: cellular_component,0.192,4,13,13
5,multicellular organismal process,GO: biological_process,0.236,9,76,76




***  tag selected: 39 with EID: 39
Ratio of genes annotated with a gene set in reference 0.33 and query 0.21
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 3 out of 8 query genes used for enrichment calculation.


Unnamed: 0,Gene set,Ontology,FDR,N in query,Set size,N in ref.
0,polysaccharide binding,GO: molecular_function,0.0691,2,24,24
1,extracellular region,GO: cellular_component,0.103,3,157,157
2,carbohydrate binding,GO: molecular_function,0.133,2,58,58




***  cud selected: 517 with EID: 510
Ratio of genes annotated with a gene set in reference 0.33 and query 0.23
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 39 out of 116 query genes used for enrichment calculation.


Unnamed: 0,Gene set,Ontology,FDR,N in query,Set size,N in ref.
0,carbohydrate binding,GO: molecular_function,1.63e-08,15,58,58
1,polysaccharide binding,GO: molecular_function,5.22e-05,8,24,24
2,extracellular region,GO: cellular_component,5.54e-05,18,157,157
3,cellulose binding,GO: molecular_function,0.000462,6,16,16
4,sporulation,GO: biological_process,0.000563,11,75,75
5,sporulation resulting in formation of a cellular spore,GO: biological_process,0.000563,11,74,74
6,anatomical structure formation involved in morphogenesis,GO: biological_process,0.000563,11,74,74
7,spore germination,GO: biological_process,0.000907,5,12,12
8,cellular developmental process,GO: biological_process,0.00296,14,145,145
9,cell differentiation,GO: biological_process,0.00296,14,145,145




