# Enrichment of regulons from Regulons_by_strain

In [28]:
import os
import sys
from importlib import reload

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.insert(0,module_path)

import pandas as pd
from IPython.display import display,HTML

from orangecontrib.bioinformatics.geneset.__init__ import list_all

import jupyter_functions as jf
reload(jf)
import jupyter_functions as jf
from functionsDENet import loadPickle, savePickle
import stages_DE.enrichment_library as enr
reload(enr)
import stages_DE.enrichment_library as enr

In [2]:
# Input/output locations used by the cells below.
# NOTE(review): these are absolute, machine-specific paths; adjust them before re-running elsewhere.
# Directory from which the merged expression table (mergedGenes_RPKUM.tsv) is read.
dataPath='/home/karin/Documents/timeTrajectories/data/RPKUM/combined/'
# Directory holding the regulon cluster tables; enrichment results are written to its 'enrichment/' subfolder.
pathRegulons='/home/karin/Documents/timeTrajectories/data/regulons/by_strain/kN300_mean0std1_log/'

## Used gene sets
Only gene sets containing at least 5 and at most 500 genes were used. All gene set collections (ontologies) available in Orange for the organism (taxonomy ID 44689) were included.

In [3]:
# List every gene set collection available for organism 44689, then load the
# collections with the size bounds described above (5 to 500 genes per set).
# NOTE(review): the exact effect of go_slims=True is defined in enrichment_library - confirm there.
gene_sets=[*list_all(organism='44689')]
GENE_SETS_ONTOLOGY=enr.get_gene_sets(
    gene_set_names=gene_sets,
    go_slims=True,
    set_sizes=(5,500),
)

In [4]:
# Prepare the lookup used to convert gene names to Entrez IDs (EID).
genes = pd.read_csv(dataPath + 'mergedGenes_RPKUM.tsv', sep='\t', index_col=0)
# Keep only the rows (genes) that have a non-zero value in at least one column.
all_gene_names= genes.loc[genes.ne(0).any(axis=1)].index
NAME_EID=enr.name_genes_entrez(gene_names=all_gene_names, key_entrez=False)
# EIDs of all retained genes; used as the enrichment reference set.
ALL_GENE_NAMES_EID=enr.convert_EID(genes=all_gene_names, name_EID=NAME_EID)

In [37]:
# Use the full option name rather than relying on option-name pattern matching.
pd.set_option('display.max_colwidth', 55)


def _safe_ratio(part:int, whole:int) -> float:
    """Return part/whole rounded to 2 decimals, or NaN when whole is 0."""
    return round(part / whole, 2) if whole else float('nan')


def group_diff_enrichment(data:pd.DataFrame,group:str,padj:float=0.25,min_overlap:int=None,
                          use_annotated_genes:bool=False):
    """
    Compute, display and return gene set enrichment for the genes of one cluster.

    Genes whose 'Cluster' value equals ``group`` are converted to EIDs and tested against
    GENE_SETS_ONTOLOGY with enr.gene_set_enrichment, using ALL_GENE_NAMES_EID as reference.
    Per the original author's note: only gene sets whose overlap with the query is greater
    than or equal to min_overlap are displayed, while p value and padj calculation inside
    gene_set_enrichment uses all gene sets that have overlap >=1.

    :param data: Table indexed by gene name with a 'Cluster' column.
    :param group: Cluster label to analyse (compared against the 'Cluster' column).
    :param padj: FDR threshold passed to gene_set_enrichment as padj_threshold.
    :param min_overlap: Minimal query - gene set overlap passed to gene_set_enrichment
        (may be None).
    :param use_annotated_genes: if True use for reference and query only genes that have at
        least one gene set annotation (i.e. are present in at least one gene set of
        GENE_SETS_ONTOLOGY).
    :return: DataFrame with columns 'Gene set', 'Ontology', 'FDR' (formatted as a string)
        and 'N in query', sorted by increasing padj; None if there is no query gene or no
        gene set was returned by gene_set_enrichment.
    """
    selected=list(data[data['Cluster']==group].index)
    query_EID=enr.convert_EID(genes=selected, name_EID=NAME_EID)
    print('***  '+group+' selected:',len(selected),'with EID:',len(query_EID))

    # Work on copies so that the shared reference and the converted query stay untouched.
    reference_gene_eids=ALL_GENE_NAMES_EID.copy()
    query_eids=query_EID.copy()

    if use_annotated_genes:
        # Union of the genes of every gene set used for enrichment.
        gene_sets_genes=set()
        for _collection_name, collection in GENE_SETS_ONTOLOGY.items():
            for gene_set in collection:
                gene_sets_genes.update(gene_set.genes)
        reference_gene_eids=set(reference_gene_eids) & gene_sets_genes
        query_eids=set(query_eids) & gene_sets_genes
        # _safe_ratio avoids ZeroDivisionError when the cluster has no gene with an EID
        # (e.g. a cluster label that is absent from data).
        print('Ratio of genes annotated with a gene set in reference',
              _safe_ratio(len(reference_gene_eids), len(ALL_GENE_NAMES_EID)),
             'and query',_safe_ratio(len(query_eids), len(query_EID)))

    result=None
    query_in_enriched=set()
    if len(query_eids) > 0:
        enrichment=enr.gene_set_enrichment(query_eids, reference_EID=reference_gene_eids,
                                           padj_threshold=padj,min_overlap=min_overlap,
                                           gene_sets_ontology=GENE_SETS_ONTOLOGY)

        if len(enrichment)>0:
            # Explicit set so the intersection below works whatever container holds the query.
            query_set=set(query_eids)
            enrichment_display=list()
            # Most significant gene sets first.
            for enriched in sorted(enrichment, key=lambda item: item.padj):
                query_in_enriched.update(enriched.gene_set.genes & query_set)
                enrichment_display.append({'Gene set':enriched.gene_set.name,
                                           'Ontology':enriched.ontology[0]+' '+enriched.ontology[1],
                                           'FDR':"{:.2e}".format(enriched.padj),'N in query':enriched.in_query})
            result=pd.DataFrame(enrichment_display)
    print('Enrichment at FDR: '+str(padj)+' and min query - gene set overlap',str(min_overlap))
    print('N query genes in displayed gene sets:',len(query_in_enriched),'out of', len(query_eids),
          'query genes used for enrichment calculation.')
    if result is not None:
        # Render without the index column.
        display(HTML(result.to_html(index=False)))
    else:
        display(result)
    print('\n')
    return result

### Enrichment for mergedGenes_min18_clustersfilledDiagonal11.tab and expression_clustersfilledDiagonal11.png

In [38]:
# Run enrichment for every cluster of the table and save each non-empty result as TSV.
regulons=pd.read_table(pathRegulons+'mergedGenes_min18_clusters_larger.tab',index_col=0)
# DataFrame.to_csv does not create missing folders, so make sure the output folder exists.
os.makedirs(pathRegulons+'enrichment/', exist_ok=True)
# NOTE(review): assumes clusters are labelled C1..Cn, n being the number of distinct labels.
n_clusters=len(regulons['Cluster'].unique())
for group in range(1,n_clusters+1):
    enriched=group_diff_enrichment(data=regulons,group='C'+str(group),min_overlap=2)
    if enriched is not None:
        enriched.to_csv(pathRegulons+'enrichment/filledDiagonal_enrichment_cluster'+str(group)+'.tsv', sep='\t',index=False)

***  C1 selected: 36 with EID: 36
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 18 out of 36 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cell cycle,GO biological_process,2.31e-14,13
chromosome segregation,GO biological_process,1.37e-10,6
mitotic nuclear division,GO biological_process,5.9e-09,5
chromosome organization,GO biological_process,1.74e-08,7
mitotic cell cycle,GO biological_process,9.72e-08,7
DNA replication,KEGG Pathways,1.92e-07,5
cell division,GO biological_process,1.92e-07,7
chromosome,GO cellular_component,2.17e-07,6
cytoskeleton,GO cellular_component,3.71e-06,7
cytoskeleton organization,GO biological_process,0.000901,5




***  C2 selected: 42 with EID: 42
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 18 out of 42 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
anatomical structure development,GO biological_process,0.000107,10
Glycosaminoglycan degradation,KEGG Pathways,0.00631,2
response to stress,GO biological_process,0.0272,6
cell adhesion,GO biological_process,0.0365,3
precocious development,Dictybase Phenotypes,0.0406,2
phosphatase activity,GO molecular_function,0.0724,2
development arrests at mound stage,Dictybase Phenotypes,0.0724,2
abolished slug migration,Dictybase Phenotypes,0.0724,2
aberrant aggregation,Dictybase Phenotypes,0.076,2
kinase activity,GO molecular_function,0.0891,4




***  C3 selected: 30 with EID: 30
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 20 out of 30 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Ribosome biogenesis in eukaryotes,KEGG Pathways,3.61e-19,12
nucleolus,GO cellular_component,4.81e-17,11
ribosome biogenesis,GO biological_process,5.97e-16,10
ribonucleoprotein complex assembly,GO biological_process,5.35e-07,4
RNA binding,GO molecular_function,8.44e-06,6
protein-containing complex assembly,GO biological_process,0.001,4
rRNA binding,GO molecular_function,0.001,2
cellular component assembly,GO biological_process,0.003,4
helicase activity,GO molecular_function,0.012,2
methyltransferase activity,GO molecular_function,0.013,2




***  C4 selected: 17 with EID: 17
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 5 out of 17 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
external encapsulating structure,GO cellular_component,0.00228,2
cell wall,GO cellular_component,0.00228,2
anatomical structure formation involved in morphogenesis,GO biological_process,0.0453,2
extracellular region,GO cellular_component,0.052,2
anatomical structure development,GO biological_process,0.052,3
cell differentiation,GO biological_process,0.052,2
plasma membrane,GO cellular_component,0.102,2
small molecule metabolic process,GO biological_process,0.136,2




***  C5 selected: 5 with EID: 5
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 3 out of 5 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
response to stress,GO biological_process,0.000645,3
anatomical structure development,GO biological_process,0.0173,2




***  C6 selected: 4 with EID: 4
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 0 out of 4 query genes used for enrichment calculation.


None



***  C7 selected: 15 with EID: 14
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 7 out of 14 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
aberrant cellular response to stress,Dictybase Phenotypes,0.00085,2
increased cell size,Dictybase Phenotypes,0.000924,2
Citrate cycle (TCA cycle),KEGG Pathways,0.00221,2
homeostatic process,GO biological_process,0.00848,2
wild type,Dictybase Phenotypes,0.0154,3
Biosynthesis of antibiotics,KEGG Pathways,0.036,2
oxidoreductase activity,GO molecular_function,0.0426,2
Biosynthesis of secondary metabolites,KEGG Pathways,0.0489,2
response to stress,GO biological_process,0.0838,2
Metabolic pathways,KEGG Pathways,0.168,2




***  C8 selected: 18 with EID: 18
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 16 out of 18 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
ribosome,GO cellular_component,7.680000000000001e-29,15
Ribosome,KEGG Pathways,7.680000000000001e-29,15
structural constituent of ribosome,GO molecular_function,1.84e-24,13
structural molecule activity,GO molecular_function,1.14e-22,13
translation,GO biological_process,1.79e-18,12
rRNA binding,GO molecular_function,2.71e-06,3
RNA binding,GO molecular_function,1.03e-05,5
cytosol,GO cellular_component,2.16e-05,5
ribosome biogenesis,GO biological_process,0.000218,3
delayed development,Dictybase Phenotypes,0.00952,2




***  C9 selected: 29 with EID: 29
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 10 out of 29 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
translucent sorus,Dictybase Phenotypes,0.00912,2
DNA-binding transcription factor activity,GO molecular_function,0.0381,2
kinase activity,GO molecular_function,0.0381,4
aberrant aggregation,Dictybase Phenotypes,0.0381,2
decreased chemotaxis to cAMP,Dictybase Phenotypes,0.0381,2
DNA binding,GO molecular_function,0.0387,3
anatomical structure formation involved in morphogenesis,GO biological_process,0.0387,2
anatomical structure development,GO biological_process,0.0448,4
aberrant cytokinesis,Dictybase Phenotypes,0.0481,2
decreased growth rate,Dictybase Phenotypes,0.0745,2




***  C10 selected: 47 with EID: 47
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 43 out of 47 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Proteasome,KEGG Pathways,2.15e-66,29
peptidase activity,GO molecular_function,4.5400000000000005e-32,21
catabolic process,GO biological_process,1.1900000000000001e-29,25
Protein processing in endoplasmic reticulum,KEGG Pathways,0.0544,3
enzyme regulator activity,GO molecular_function,0.057,3
response to stress,GO biological_process,0.068,5
endoplasmic reticulum,GO cellular_component,0.0777,3
ATPase activity,GO molecular_function,0.132,3
cytoplasmic vesicle,GO cellular_component,0.182,3
mitochondrion,GO cellular_component,0.207,3




***  C11 selected: 6 with EID: 6
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 0 out of 6 query genes used for enrichment calculation.


None



***  C12 selected: 51 with EID: 51
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 45 out of 51 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Ribosome,KEGG Pathways,3.0099999999999998e-92,45
ribosome,GO cellular_component,1.89e-90,44
structural constituent of ribosome,GO molecular_function,1.45e-80,40
structural molecule activity,GO molecular_function,1.23e-73,40
translation,GO biological_process,1.31e-66,40
RNA binding,GO molecular_function,1.81e-10,11
rRNA binding,GO molecular_function,4.37e-07,4




***  C13 selected: 14 with EID: 14
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 8 out of 14 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cytoskeletal protein binding,GO molecular_function,7.6e-11,8
cytoskeleton organization,GO biological_process,2.19e-10,8
cellular component assembly,GO biological_process,2.37e-10,8
protein-containing complex assembly,GO biological_process,9.13e-10,7
cytoskeleton,GO cellular_component,4.49e-09,7
Endocytosis,KEGG Pathways,1.09e-07,5
cytosol,GO cellular_component,0.000375,4
cytoplasmic vesicle,GO cellular_component,0.000375,4
cell morphogenesis,GO biological_process,0.00156,2
aberrant cell morphology,Dictybase Phenotypes,0.00449,2




***  C14 selected: 51 with EID: 51
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 28 out of 51 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Oxidative phosphorylation,KEGG Pathways,2.07e-09,8
mitochondrion,GO cellular_component,5.12e-06,9
Metabolic pathways,KEGG Pathways,5.12e-06,14
protein folding,GO biological_process,0.000207,4
transmembrane transporter activity,GO molecular_function,0.000359,6
Phagosome,KEGG Pathways,0.000482,4
transmembrane transport,GO biological_process,0.000482,6
unfolded protein binding,GO molecular_function,0.00109,3
generation of precursor metabolites and energy,GO biological_process,0.00173,4
Ribosome,KEGG Pathways,0.0358,3






### Enrichment for expression_minExpressed0.990.1Strains1Min1Max18_clustersLouvain0.4minmaxNologPCA30kN30.pdf

In [48]:
# Run enrichment for every cluster of the table and save each non-empty result as TSV.
regulons=pd.read_table(pathRegulons+'mergedGenes_minExpressed0.990.1Strains1Min1Max18_clustersLouvain0.4minmaxNologPCA30kN30.tab',index_col=0)
# DataFrame.to_csv does not create missing folders, so make sure the output folder exists.
os.makedirs(pathRegulons+'enrichment/', exist_ok=True)
# NOTE(review): assumes clusters are labelled C1..Cn, n being the number of distinct labels.
n_clusters=len(regulons['Cluster'].unique())
for group in range(1,n_clusters+1):
    enriched=group_diff_enrichment(data=regulons,group='C'+str(group),min_overlap=2)
    if enriched is not None:
        enriched.to_csv(pathRegulons+'enrichment/minExpressed0.990.1Strains1Min1Max18_clustersLouvain0.4minmaxNologPCA30kN30_enrichment_cluster'+str(group)+'.tsv', sep='\t',index=False)

***  C1 selected: 91 with EID: 91
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 34 out of 91 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
external encapsulating structure,GO cellular_component,8.23e-11,7
cell wall,GO cellular_component,8.23e-11,7
aberrant spore coat morphology,Dictybase Phenotypes,4.29e-09,6
anatomical structure formation involved in morphogenesis,GO biological_process,8.53e-08,9
cell wall organization or biogenesis,GO biological_process,1.17e-06,5
cell differentiation,GO biological_process,1.85e-06,10
anatomical structure development,GO biological_process,0.000258,13
abolished cellulose binding,Dictybase Phenotypes,0.000713,2
Starch and sucrose metabolism,KEGG Pathways,0.000881,4
extracellular region,GO cellular_component,0.0106,6




***  C2 selected: 77 with EID: 77
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 13 out of 77 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cell differentiation,GO biological_process,0.00282,7
reproduction,GO biological_process,0.0679,4
Starch and sucrose metabolism,KEGG Pathways,0.101,2
anatomical structure development,GO biological_process,0.101,7
cell adhesion,GO biological_process,0.101,3
decreased sporulation,Dictybase Phenotypes,0.144,2
development arrests at mound stage,Dictybase Phenotypes,0.144,2
abolished culmination,Dictybase Phenotypes,0.166,2
anatomical structure formation involved in morphogenesis,GO biological_process,0.2,2
delayed aggregation,Dictybase Phenotypes,0.2,2




***  C3 selected: 74 with EID: 74
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 24 out of 74 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
anatomical structure development,GO biological_process,4.02e-06,15
cell adhesion,GO biological_process,0.00799,5
development arrests at loose mound stage,Dictybase Phenotypes,0.00856,3
extracellular region,GO cellular_component,0.0106,6
Glycosaminoglycan degradation,KEGG Pathways,0.0123,2
response to stress,GO biological_process,0.0136,8
signal transduction,GO biological_process,0.0136,8
decreased fruiting body size,Dictybase Phenotypes,0.0136,4
increased slug migration,Dictybase Phenotypes,0.0136,2
decreased aggregate size,Dictybase Phenotypes,0.0136,3




***  C4 selected: 74 with EID: 74
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 8 out of 74 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,0.00016,8




***  C5 selected: 68 with EID: 68
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 65 out of 68 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Ribosome,KEGG Pathways,6.409999999999999e-126,60
ribosome,GO cellular_component,1.4700000000000001e-124,59
structural constituent of ribosome,GO molecular_function,8.17e-109,53
structural molecule activity,GO molecular_function,1.72e-98,53
translation,GO biological_process,5.92e-86,52
RNA binding,GO molecular_function,2.97e-15,16
rRNA binding,GO molecular_function,8.86e-13,7
cytosol,GO cellular_component,5.69e-05,8
ribosome biogenesis,GO biological_process,0.0168,3
transmembrane transporter activity,GO molecular_function,0.162,3




***  C6 selected: 66 with EID: 65
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 34 out of 65 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Oxidative phosphorylation,KEGG Pathways,6.98e-10,9
Metabolic pathways,KEGG Pathways,2.55e-08,19
mitochondrion,GO cellular_component,8.07e-05,9
Citrate cycle (TCA cycle),KEGG Pathways,0.00039,4
Phagosome,KEGG Pathways,0.00328,4
Biosynthesis of antibiotics,KEGG Pathways,0.00492,6
cytoplasmic vesicle,GO cellular_component,0.00895,6
generation of precursor metabolites and energy,GO biological_process,0.00924,4
protein folding,GO biological_process,0.0103,3
aberrant cellular response to stress,Dictybase Phenotypes,0.0131,2




***  C7 selected: 64 with EID: 64
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 4 out of 64 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
aberrant cell-cell adhesion,Dictybase Phenotypes,0.00446,2
aberrant fruiting body morphology,Dictybase Phenotypes,0.195,2




***  C8 selected: 61 with EID: 59
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 0 out of 59 query genes used for enrichment calculation.


None



***  C9 selected: 58 with EID: 58
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 9 out of 58 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,0.0813,4
cell adhesion,GO biological_process,0.0813,3
hydrolase activity,GO molecular_function,0.113,2
cell differentiation,GO biological_process,0.159,3
anatomical structure formation involved in morphogenesis,GO biological_process,0.176,2




***  C10 selected: 57 with EID: 57
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 9 out of 57 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,0.000359,7
cell-cell signaling,GO biological_process,0.0372,2
decreased spore viability,Dictybase Phenotypes,0.187,2
peptidase activity,GO molecular_function,0.207,2




***  C11 selected: 55 with EID: 55
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 21 out of 55 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cytoskeletal protein binding,GO molecular_function,5.97e-08,10
Endocytosis,KEGG Pathways,0.000398,5
cytoskeleton organization,GO biological_process,0.00293,6
cellular component assembly,GO biological_process,0.00303,6
protein-containing complex assembly,GO biological_process,0.0034,5
response to stress,GO biological_process,0.00574,7
cytoskeleton,GO cellular_component,0.00778,5
cytosol,GO cellular_component,0.0533,4
cytoplasmic vesicle,GO cellular_component,0.0533,4
plasma membrane,GO cellular_component,0.106,4




***  C12 selected: 54 with EID: 54
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 4 out of 54 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Phagosome,KEGG Pathways,0.25,2
decreased spore viability,Dictybase Phenotypes,0.25,2




***  C13 selected: 46 with EID: 45
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 10 out of 45 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,0.000497,6
anatomical structure formation involved in morphogenesis,GO biological_process,0.0282,3
cell differentiation,GO biological_process,0.121,3
oxidoreductase activity,GO molecular_function,0.152,3




***  C14 selected: 45 with EID: 45
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 0 out of 45 query genes used for enrichment calculation.


None



***  C15 selected: 41 with EID: 41
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 38 out of 41 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Proteasome,KEGG Pathways,2.8e-69,29
peptidase activity,GO molecular_function,7.5499999999999995e-34,21
catabolic process,GO biological_process,6.870000000000001e-32,25
enzyme regulator activity,GO molecular_function,0.0373,3
response to stress,GO biological_process,0.0373,5
ATPase activity,GO molecular_function,0.0827,3
Protein processing in endoplasmic reticulum,KEGG Pathways,0.135,2




***  C16 selected: 40 with EID: 40
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 13 out of 40 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
aberrant CRAC localization to the plasma membrane,Dictybase Phenotypes,0.0148,2
translucent sorus,Dictybase Phenotypes,0.0161,2
development arrests at tipped mound stage,Dictybase Phenotypes,0.0161,2
decreased intracellular cAMP level,Dictybase Phenotypes,0.0161,2
cell death,GO biological_process,0.0314,2
anatomical structure development,GO biological_process,0.0314,6
anatomical structure formation involved in morphogenesis,GO biological_process,0.0314,3
kinase activity,GO molecular_function,0.0344,5
RNA degradation,KEGG Pathways,0.0488,2
cell differentiation,GO biological_process,0.0488,3




***  C17 selected: 36 with EID: 36
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 18 out of 36 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cell cycle,GO biological_process,2.31e-14,13
chromosome segregation,GO biological_process,1.37e-10,6
mitotic nuclear division,GO biological_process,5.9e-09,5
chromosome organization,GO biological_process,1.74e-08,7
mitotic cell cycle,GO biological_process,9.72e-08,7
DNA replication,KEGG Pathways,1.92e-07,5
cell division,GO biological_process,1.92e-07,7
chromosome,GO cellular_component,2.17e-07,6
cytoskeleton,GO cellular_component,3.71e-06,7
cytoskeleton organization,GO biological_process,0.000901,5




***  C18 selected: 30 with EID: 30
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 20 out of 30 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Ribosome biogenesis in eukaryotes,KEGG Pathways,3.61e-19,12
nucleolus,GO cellular_component,4.81e-17,11
ribosome biogenesis,GO biological_process,5.97e-16,10
ribonucleoprotein complex assembly,GO biological_process,5.35e-07,4
RNA binding,GO molecular_function,8.44e-06,6
protein-containing complex assembly,GO biological_process,0.001,4
rRNA binding,GO molecular_function,0.001,2
cellular component assembly,GO biological_process,0.003,4
helicase activity,GO molecular_function,0.012,2
methyltransferase activity,GO molecular_function,0.013,2




***  C19 selected: 23 with EID: 23
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 3 out of 23 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
kinase activity,GO molecular_function,0.11,3




***  C20 selected: 20 with EID: 20
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 0 out of 20 query genes used for enrichment calculation.


None



***  C21 selected: 19 with EID: 19
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 0 out of 19 query genes used for enrichment calculation.


None





#### Using only genes annotated with at least one gene set 
A gene counts as annotated if it belongs to at least one of the gene sets used for enrichment (see above). Only annotated genes are used for the enrichment calculation, in both the reference and the gene group of interest (query).

In [40]:
# Same as the previous analysis, but restricted to genes annotated with at least one gene set.
regulons=pd.read_table(pathRegulons+'mergedGenes_minExpressed0.990.1Strains1Min1Max18_clustersLouvain0.4minmaxNologPCA30kN30.tab',index_col=0)
# DataFrame.to_csv does not create missing folders, so make sure the output folder exists.
os.makedirs(pathRegulons+'enrichment/', exist_ok=True)
# NOTE(review): assumes clusters are labelled C1..Cn, n being the number of distinct labels.
n_clusters=len(regulons['Cluster'].unique())
for group in range(1,n_clusters+1):
    enriched=group_diff_enrichment(data=regulons,group='C'+str(group),min_overlap=2,use_annotated_genes=True)
    if enriched is not None:
        enriched.to_csv(pathRegulons+'enrichment/minExpressed0.990.1Strains1Min1Max18_clustersLouvain0.4minmaxNologPCA30kN30_annotatedOnly_enrichment_cluster'+str(group)+'.tsv', sep='\t',index=False)

***  C1 selected: 91 with EID: 91
Ratio of genes annotated with a gene set in reference 0.32 and query 0.45
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 34 out of 41 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
external encapsulating structure,GO cellular_component,6.81e-10,7
cell wall,GO cellular_component,6.81e-10,7
aberrant spore coat morphology,Dictybase Phenotypes,2.74e-08,6
anatomical structure formation involved in morphogenesis,GO biological_process,1.03e-06,9
cell wall organization or biogenesis,GO biological_process,5.66e-06,5
cell differentiation,GO biological_process,2.53e-05,10
abolished cellulose binding,Dictybase Phenotypes,0.00161,2
Starch and sucrose metabolism,KEGG Pathways,0.00346,4
anatomical structure development,GO biological_process,0.0035,13
abolished stalk cell differentiation,Dictybase Phenotypes,0.0228,2




***  C2 selected: 77 with EID: 77
Ratio of genes annotated with a gene set in reference 0.32 and query 0.29
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 15 out of 22 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cell differentiation,GO biological_process,0.000748,7
reproduction,GO biological_process,0.0608,4
anatomical structure development,GO biological_process,0.0724,7
cell adhesion,GO biological_process,0.0883,3
Starch and sucrose metabolism,KEGG Pathways,0.0908,2
decreased sporulation,Dictybase Phenotypes,0.12,2
development arrests at mound stage,Dictybase Phenotypes,0.12,2
abolished culmination,Dictybase Phenotypes,0.138,2
delayed aggregation,Dictybase Phenotypes,0.169,2
anatomical structure formation involved in morphogenesis,GO biological_process,0.172,2




***  C3 selected: 74 with EID: 74
Ratio of genes annotated with a gene set in reference 0.32 and query 0.42
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 24 out of 31 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
anatomical structure development,GO biological_process,2.37e-05,15
cell adhesion,GO biological_process,0.018,5
development arrests at loose mound stage,Dictybase Phenotypes,0.018,3
Glycosaminoglycan degradation,KEGG Pathways,0.0259,2
extracellular region,GO cellular_component,0.0286,6
increased slug migration,Dictybase Phenotypes,0.0315,2
decreased aggregate size,Dictybase Phenotypes,0.0315,3
decreased fruiting body size,Dictybase Phenotypes,0.038,4
response to stress,GO biological_process,0.0549,8
signal transduction,GO biological_process,0.0549,8




***  C4 selected: 74 with EID: 74
Ratio of genes annotated with a gene set in reference 0.32 and query 0.23
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 15 out of 17 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,3.67e-06,8
wild type,Dictybase Phenotypes,0.142,5
decreased gene expression,Dictybase Phenotypes,0.208,2
decreased spore viability,Dictybase Phenotypes,0.208,2
DNA binding,GO molecular_function,0.215,3




***  C5 selected: 68 with EID: 68
Ratio of genes annotated with a gene set in reference 0.32 and query 0.97
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 62 out of 66 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Ribosome,KEGG Pathways,6.55e-98,60
ribosome,GO cellular_component,6.07e-97,59
structural constituent of ribosome,GO molecular_function,9.42e-84,53
structural molecule activity,GO molecular_function,1.8600000000000003e-73,53
translation,GO biological_process,2.02e-61,52
rRNA binding,GO molecular_function,2.37e-09,7
RNA binding,GO molecular_function,3.8e-08,16
cytosol,GO cellular_component,0.0748,8




***  C6 selected: 66 with EID: 65
Ratio of genes annotated with a gene set in reference 0.32 and query 0.58
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 34 out of 38 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Oxidative phosphorylation,KEGG Pathways,9.41e-08,9
Metabolic pathways,KEGG Pathways,8.64e-05,19
Citrate cycle (TCA cycle),KEGG Pathways,0.00509,4
mitochondrion,GO cellular_component,0.00511,9
Phagosome,KEGG Pathways,0.0299,4
Biosynthesis of antibiotics,KEGG Pathways,0.0694,6
protein folding,GO biological_process,0.0694,3
generation of precursor metabolites and energy,GO biological_process,0.0694,4
aberrant cellular response to stress,Dictybase Phenotypes,0.0694,2
cytoplasmic vesicle,GO cellular_component,0.111,6




***  C7 selected: 64 with EID: 64
Ratio of genes annotated with a gene set in reference 0.32 and query 0.16
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 6 out of 10 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
aberrant cell-cell adhesion,Dictybase Phenotypes,0.000986,2
anatomical structure development,GO biological_process,0.0572,4
aberrant fruiting body morphology,Dictybase Phenotypes,0.059,2
extracellular region,GO cellular_component,0.0993,2




***  C8 selected: 61 with EID: 59
Ratio of genes annotated with a gene set in reference 0.32 and query 0.19
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 6 out of 11 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
anatomical structure development,GO biological_process,0.208,4
extracellular region,GO cellular_component,0.224,2
wild type,Dictybase Phenotypes,0.224,3




***  C9 selected: 58 with EID: 58
Ratio of genes annotated with a gene set in reference 0.32 and query 0.21
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 9 out of 12 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,0.0226,4
cell adhesion,GO biological_process,0.0226,3
hydrolase activity,GO molecular_function,0.044,2
cell differentiation,GO biological_process,0.044,3
anatomical structure formation involved in morphogenesis,GO biological_process,0.0885,2




***  C10 selected: 57 with EID: 57
Ratio of genes annotated with a gene set in reference 0.32 and query 0.25
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 11 out of 14 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,1.89e-05,7
cell-cell signaling,GO biological_process,0.0211,2
decreased spore viability,Dictybase Phenotypes,0.12,2
peptidase activity,GO molecular_function,0.142,2
plasma membrane,GO cellular_component,0.17,3
anatomical structure development,GO biological_process,0.17,4




***  C11 selected: 55 with EID: 55
Ratio of genes annotated with a gene set in reference 0.32 and query 0.44
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 18 out of 24 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cytoskeletal protein binding,GO molecular_function,3.76e-07,10
Endocytosis,KEGG Pathways,0.00141,5
cellular component assembly,GO biological_process,0.0115,6
protein-containing complex assembly,GO biological_process,0.0115,5
cytoskeleton organization,GO biological_process,0.0115,6
response to stress,GO biological_process,0.0249,7
cytoskeleton,GO cellular_component,0.0255,5
cytosol,GO cellular_component,0.132,4
cytoplasmic vesicle,GO cellular_component,0.132,4
plasma membrane,GO cellular_component,0.233,4




***  C12 selected: 54 with EID: 54
Ratio of genes annotated with a gene set in reference 0.32 and query 0.2
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 4 out of 11 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Phagosome,KEGG Pathways,0.16,2
decreased spore viability,Dictybase Phenotypes,0.16,2




***  C13 selected: 46 with EID: 45
Ratio of genes annotated with a gene set in reference 0.32 and query 0.33
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 10 out of 15 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,0.000326,6
anatomical structure formation involved in morphogenesis,GO biological_process,0.0285,3
cell differentiation,GO biological_process,0.124,3
oxidoreductase activity,GO molecular_function,0.16,3




***  C14 selected: 45 with EID: 45
Ratio of genes annotated with a gene set in reference 0.32 and query 0.13
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 3 out of 6 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cytosol,GO cellular_component,0.235,2
response to stress,GO biological_process,0.243,2




***  C15 selected: 41 with EID: 41
Ratio of genes annotated with a gene set in reference 0.32 and query 0.98
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 35 out of 40 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Proteasome,KEGG Pathways,2.2e-55,29
peptidase activity,GO molecular_function,8.26e-24,21
catabolic process,GO biological_process,4.25e-20,25




***  C16 selected: 40 with EID: 40
Ratio of genes annotated with a gene set in reference 0.32 and query 0.38
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 12 out of 15 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
aberrant CRAC localization to the plasma membrane,Dictybase Phenotypes,0.0196,2
translucent sorus,Dictybase Phenotypes,0.0213,2
development arrests at tipped mound stage,Dictybase Phenotypes,0.0213,2
decreased intracellular cAMP level,Dictybase Phenotypes,0.0213,2
cell death,GO biological_process,0.0376,2
anatomical structure development,GO biological_process,0.0376,6
anatomical structure formation involved in morphogenesis,GO biological_process,0.0376,3
kinase activity,GO molecular_function,0.0508,5
RNA degradation,KEGG Pathways,0.0574,2
cell differentiation,GO biological_process,0.0574,3




***  C17 selected: 36 with EID: 36
Ratio of genes annotated with a gene set in reference 0.32 and query 0.53
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 18 out of 19 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cell cycle,GO biological_process,8.25e-13,13
chromosome segregation,GO biological_process,1.81e-09,6
mitotic nuclear division,GO biological_process,5.5e-08,5
chromosome organization,GO biological_process,3.03e-07,7
mitotic cell cycle,GO biological_process,1.66e-06,7
DNA replication,KEGG Pathways,2.06e-06,5
chromosome,GO cellular_component,2.75e-06,6
cell division,GO biological_process,2.75e-06,7
cytoskeleton,GO cellular_component,5.99e-05,7
microtubule organizing center,GO cellular_component,0.00617,3




***  C18 selected: 30 with EID: 30
Ratio of genes annotated with a gene set in reference 0.32 and query 0.7
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 20 out of 21 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Ribosome biogenesis in eukaryotes,KEGG Pathways,1.08e-15,12
nucleolus,GO cellular_component,8.61e-14,11
ribosome biogenesis,GO biological_process,6.18e-13,10
ribonucleoprotein complex assembly,GO biological_process,1.1e-05,4
RNA binding,GO molecular_function,0.000566,6
rRNA binding,GO molecular_function,0.00545,2
protein-containing complex assembly,GO biological_process,0.0142,4
cellular component assembly,GO biological_process,0.0441,4
helicase activity,GO molecular_function,0.0526,2
methyltransferase activity,GO molecular_function,0.0563,2




***  C19 selected: 23 with EID: 23
Ratio of genes annotated with a gene set in reference 0.32 and query 0.26
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 3 out of 6 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
kinase activity,GO molecular_function,0.0471,3




***  C20 selected: 20 with EID: 20
Ratio of genes annotated with a gene set in reference 0.32 and query 0.0
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 0 out of 0 query genes used for enrichment calculation.


None



***  C21 selected: 19 with EID: 19
Ratio of genes annotated with a gene set in reference 0.32 and query 0.05
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 0 out of 1 query genes used for enrichment calculation.


None





### Enrichment for mergedGenes_minExpressed0.990.1Strains1Min1Max18_clustersAX4Louvain0.4m0s1log.pdf

In [41]:
# Gene set enrichment for each regulon (clusters 'C1'..'Cn') of this clustering.
# Every returned result table is saved as a tab-separated file, one per cluster.
regulons=pd.read_table(pathRegulons+'mergedGenes_minExpressed0.990.1Strains1Min1Max18_clustersAX4Louvain0.4m0s1log.tab',index_col=0)
n_clusters=len(regulons['Cluster'].unique())
enrichment_prefix=pathRegulons+'enrichment/minExpressed0.990.1Strains1Min1Max18_clustersAX4Louvain0.4m0s1log_enrichment_cluster'
for group in range(1,n_clusters+1):
    enriched=group_diff_enrichment(data=regulons,group='C'+str(group),min_overlap=2)
    # group_diff_enrichment may return None (printed as "None" in the output); nothing to save then
    if enriched is None:
        continue
    enriched.to_csv(enrichment_prefix+str(group)+'.tsv', sep='\t',index=False)

***  C1 selected: 96 with EID: 96
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 30 out of 96 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
anatomical structure development,GO biological_process,2.66e-05,16
cell differentiation,GO biological_process,0.00827,7
signal transduction,GO biological_process,0.0147,10
development arrests at loose mound stage,Dictybase Phenotypes,0.0147,3
Glycosaminoglycan degradation,KEGG Pathways,0.0218,2
extracellular region,GO cellular_component,0.0252,6
aberrant aggregation,Dictybase Phenotypes,0.0252,4
increased slug migration,Dictybase Phenotypes,0.0252,2
decreased aggregate size,Dictybase Phenotypes,0.0265,3
decreased fruiting body size,Dictybase Phenotypes,0.034,4




***  C2 selected: 89 with EID: 88
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 10 out of 88 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,0.00263,7
anatomical structure formation involved in morphogenesis,GO biological_process,0.019,4
Starch and sucrose metabolism,KEGG Pathways,0.102,2
cell differentiation,GO biological_process,0.102,4




***  C3 selected: 87 with EID: 87
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 36 out of 87 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
external encapsulating structure,GO cellular_component,7.49e-09,6
cell wall,GO cellular_component,7.49e-09,6
aberrant spore coat morphology,Dictybase Phenotypes,3.47e-07,5
cell wall organization or biogenesis,GO biological_process,9.7e-07,5
anatomical structure formation involved in morphogenesis,GO biological_process,1.47e-05,7
anatomical structure development,GO biological_process,0.000151,13
extracellular region,GO cellular_component,0.000189,8
abolished cellulose binding,Dictybase Phenotypes,0.000541,2
cell differentiation,GO biological_process,0.000698,7
aberrant cell-cell adhesion,Dictybase Phenotypes,0.00427,2




***  C4 selected: 85 with EID: 85
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 7 out of 85 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,0.00697,7




***  C5 selected: 73 with EID: 73
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 2 out of 73 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cell-cell signaling,GO biological_process,0.0361,2




***  C6 selected: 72 with EID: 72
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 10 out of 72 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,6.28e-07,10




***  C7 selected: 70 with EID: 70
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 12 out of 70 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,0.154,4
kinase activity,GO molecular_function,0.154,6
cell adhesion,GO biological_process,0.227,2




***  C8 selected: 65 with EID: 63
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 8 out of 63 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
external encapsulating structure,GO cellular_component,0.0174,2
cell wall,GO cellular_component,0.0174,2
anatomical structure formation involved in morphogenesis,GO biological_process,0.0381,3
hydrolase activity,GO molecular_function,0.0911,2
lipid binding,GO molecular_function,0.114,2
cell differentiation,GO biological_process,0.114,3
extracellular region,GO cellular_component,0.12,3




***  C9 selected: 57 with EID: 56
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 6 out of 56 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cell adhesion,GO biological_process,0.0123,4
decreased spore viability,Dictybase Phenotypes,0.18,2




***  C10 selected: 54 with EID: 54
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 26 out of 54 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
mitochondrion,GO cellular_component,7.61e-08,11
Oxidative phosphorylation,KEGG Pathways,7.61e-08,7
Metabolic pathways,KEGG Pathways,9.01e-06,14
generation of precursor metabolites and energy,GO biological_process,1.52e-05,6
protein folding,GO biological_process,0.000212,4
unfolded protein binding,GO molecular_function,0.00176,3
oxidoreductase activity,GO molecular_function,0.00207,6
Phagosome,KEGG Pathways,0.0103,3
transmembrane transporter activity,GO molecular_function,0.0242,4
transmembrane transport,GO biological_process,0.0303,4




***  C11 selected: 54 with EID: 54
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 44 out of 54 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Ribosome,KEGG Pathways,1.13e-86,44
ribosome,GO cellular_component,4.68e-85,43
structural constituent of ribosome,GO molecular_function,8.59e-76,39
structural molecule activity,GO molecular_function,4.2499999999999997e-69,39
translation,GO biological_process,2.73e-62,39
RNA binding,GO molecular_function,4.33e-10,11
rRNA binding,GO molecular_function,6.82e-07,4




***  C12 selected: 49 with EID: 49
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 13 out of 49 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cell differentiation,GO biological_process,0.00188,6
anatomical structure formation involved in morphogenesis,GO biological_process,0.00885,4
DNA-binding transcription factor activity,GO molecular_function,0.0129,3
reproduction,GO biological_process,0.0207,4
anatomical structure development,GO biological_process,0.0265,7
decreased gene expression,Dictybase Phenotypes,0.0374,3
Starch and sucrose metabolism,KEGG Pathways,0.0478,2
DNA binding,GO molecular_function,0.06,4
decreased spore viability,Dictybase Phenotypes,0.0725,2
generation of precursor metabolites and energy,GO biological_process,0.0955,2




***  C13 selected: 49 with EID: 49
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 42 out of 49 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Proteasome,KEGG Pathways,1.18e-65,29
peptidase activity,GO molecular_function,1.24e-31,21
catabolic process,GO biological_process,4.33e-29,25
Protein processing in endoplasmic reticulum,KEGG Pathways,0.0525,3
enzyme regulator activity,GO molecular_function,0.0525,3
response to stress,GO biological_process,0.0713,5
endoplasmic reticulum,GO cellular_component,0.0768,3
ATPase activity,GO molecular_function,0.13,3
cytoplasmic vesicle,GO cellular_component,0.178,3




***  C14 selected: 43 with EID: 43
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 21 out of 43 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cytoskeletal protein binding,GO molecular_function,2.38e-10,11
cellular component assembly,GO biological_process,8.13e-06,8
protein-containing complex assembly,GO biological_process,8.13e-06,7
cytoskeleton organization,GO biological_process,8.13e-06,8
cytoskeleton,GO cellular_component,3.66e-05,7
Endocytosis,KEGG Pathways,5.73e-05,5
response to stress,GO biological_process,0.00154,7
cytosol,GO cellular_component,0.00469,5
plasma membrane,GO cellular_component,0.0136,5
decreased cell motility,Dictybase Phenotypes,0.0137,3




***  C15 selected: 38 with EID: 38
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 18 out of 38 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cell cycle,GO biological_process,5.3e-14,13
chromosome segregation,GO biological_process,1.94e-10,6
mitotic nuclear division,GO biological_process,7.84e-09,5
chromosome organization,GO biological_process,2.61e-08,7
mitotic cell cycle,GO biological_process,1.45e-07,7
DNA replication,KEGG Pathways,2.74e-07,5
cell division,GO biological_process,2.74e-07,7
chromosome,GO cellular_component,3.05e-07,6
cytoskeleton,GO cellular_component,5.47e-06,7
cytoskeleton organization,GO biological_process,0.00117,5




***  C16 selected: 35 with EID: 35
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 12 out of 35 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
aberrant CRAC localization to the plasma membrane,Dictybase Phenotypes,0.0112,2
development arrests at tipped mound stage,Dictybase Phenotypes,0.0243,2
anatomical structure formation involved in morphogenesis,GO biological_process,0.0378,3
cell death,GO biological_process,0.0389,2
RNA degradation,KEGG Pathways,0.0437,2
kinase activity,GO molecular_function,0.0437,4
anatomical structure development,GO biological_process,0.0437,5
cell differentiation,GO biological_process,0.0437,3
aberrant aggregation,Dictybase Phenotypes,0.0437,2
decreased chemotaxis to cAMP,Dictybase Phenotypes,0.0437,2




***  C17 selected: 31 with EID: 31
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 0 out of 31 query genes used for enrichment calculation.


None



***  C18 selected: 31 with EID: 31
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 20 out of 31 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Ribosome biogenesis in eukaryotes,KEGG Pathways,7.7e-19,12
nucleolus,GO cellular_component,9.74e-17,11
ribosome biogenesis,GO biological_process,1.15e-15,10
ribonucleoprotein complex assembly,GO biological_process,8.06e-07,4
RNA binding,GO molecular_function,1.36e-05,6
rRNA binding,GO molecular_function,0.00141,2
protein-containing complex assembly,GO biological_process,0.00141,4
cellular component assembly,GO biological_process,0.00446,4
helicase activity,GO molecular_function,0.0168,2
methyltransferase activity,GO molecular_function,0.0182,2




***  C19 selected: 20 with EID: 20
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 18 out of 20 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
ribosome,GO cellular_component,4.29e-33,17
Ribosome,KEGG Pathways,4.7e-33,17
structural constituent of ribosome,GO molecular_function,1.1199999999999999e-28,15
structural molecule activity,GO molecular_function,1.4600000000000002e-26,15
translation,GO biological_process,5.8e-22,14
rRNA binding,GO molecular_function,3.78e-06,3
RNA binding,GO molecular_function,1.82e-05,5
cytosol,GO cellular_component,3.81e-05,5
ribosome biogenesis,GO biological_process,0.000302,3
delayed development,Dictybase Phenotypes,0.0117,2




***  C20 selected: 1 with EID: 1
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 0 out of 1 query genes used for enrichment calculation.


None





#### Using only genes annotated with at least one gene set

In [42]:
# Repeat the per-cluster enrichment with use_annotated_genes=True, i.e. reference and query
# are restricted to genes that have at least one gene set annotation.
regulons=pd.read_table(pathRegulons+'mergedGenes_minExpressed0.990.1Strains1Min1Max18_clustersAX4Louvain0.4m0s1log.tab',index_col=0)
for group in range(1,len(regulons['Cluster'].unique())+1):
    enriched=group_diff_enrichment(data=regulons,group='C'+str(group),min_overlap=2,use_annotated_genes=True)
    if enriched is not None:
        # Save under a distinct '_annotated' name: the run without use_annotated_genes writes
        # '..._enrichment_cluster<N>.tsv', and reusing that name here would silently overwrite
        # those results (and mix stale files from both runs when one of them returns None).
        # NOTE(review): update any downstream reader that expects annotated results under the old name.
        enriched.to_csv(pathRegulons+'enrichment/minExpressed0.990.1Strains1Min1Max18_clustersAX4Louvain0.4m0s1log_enrichment_annotated_cluster'+str(group)+'.tsv', sep='\t',index=False)

***  C1 selected: 96 with EID: 96
Ratio of genes annotated with a gene set in reference 0.32 and query 0.39
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 30 out of 37 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
anatomical structure development,GO biological_process,6.78e-05,16
cell differentiation,GO biological_process,0.0202,7
Glycosaminoglycan degradation,KEGG Pathways,0.0313,2
signal transduction,GO biological_process,0.0313,10
development arrests at loose mound stage,Dictybase Phenotypes,0.0313,3
aberrant aggregation,Dictybase Phenotypes,0.0418,4
increased slug migration,Dictybase Phenotypes,0.0418,2
extracellular region,GO cellular_component,0.0436,6
decreased aggregate size,Dictybase Phenotypes,0.0436,3
decreased fruiting body size,Dictybase Phenotypes,0.0604,4




***  C2 selected: 89 with EID: 88
Ratio of genes annotated with a gene set in reference 0.32 and query 0.2
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 10 out of 18 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,6.5e-05,7
anatomical structure formation involved in morphogenesis,GO biological_process,0.0029,4
cell differentiation,GO biological_process,0.0242,4
Starch and sucrose metabolism,KEGG Pathways,0.0425,2




***  C3 selected: 87 with EID: 87
Ratio of genes annotated with a gene set in reference 0.32 and query 0.45
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 34 out of 39 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
external encapsulating structure,GO cellular_component,4.58e-08,6
cell wall,GO cellular_component,4.58e-08,6
aberrant spore coat morphology,Dictybase Phenotypes,1.63e-06,5
cell wall organization or biogenesis,GO biological_process,4.53e-06,5
anatomical structure formation involved in morphogenesis,GO biological_process,0.000105,7
abolished cellulose binding,Dictybase Phenotypes,0.00141,2
extracellular region,GO cellular_component,0.00152,8
anatomical structure development,GO biological_process,0.00185,13
cell differentiation,GO biological_process,0.00448,7
aberrant cell-cell adhesion,Dictybase Phenotypes,0.00828,2




***  C4 selected: 85 with EID: 85
Ratio of genes annotated with a gene set in reference 0.32 and query 0.24
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 7 out of 20 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,0.000488,7




***  C5 selected: 73 with EID: 73
Ratio of genes annotated with a gene set in reference 0.32 and query 0.11
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 5 out of 8 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cell-cell signaling,GO biological_process,0.00391,2
anatomical structure development,GO biological_process,0.0503,4
extracellular region,GO cellular_component,0.134,2
signal transduction,GO biological_process,0.233,2
wild type,Dictybase Phenotypes,0.233,2




***  C6 selected: 72 with EID: 72
Ratio of genes annotated with a gene set in reference 0.32 and query 0.24
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 12 out of 17 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,3.29e-09,10
wild type,Dictybase Phenotypes,0.154,5




***  C7 selected: 70 with EID: 70
Ratio of genes annotated with a gene set in reference 0.32 and query 0.26
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 12 out of 18 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
extracellular region,GO cellular_component,0.0617,4
kinase activity,GO molecular_function,0.0617,6
cell adhesion,GO biological_process,0.181,2




***  C8 selected: 65 with EID: 63
Ratio of genes annotated with a gene set in reference 0.32 and query 0.14
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 8 out of 9 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
external encapsulating structure,GO cellular_component,0.00297,2
cell wall,GO cellular_component,0.00297,2
anatomical structure formation involved in morphogenesis,GO biological_process,0.00297,3
extracellular region,GO cellular_component,0.0134,3
hydrolase activity,GO molecular_function,0.0134,2
cell differentiation,GO biological_process,0.0134,3
lipid binding,GO molecular_function,0.0179,2
anatomical structure development,GO biological_process,0.0266,4
protein transport,GO biological_process,0.115,2




***  C9 selected: 57 with EID: 56
Ratio of genes annotated with a gene set in reference 0.32 and query 0.2
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 6 out of 11 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cell adhesion,GO biological_process,0.00125,4
decreased spore viability,Dictybase Phenotypes,0.112,2




***  C10 selected: 54 with EID: 54
Ratio of genes annotated with a gene set in reference 0.32 and query 0.57
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 26 out of 31 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Oxidative phosphorylation,KEGG Pathways,6.18e-06,7
mitochondrion,GO cellular_component,1.39e-05,11
generation of precursor metabolites and energy,GO biological_process,0.000471,6
Metabolic pathways,KEGG Pathways,0.0019,14
protein folding,GO biological_process,0.0019,4
unfolded protein binding,GO molecular_function,0.00933,3
oxidoreductase activity,GO molecular_function,0.0362,6
Phagosome,KEGG Pathways,0.0514,3
Citrate cycle (TCA cycle),KEGG Pathways,0.124,2
Propanoate metabolism,KEGG Pathways,0.126,2




***  C11 selected: 54 with EID: 54
Ratio of genes annotated with a gene set in reference 0.32 and query 0.89
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 44 out of 48 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Ribosome,KEGG Pathways,7.620000000000001e-70,44
ribosome,GO cellular_component,2.1799999999999997e-68,43
structural constituent of ribosome,GO molecular_function,4.2199999999999996e-60,39
structural molecule activity,GO molecular_function,2.02e-53,39
translation,GO biological_process,1.2299999999999999e-46,39
RNA binding,GO molecular_function,1.25e-05,11
rRNA binding,GO molecular_function,3.87e-05,4




***  C12 selected: 49 with EID: 49
Ratio of genes annotated with a gene set in reference 0.32 and query 0.31
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 13 out of 15 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cell differentiation,GO biological_process,0.000742,6
anatomical structure formation involved in morphogenesis,GO biological_process,0.0058,4
DNA-binding transcription factor activity,GO molecular_function,0.01,3
reproduction,GO biological_process,0.0111,4
anatomical structure development,GO biological_process,0.0111,7
decreased gene expression,Dictybase Phenotypes,0.0298,3
Starch and sucrose metabolism,KEGG Pathways,0.0425,2
DNA binding,GO molecular_function,0.0528,4
decreased spore viability,Dictybase Phenotypes,0.0697,2
cytoskeleton organization,GO biological_process,0.0864,3




***  C13 selected: 49 with EID: 49
Ratio of genes annotated with a gene set in reference 0.32 and query 0.94
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 35 out of 46 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Proteasome,KEGG Pathways,1.94e-52,29
peptidase activity,GO molecular_function,4.64e-22,21
catabolic process,GO biological_process,6.21e-18,25




***  C14 selected: 43 with EID: 43
Ratio of genes annotated with a gene set in reference 0.32 and query 0.51
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 20 out of 22 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cytoskeletal protein binding,GO molecular_function,8.28e-09,11
cellular component assembly,GO biological_process,0.000128,8
protein-containing complex assembly,GO biological_process,0.000128,7
cytoskeleton organization,GO biological_process,0.000128,8
cytoskeleton,GO cellular_component,0.000444,7
Endocytosis,KEGG Pathways,0.000444,5
response to stress,GO biological_process,0.0182,7
cytosol,GO cellular_component,0.0316,5
cell morphogenesis,GO biological_process,0.0462,2
decreased cell motility,Dictybase Phenotypes,0.0484,3




***  C15 selected: 38 with EID: 38
Ratio of genes annotated with a gene set in reference 0.32 and query 0.5
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 18 out of 19 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
cell cycle,GO biological_process,8.25e-13,13
chromosome segregation,GO biological_process,1.81e-09,6
mitotic nuclear division,GO biological_process,5.5e-08,5
chromosome organization,GO biological_process,3.03e-07,7
mitotic cell cycle,GO biological_process,1.66e-06,7
DNA replication,KEGG Pathways,2.06e-06,5
chromosome,GO cellular_component,2.75e-06,6
cell division,GO biological_process,2.75e-06,7
cytoskeleton,GO cellular_component,5.99e-05,7
microtubule organizing center,GO cellular_component,0.00617,3




***  C16 selected: 35 with EID: 35
Ratio of genes annotated with a gene set in reference 0.32 and query 0.4
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 12 out of 14 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
aberrant CRAC localization to the plasma membrane,Dictybase Phenotypes,0.0168,2
development arrests at tipped mound stage,Dictybase Phenotypes,0.0366,2
cell death,GO biological_process,0.0488,2
anatomical structure formation involved in morphogenesis,GO biological_process,0.0488,3
RNA degradation,KEGG Pathways,0.0598,2
anatomical structure development,GO biological_process,0.0598,5
cell differentiation,GO biological_process,0.0598,3
aberrant aggregation,Dictybase Phenotypes,0.0598,2
decreased chemotaxis to cAMP,Dictybase Phenotypes,0.0598,2
kinase activity,GO molecular_function,0.066,4




***  C17 selected: 31 with EID: 31
Ratio of genes annotated with a gene set in reference 0.32 and query 0.26
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 2 out of 8 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
oxidoreductase activity,GO molecular_function,0.231,2




***  C18 selected: 31 with EID: 31
Ratio of genes annotated with a gene set in reference 0.32 and query 0.71
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 20 out of 22 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
Ribosome biogenesis in eukaryotes,KEGG Pathways,3.09e-15,12
nucleolus,GO cellular_component,2.23e-13,11
ribosome biogenesis,GO biological_process,1.47e-12,10
ribonucleoprotein complex assembly,GO biological_process,1.76e-05,4
RNA binding,GO molecular_function,0.000985,6
rRNA binding,GO molecular_function,0.00786,2
protein-containing complex assembly,GO biological_process,0.0221,4
cellular component assembly,GO biological_process,0.0678,4
helicase activity,GO molecular_function,0.0754,2
methyltransferase activity,GO molecular_function,0.0806,2




***  C19 selected: 20 with EID: 20
Ratio of genes annotated with a gene set in reference 0.32 and query 0.9
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 18 out of 18 query genes used for enrichment calculation.


Gene set,Ontology,FDR,N in query
ribosome,GO cellular_component,1.92e-26,17
Ribosome,KEGG Pathways,2.1e-26,17
structural constituent of ribosome,GO molecular_function,1.65e-22,15
structural molecule activity,GO molecular_function,2.13e-20,15
translation,GO biological_process,3.87e-16,14
rRNA binding,GO molecular_function,8.21e-05,3
RNA binding,GO molecular_function,0.0023,5
cytosol,GO cellular_component,0.00453,5
ribosome biogenesis,GO biological_process,0.006,3
delayed development,Dictybase Phenotypes,0.083,2




***  C20 selected: 1 with EID: 1
Ratio of genes annotated with a gene set in reference 0.32 and query 0.0
Enrichment at FDR: 0.25 and min query - gene set overlap 2
N query genes in displayed gene sets: 0 out of 0 query genes used for enrichment calculation.


None



