In [8]:
import os
import sys
from importlib import reload

# Put the absolute path of the parent directory at the front of sys.path (only once),
# presumably so that the project package `deR` imported below can be found - confirm.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.insert(0,module_path)

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display

from orangecontrib.bioinformatics.geneset.__init__ import (list_all, load_gene_sets)

import deR.enrichment_library as enr
reload(enr)


<module 'deR.enrichment_library' from '/home/karin/Documents/git/baylor-dicty/deR/enrichment_library.py'>

In [9]:
# Input directories. These are absolute paths on the author's machine and must be adjusted
# before running elsewhere. Each one ends with '/' because file names are appended to it
# by plain string concatenation in the loading cells.
pathSelected = '/home/karin/Documents/timeTrajectories/data/regulons/selected_genes/'
pathDE = '/home/karin/Documents/timeTrajectories/data/deTime/de_time_impulse/'
dataPath='/home/karin/Documents/timeTrajectories/data/RPKUM/combined/'

In [10]:
# Get all gene sets
# The listing returned by list_all for organism '44689' is materialised into a list.
# NOTE(review): '44689' is presumably the NCBI taxonomy ID of the studied organism - confirm.
gene_sets=list(list_all(organism='44689'))
# Gene set collection that group_diff_enrichment passes to enr.gene_set_enrichment.
# NOTE(review): go_slims=True presumably limits GO gene sets to GO slims - confirm in enr.get_gene_sets.
GENE_SETS_ONTOLOGY=enr.get_gene_sets(gene_set_names=gene_sets, go_slims=True)

In [12]:
# Gene table used to build the gene name -> EID conversion
genes = pd.read_csv(
    dataPath + 'mergedGenes_RPKUM.tsv',
    sep='\t',
    index_col=0,
)
# Index labels of the rows that have at least one non-zero value
all_gene_names = genes.loc[genes.ne(0).any(axis=1)].index
# Mapping built by enr.name_genes_entrez for those genes
# NOTE(review): key_entrez=False presumably makes gene names the keys - confirm in enr
NAME_EID = enr.name_genes_entrez(
    gene_names=all_gene_names,
    key_entrez=False,
)
# All retained genes converted with that mapping
ALL_GENE_NAMES_EID = enr.convert_EID(
    genes=all_gene_names,
    name_EID=NAME_EID,
)

In [13]:
# Read the table, transpose it so that genes are in the index, and give the strain-group
# columns names that can be used directly in DataFrame.query expressions
selected_genes = (
    pd.read_table(
        pathSelected + 'newGenes_noAll-removeSelf_selectedGenesByGroup_scalemean0std1_logTrue_kN6_splitStrain.tsv',
        index_col=0,
    )
    .T
    .rename(columns={
        '1Ag-': 'Ag',
        '2LAg': 'LAg',
        '3TA': 'TA',
        '4CD': 'CD',
        '5WT': 'WT',
        '6SFB': 'SFB',
        '7PD': 'PD',
    })
)

# Genes that were never expressed were never selected - thus they are not used in enrichment
# Check used for this: selected_genes.loc[genes[(genes == 0).all(axis=1)].index,:].max().max()
selected_genes = selected_genes.loc[all_gene_names, :]

In [14]:
# Read the 'selectedGenes1000ByGroup' table, transpose it so that genes are in the index,
# and give the strain-group columns names that can be used directly in DataFrame.query
selected_genes1000 = (
    pd.read_table(
        pathSelected + 'newGenes_noAll-removeSelf_selectedGenes1000ByGroup_scalemean0std1_logTrue_kN6_splitStrain.tsv',
        index_col=0,
    )
    .T
    .rename(columns={
        '1Ag-': 'Ag',
        '2LAg': 'LAg',
        '3TA': 'TA',
        '4CD': 'CD',
        '5WT': 'WT',
        '6SFB': 'SFB',
        '7PD': 'PD',
    })
)

# Keep only the rows listed in all_gene_names
selected_genes1000 = selected_genes1000.loc[all_gene_names, :]

In [15]:
# Read the DE table, transpose it so that genes are in the index, and strip the numeric
# prefix and '_DE' suffix from the strain-group columns so they work in DataFrame.query
selected_genes_DE = (
    pd.read_table(
        pathDE + 'DE_FDRe-11group.tsv',
        index_col=0,
    )
    .T
    .rename(columns={
        '1Ag-_DE': 'Ag',
        '2LAg_DE': 'LAg',
        '3TA_DE': 'TA',
        '4CD_DE': 'CD',
        '5WT_DE': 'WT',
        '6SFB_DE': 'SFB',
        '7PD_DE': 'PD',
    })
)

# Keep only the rows listed in all_gene_names
selected_genes_DE = selected_genes_DE.loc[all_gene_names, :]

In [16]:
def group_diff_enrichment(selected_genes:pd.DataFrame, condition:str,padj:float=0.25):
    """
    Print, display and return gene set enrichment for genes matching a query condition.

    Rows of selected_genes that satisfy condition form the query. Their index labels are
    converted with enr.convert_EID using the notebook-level NAME_EID mapping, and
    enr.gene_set_enrichment is run on them with ALL_GENE_NAMES_EID as reference and
    GENE_SETS_ONTOLOGY as gene sets. A summary line with the number of selected genes and
    the number of converted EIDs is printed, followed by the displayed result.

    :param selected_genes: Table with genes in the index; condition may refer to its columns.
    :param condition: Expression evaluated with selected_genes.query to pick the query genes.
    :param padj: Forwarded to enr.gene_set_enrichment as padj_threshold.
    :return: DataFrame with columns 'Gene set', 'Ontology', 'FDR' (padj formatted as a
        string with two decimals in scientific notation) and 'N in query', ordered by
        increasing padj. None if no gene was converted to an EID or nothing was enriched.
        The same object is also shown with display().
    """
    selected = selected_genes.query(condition).index
    query_EID = enr.convert_EID(genes=selected, name_EID=NAME_EID)
    print('*** Condition:',condition, ' selected:',len(selected),'with EID:',len(query_EID))
    result = None
    # Enrichment is only attempted when at least one query gene has an EID
    if len(query_EID) > 0:
        enrichment = enr.gene_set_enrichment(query_EID, reference_EID=ALL_GENE_NAMES_EID,
                                             padj_threshold=padj,
                                             gene_sets_ontology=GENE_SETS_ONTOLOGY)
        if len(enrichment) > 0:
            # Sort on the numeric padj before it is turned into a display string
            result = pd.DataFrame([
                {'Gene set': enriched.gene_set.name, 'Ontology': enriched.ontology,
                 'FDR': "{:.2e}".format(enriched.padj), 'N in query': enriched.in_query}
                for enriched in sorted(enrichment, key=lambda data: data.padj)
            ])
    print('Enrichment:')
    display(result)
    # Returned as well so that callers can collect or save the tables
    return result
        

In [18]:
# Enrichment on selected_genes for four strain-group patterns. Ag<=1, WT>=1 and PD >=1 are
# the same in every condition; going down the list, the LAg, then TA, then CD constraint is
# switched from a lower bound to '<=1'. SFB is not constrained in any condition.
# NOTE(review): the column values are presumably counts of strains per group - confirm.
for condition in ['Ag<=1 & LAg>=3 & TA>=1 & CD>=4 & WT>=1 & PD >=1',
                 'Ag<=1 & LAg<=1 & TA>=1 & CD>=4 & WT>=1 & PD >=1',
                 'Ag<=1 & LAg<=1 & TA<=1 & CD>=4 & WT>=1 & PD >=1',
                 'Ag<=1 & LAg<=1 & TA<=1 & CD<=1 & WT>=1 & PD >=1']:
    group_diff_enrichment(selected_genes,condition)

*** Condition: Ag<=1 & LAg>=3 & TA>=1 & CD>=4 & WT>=1 & PD >=1  selected: 739 with EID: 735
Enrichment:


Unnamed: 0,Gene set,Ontology,FDR,N in query
0,cell wall organization or biogenesis,"(GO, biological_process)",0.000176,8
1,extracellular region,"(GO, cellular_component)",0.00018,26
2,aberrant spore coat morphology,"(Dictybase, Phenotypes)",0.00018,7
3,external encapsulating structure,"(GO, cellular_component)",0.0333,5
4,cell wall,"(GO, cellular_component)",0.0333,5
5,anatomical structure development,"(GO, biological_process)",0.102,41


*** Condition: Ag<=1 & LAg<=1 & TA>=1 & CD>=4 & WT>=1 & PD >=1  selected: 157 with EID: 154
Enrichment:


None

*** Condition: Ag<=1 & LAg<=1 & TA<=1 & CD>=4 & WT>=1 & PD >=1  selected: 87 with EID: 86
Enrichment:


Unnamed: 0,Gene set,Ontology,FDR,N in query
0,precocious aggregation,"(Dictybase, Phenotypes)",0.16,2
1,decreased actin nucleation on plasma membranes,"(Dictybase, Phenotypes)",0.16,1
2,decreased lateral pseudopod traction,"(Dictybase, Phenotypes)",0.16,1
3,increased endocytic recycling,"(Dictybase, Phenotypes)",0.16,1


*** Condition: Ag<=1 & LAg<=1 & TA<=1 & CD<=1 & WT>=1 & PD >=1  selected: 256 with EID: 250
Enrichment:


None

In [19]:
# Enrichment on selected_genes_DE, one condition per strain group (Ag, LAg, TA, CD, PD);
# WT and SFB are not queried here.
# NOTE(review): the '==' thresholds presumably equal the number of strains in each group - confirm.
for condition in ['Ag==4',
                 'LAg==4',
                 'TA==2',
                  # Use 4 instead of 5 (all strains) for CD as gtaG has almost no DE genes
                 'CD>=4',
                 'PD==2']:
    group_diff_enrichment(selected_genes_DE,condition)

*** Condition: Ag==4  selected: 834 with EID: 827
Enrichment:


Unnamed: 0,Gene set,Ontology,FDR,N in query
0,extracellular region,"(GO, cellular_component)",2.46e-06,32
1,anatomical structure development,"(GO, biological_process)",2.46e-06,62
2,cell differentiation,"(GO, biological_process)",2.8e-06,30
3,anatomical structure formation involved in mor...,"(GO, biological_process)",0.00225,16
4,delayed tip formation,"(Dictybase, Phenotypes)",0.00699,6
5,decreased spore viability,"(Dictybase, Phenotypes)",0.00699,13
6,abolished stalk cell differentiation,"(Dictybase, Phenotypes)",0.0315,4
7,aberrant spore coat morphology,"(Dictybase, Phenotypes)",0.032,5
8,Starch and sucrose metabolism,"(KEGG, Pathways)",0.032,8
9,abolished sporulation,"(Dictybase, Phenotypes)",0.0362,7


*** Condition: LAg==4  selected: 1888 with EID: 1884
Enrichment:


Unnamed: 0,Gene set,Ontology,FDR,N in query
0,extracellular region,"(GO, cellular_component)",0.000524,48
1,external encapsulating structure,"(GO, cellular_component)",0.00158,9
2,cell wall,"(GO, cellular_component)",0.00158,9
3,decreased spore viability,"(Dictybase, Phenotypes)",0.00158,23
4,anatomical structure formation involved in mor...,"(GO, biological_process)",0.00158,26
5,cell differentiation,"(GO, biological_process)",0.00158,42
6,anatomical structure development,"(GO, biological_process)",0.00583,96
7,extracellular matrix organization,"(GO, biological_process)",0.03,4
8,ribosome biogenesis,"(GO, biological_process)",0.03,20
9,aberrant spore coat morphology,"(Dictybase, Phenotypes)",0.0371,7


*** Condition: TA==2  selected: 444 with EID: 442
Enrichment:


Unnamed: 0,Gene set,Ontology,FDR,N in query
0,Metabolic pathways,"(KEGG, Pathways)",0.000499,48
1,Starch and sucrose metabolism,"(KEGG, Pathways)",0.00141,8
2,Amino sugar and nucleotide sugar metabolism,"(KEGG, Pathways)",0.00332,7
3,Galactose metabolism,"(KEGG, Pathways)",0.0064,5
4,anatomical structure formation involved in mor...,"(GO, biological_process)",0.0168,10
5,cell differentiation,"(GO, biological_process)",0.0339,14
6,carbohydrate metabolic process,"(GO, biological_process)",0.0404,10
7,anatomical structure development,"(GO, biological_process)",0.0722,28
8,Biosynthesis of secondary metabolites,"(KEGG, Pathways)",0.0865,18
9,Phagosome,"(KEGG, Pathways)",0.0865,7


*** Condition: CD>=4  selected: 27 with EID: 27
Enrichment:


None

*** Condition: PD==2  selected: 2055 with EID: 2045
Enrichment:


Unnamed: 0,Gene set,Ontology,FDR,N in query
0,extracellular region,"(GO, cellular_component)",9.77e-10,63
1,anatomical structure formation involved in mor...,"(GO, biological_process)",0.00668,27
2,Starch and sucrose metabolism,"(KEGG, Pathways)",0.00668,15
3,hydrolase activity,"(GO, molecular_function)",0.0234,17
4,cell differentiation,"(GO, biological_process)",0.0234,41
5,Metabolic pathways,"(KEGG, Pathways)",0.0234,143
6,One carbon pool by folate,"(KEGG, Pathways)",0.0241,8
7,anatomical structure development,"(GO, biological_process)",0.027,99
8,Amino sugar and nucleotide sugar metabolism,"(KEGG, Pathways)",0.0336,12
9,Galactose metabolism,"(KEGG, Pathways)",0.0336,8
