In [10]:
import os, glob
import numpy as np
import pandas as pd
import gseapy as gp

In [11]:
def signature_key(fn):
    """Build the '<dataset>_<name>' key for a signature file path.

    The dataset is the directory two levels above the file (third path
    component from the end) and the name is the file name without its
    extension, e.g. 'results/GSE136689/<subdir>/early.txt' -> 'GSE136689_early'.
    """
    # Normalise first so the split works with whatever separator glob returned.
    parts = os.path.normpath(fn).split(os.sep)
    # splitext only strips the trailing extension, never a '.txt' inside a directory name.
    stem = os.path.splitext(parts[-1])[0]
    return '_'.join([parts[-3], stem])


# Sorted so the load order (and the printed output) is the same on every run.
signature_fn = sorted(glob.glob('results/*/*/*.txt'))

# Map each key straight to its loaded table instead of storing paths first and
# overwriting them in place, so the dict only ever holds DataFrames.
signature_dict = {signature_key(fn): pd.read_csv(fn) for fn in signature_fn}

for key, signature in signature_dict.items():
    print(key, signature)

GSE136689_early    mmusculus hsapiens
0       Dab2     DAB2
1      Map1b    MAP1B
2       Irx1     IRX1
3      Rbm24    RBM24
4      Sox11    SOX11
..       ...      ...
65     Nop58    NOP58
66     Aldoa    ALDOA
67       Mmd      MMD
68   Slc16a3  SLC16A3
69    Mpped2   MPPED2

[70 rows x 2 columns]
GSE136689_late    mmusculus hsapiens
0        Id2      ID2
1     Homer2   HOMER2
2       Manf     MANF
3       Bex2     BEX1
4       Bex2     BEX2
5       Bex1     BEX1
6       Bex1     BEX2
7      Nr2f2    NR2F2
8       Wnt2     WNT2
9       Alx1     ALX1
10     Csrp2    CSRP2
11     Sfrp1    SFRP1
12      Krt8     KRT8
13     Krt18    KRT18
14    Phlda1   PHLDA1
15      Osr1     OSR1
16      Mycn     MYCN
17      Rbp1     RBP1
18     Gata6    GATA6
19     Sfrp5    SFRP5
20     Rras2    RRAS2
21     Pmp22    PMP22
22     Cox17    COX17
23    Popdc2   POPDC2
24       Ddt     DDTL
25       Ddt      DDT
26   Smarcd3  SMARCD3
27      Ly6e     LY6E
28      Nrp1     NRP1
29     Dusp9    DUSP9


In [17]:
# Run Enrichr against the three GO libraries for every signature and write the
# significant GO term IDs to results/<dataset>/gseapy/<stage>.csv.
for key, signature in signature_dict.items():
    # Keys look like '<dataset>_<stage>' (e.g. 'GSE136689_early'). Split once so
    # a stage name that itself contains '_' is kept whole.
    dataset, stage = key.split('_', 1)

    # The ortholog table repeats a mouse symbol when it maps to several human
    # genes (e.g. Bex1, Bex2, Ddt), so submit each symbol only once.
    gene_list = signature['mmusculus'].dropna().drop_duplicates().tolist()

    enrichment = gp.enrichr(
        gene_list = gene_list,
        gene_sets = ['GO_Biological_Process_2025', 'GO_Cellular_Component_2025', 'GO_Molecular_Function_2025'],
        organism = 'Mouse',
        outdir = None,
        cutoff = .05
        )

    # Build a new frame in one chain rather than assigning a column on a slice
    # of the results table (avoids pandas' chained-assignment warning).
    df = (
        enrichment.results[['Term', 'Adjusted P-value']]
        # Keep only the GO accession from terms shaped like 'Name (GO:0000000)'.
        .assign(Term = lambda res: res['Term'].str.extract(r'\((GO:\d+)\)', expand = False))
        .loc[lambda res: res['Adjusted P-value'] < .05]
        # Stable sort keeps tied p-values in their original order across runs.
        .sort_values('Adjusted P-value', kind = 'stable')
    )

    if df.empty:
        print(key, 'No significant GO terms found.')
    else:
        gseapy_dir_fn = os.path.join('results', dataset, 'gseapy')
        os.makedirs(gseapy_dir_fn, exist_ok = True)
        # Path is assembled outside the f-string: reusing the same quote inside
        # an f-string expression is a SyntaxError before Python 3.12.
        df.to_csv(os.path.join(gseapy_dir_fn, f'{stage}.csv'), index = False)
        print(key, df.head(), '\n')

GSE136689_early          Term  Adjusted P-value
0  GO:0030182      4.704480e-07
1  GO:0048699      4.704480e-07
2  GO:0045597      1.548557e-06
3  GO:0003229      3.944875e-06
4  GO:0055010      1.721078e-05 

GSE136689_late            Term  Adjusted P-value
0    GO:0045893      5.219443e-08
1    GO:0045944      8.695259e-07
2    GO:0006357      1.446692e-05
599  GO:0043565      7.464384e-05
3    GO:0007507      1.529709e-04 

GSE162534_early          Term  Adjusted P-value
0  GO:0016126          0.001437
1  GO:1902653          0.010305
2  GO:0006695          0.010305
3  GO:0006694          0.010305
4  GO:0008203          0.010305 

GSE162534_late          Term  Adjusted P-value
0  GO:0007507      6.601746e-14
1  GO:0060976      2.871516e-10
2  GO:0006357      6.408609e-10
3  GO:0008284      1.757656e-09
4  GO:0007369      1.885291e-09 

GSE201257_early             Term  Adjusted P-value
1023  GO:1990837      2.330689e-08
1024  GO:0003690      2.330689e-08
0     GO:0009952      2.69487