In [20]:
import os, glob
import numpy as np
import pandas as pd
import gseapy as gp
import matplotlib.pyplot as plt

In [13]:
def signature_key(path):
    """Build the '<dataset>_<name>' key for one signature file.

    The expected layout is ``.../<dataset>/<subdir>/<name>.<ext>``: the dataset
    is the directory two levels above the file, and the name is the file name
    without its extension.

    ``os.path`` is used instead of ``str.split('/')`` so the key does not depend
    on the platform's path separator, and only the real file extension is
    stripped (a '.txt' occurring inside the dataset name is kept).
    """
    dataset = os.path.basename(os.path.dirname(os.path.dirname(path)))
    name = os.path.splitext(os.path.basename(path))[0]
    return f'{dataset}_{name}'


# All biomart signature tables, in a stable (sorted) order.
signature_fn = sorted(glob.glob('../03_signatures/results/*/biomart/*.txt'))

# key -> DataFrame. Built in one pass so the dict never holds file paths.
signature_dict = {}
for fn in signature_fn:
    key = signature_key(fn)
    # Default comma separator, as in the original call.
    signature_dict[key] = pd.read_csv(fn)
    print(key, signature_dict[key])

GSE136689_early    mmusculus hsapiens
0       Dab2     DAB2
1      Map1b    MAP1B
2       Irx1     IRX1
3      Rbm24    RBM24
4      Sox11    SOX11
..       ...      ...
65     Nop58    NOP58
66     Aldoa    ALDOA
67       Mmd      MMD
68   Slc16a3  SLC16A3
69    Mpped2   MPPED2

[70 rows x 2 columns]
GSE136689_late    mmusculus hsapiens
0        Id2      ID2
1     Homer2   HOMER2
2       Manf     MANF
3       Bex2     BEX1
4       Bex2     BEX2
5       Bex1     BEX1
6       Bex1     BEX2
7      Nr2f2    NR2F2
8       Wnt2     WNT2
9       Alx1     ALX1
10     Csrp2    CSRP2
11     Sfrp1    SFRP1
12      Krt8     KRT8
13     Krt18    KRT18
14    Phlda1   PHLDA1
15      Osr1     OSR1
16      Mycn     MYCN
17      Rbp1     RBP1
18     Gata6    GATA6
19     Sfrp5    SFRP5
20     Rras2    RRAS2
21     Pmp22    PMP22
22     Cox17    COX17
23    Popdc2   POPDC2
24       Ddt     DDTL
25       Ddt      DDT
26   Smarcd3  SMARCD3
27      Ly6e     LY6E
28      Nrp1     NRP1
29     Dusp9    DUSP9


In [14]:
# Signature keys grouped under a component label ('PC1' / 'PC2').
# Each entry is a '<dataset>_<stage>' key that is looked up in signature_dict.
top_signatures = dict(
    PC1 = ['GSE162534_early', 'GSE162534_late', 'GSE201257_late', 'GSE229103_late'],
    PC2 = ['GSE136689_early', 'GSE136689_late', 'GSE229103_early'],
)

In [None]:
# Enrichr over-representation test (GO Molecular Function) on the pooled mouse
# genes of every signature assigned to a component. One TSV of significant
# GO IDs is written per component.
ALPHA = .05                        # adjusted p-value threshold for "significant"
GSEAPY_OUTDIR = 'results/gseapy'   # one '<pc>.txt' file is written here

for pc, signatures in top_signatures.items():
    # Pool the mouse symbols of all signatures for this component.
    gene_list = []
    for signature in signatures:
        # dropna: a missing symbol would be a float NaN and break the sort below.
        gene_list.extend(signature_dict[signature].mmusculus.dropna().tolist())
    # De-duplicate: a gene may appear in several signatures, and the tables
    # list a mouse gene once per human ortholog (e.g. Bex1/Bex2, Ddt).
    gene_list = sorted(set(gene_list))

    # NOTE(review): per the gseapy docs `cutoff` only affects Enrichr's own
    # report/figure and does not filter `.results` -- hence the explicit
    # filter below. Confirm for the installed gseapy version.
    # .copy() so the column assignment below acts on an independent frame
    # (avoids SettingWithCopyWarning on the column slice).
    df = gp.enrichr(
        gene_list = gene_list,
        gene_sets = ['GO_Molecular_Function_2025'],
        organism = 'Mouse',
        outdir = None,
        cutoff = ALPHA
        ).results[['Term', 'Adjusted P-value']].copy()

    # Keep only the GO accession from terms shaped like "<name> (GO:0000000)".
    # expand=False returns a Series; terms without an accession become NaN.
    df['Term'] = df.Term.str.extract(r'\((GO:\d+)\)', expand = False)
    df = df.loc[df['Adjusted P-value'] < ALPHA].sort_values('Adjusted P-value')

    if df.empty:
        print(pc, 'No significant GO terms found.')
    else:
        # Create the output folder on demand so a fresh checkout does not fail.
        os.makedirs(GSEAPY_OUTDIR, exist_ok = True)
        df.to_csv(f'{GSEAPY_OUTDIR}/{pc}.txt', index = False, sep = '\t')
        print(pc, df.head(), '\n')

PC1          Term  Adjusted P-value
0  GO:1990837      2.095216e-18
1  GO:0043565      7.238997e-17
2  GO:0003690      1.699094e-16
3  GO:0000977      2.413187e-11
4  GO:0000978      9.588516e-11 

PC2          Term  Adjusted P-value
0  GO:0043565      8.907472e-11
1  GO:1990837      1.674723e-10
2  GO:0003690      5.081938e-10
3  GO:0000977      1.184278e-09
4  GO:0000978      6.446386e-08 



In [17]:
# Load one REVIGO table per component label and keep only the rows whose
# 'Representative' field is missing.
# NOTE(review): presumably those rows are the cluster representatives
# themselves -- confirm against the REVIGO export format.
revigo_dict = {}
for pc in top_signatures:
    revigo_table = pd.read_csv(f'results/revigo/{pc}.tsv', sep = '\t', index_col = 0)
    lacks_representative = revigo_table.Representative.isna()
    revigo_dict[pc] = revigo_table[lacks_representative]
    print(pc, revigo_dict[pc])

PC1                                                          Name      Value  \
TermID                                                                     
GO:0001228  DNA-binding transcription activator activity, ...  -5.609599   
GO:0004714  transmembrane receptor protein tyrosine kinase...  -4.703099   
GO:0005104          fibroblast growth factor receptor binding  -1.316386   
GO:0005109                                   frizzled binding  -2.513445   
GO:0005501                                   retinoid binding  -1.803079   
GO:0009008                     DNA-methyltransferase activity  -1.669648   
GO:0033550           MAP kinase tyrosine phosphatase activity  -1.669648   
GO:0045296                                   cadherin binding  -1.856368   
GO:0048018                           receptor ligand activity  -2.807382   
GO:0048407             platelet-derived growth factor binding  -2.238479   
GO:0070016                    armadillo repeat domain binding  -1.423038   
GO:00706

In [37]:
# ix_shared = revigo_dict['PC1'].index.intersection(revigo_dict['PC2'].index)

# plot_df = pd.DataFrame({
#     'Description': revigo_dict['PC1'].loc[ix_shared, 'Name'],
#     'PC1': revigo_dict['PC1'].loc[ix_shared, 'Value'],
#     'PC2': revigo_dict['PC2'].loc[ix_shared, 'Value']
# })
# plot_df['mean'] = plot_df[['PC1', 'PC2']].mean(1)
# plot_df = plot_df.sort_values('mean')  # sort if you want clearer visual comparison

# # Plot paired bar plot
# plt.figure(figsize=(4, 6))
# x = range(len(plot_df))
# bar_width = 0.4

# plt.barh([i - bar_width/2 for i in x], plot_df['PC1'], height=bar_width, label='PC1')
# plt.barh([i + bar_width/2 for i in x], plot_df['PC2'], height=bar_width, label='PC2')

# plt.gca().invert_xaxis()
# plt.gca().invert_yaxis()
# plt.yticks(x, plot_df['Description'], ha='right')
# plt.xlabel('REVIGO Value')
# plt.title('Shared Representative GO Terms')
# plt.legend()
# plt.tight_layout()
# plt.show()


In [38]:
# ix_shared = sorted(set(revigo_dict['PC1'].index) - set(revigo_dict['PC2'].index))

# plot_df = pd.DataFrame({
#     'Description': revigo_dict['PC1'].loc[ix_shared, 'Name'],
#     'PC1': revigo_dict['PC1'].loc[ix_shared, 'Value'],
#     # 'PC2': revigo_dict['PC2'].loc[ix_shared, 'Value']
# })
# # plot_df['mean'] = plot_df[['PC1', 'PC2']].mean(1)
# plot_df = plot_df.sort_values('PC1')  # sort if you want clearer visual comparison

# # Plot single-series bar plot (terms found in PC1 but not in PC2)
# plt.figure(figsize=(4, 6))
# x = range(len(plot_df))
# bar_width = 0.4

# plt.barh([i - bar_width/2 for i in x], plot_df['PC1'], height=bar_width, label='PC1')
# # plt.barh([i + bar_width/2 for i in x], plot_df['PC2'], height=bar_width, label='PC2')

# plt.gca().invert_xaxis()
# plt.gca().invert_yaxis()
# plt.yticks(x, plot_df['Description'], ha='right')
# plt.xlabel('REVIGO Value')
# plt.title('PC1-only Representative GO Terms')
# plt.legend()
# plt.tight_layout()
# plt.show()

In [39]:
# ix_shared = sorted(set(revigo_dict['PC2'].index) - set(revigo_dict['PC1'].index))

# plot_df = pd.DataFrame({
#     'Description': revigo_dict['PC2'].loc[ix_shared, 'Name'],
#     'PC2': revigo_dict['PC2'].loc[ix_shared, 'Value'],
#     # 'PC2': revigo_dict['PC2'].loc[ix_shared, 'Value']
# })
# # plot_df['mean'] = plot_df[['PC1', 'PC2']].mean(1)
# plot_df = plot_df.sort_values('PC2')  # sort if you want clearer visual comparison

# # Plot single-series bar plot (terms found in PC2 but not in PC1)
# plt.figure(figsize=(4, 6))
# x = range(len(plot_df))
# bar_width = 0.4

# plt.barh([i - bar_width/2 for i in x], plot_df['PC2'], height=bar_width, label='PC2')
# # plt.barh([i + bar_width/2 for i in x], plot_df['PC2'], height=bar_width, label='PC2')

# plt.gca().invert_xaxis()
# plt.gca().invert_yaxis()
# plt.yticks(x, plot_df['Description'], ha='right')
# plt.xlabel('REVIGO Value')
# plt.title('PC2-only Representative GO Terms')
# plt.legend()
# plt.tight_layout()
# plt.show()