In [3]:
# Imports
import numpy as np
import pandas as pd
import muon as mu
import mofax
import gget

In [4]:
# Define the input folders.
# `data_folder` is the root under which the preprocessed `TEA/` dataset and the
# MOFA+ model file are read further down.
# `h_folder` is where the saved Mowgli result (source of `H_rna`) is read from.
# NOTE(review): both are absolute, user-specific paths; adjust them before
# running this notebook on another machine.
data_folder = "/users/csb/huizing/Documents/PhD/Code/mowgli_reproducibility/data/"
h_folder = "/users/csb/huizing/Documents/PhD/Code/Mowgli/local_analysis/from_jz/h/"

In [5]:
# Read the preprocessed dataset from its .h5mu file.
tea_path = data_folder + "TEA/tea_preprocessed.h5mu.gz"
mdata = mu.read_h5mu(tea_path)

In [6]:
# Open the saved MOFA+ model, then extract its weights for the "rna" view.
mofa_path = data_folder + "TEA/tea_mofa_15.hdf5"
mofa_model = mofax.mofa_model(mofa_path)
H_mofa = mofa_model.get_weights("rna")

In [7]:
# Load Mowgli's RNA weights.
mowgli_path = (
    h_folder + "tea_mowgli_cosine_50_0_05_rna_0_01_atac_0_1_adt_0_01_0_001.npy"
)
# The file holds a pickled Python object, hence allow_pickle=True and .item()
# to unwrap it. Unpickling can execute arbitrary code: only load trusted files.
mowgli_result = np.load(mowgli_path, allow_pickle=True).item()
H_mowgli = mowgli_result["H_rna"]

In [12]:
# Number of genes selected per dimension for each enrichment query
# (passed as `n` to top_mowgli / top_mofa / bottom_mofa below).
n_genes = 200

In [8]:
def _ranked_gene_names(weights, dim, n, descending):
    """
    Return the names of the n genes with the most extreme weights in a dimension.

    Shared implementation behind `top_mowgli`, `top_mofa` and `bottom_mofa`.

    Parameters
    ----------
    weights : 2D array, indexed as weights[:, dim]
        Weight matrix with one column per dimension.
        NOTE(review): rows are assumed to follow the order of
        mdata["rna"].var_names -- confirm against how the weights were saved.
    dim : int
        Column of `weights` to rank.
    n : int
        Number of gene names to return.
    descending : bool
        If True, the largest weights come first; otherwise the smallest first.

    Returns
    -------
    list
        Gene names taken from mdata["rna"].var_names, with "rna:" removed.
    """
    # Ascending argsort, optionally reversed: kept exactly as in the original
    # expressions so that the ordering of tied weights is unchanged.
    order = weights[:, dim].argsort()
    if descending:
        order = order[::-1]
    idx = order[:n]
    return mdata["rna"].var_names[idx].str.replace("rna:", "").to_list()


def top_mowgli(dim, n):
    """
    Get the n genes with the largest Mowgli weights (H_mowgli) for a given dimension.
    """
    return _ranked_gene_names(H_mowgli, dim, n, descending=True)


def top_mofa(dim, n):
    """
    Get the n genes with the largest MOFA+ weights (H_mofa) for a given dimension.
    """
    return _ranked_gene_names(H_mofa, dim, n, descending=True)


def bottom_mofa(dim, n):
    """
    Get the n genes with the smallest MOFA+ weights (H_mofa) for a given dimension.
    """
    return _ranked_gene_names(H_mofa, dim, n, descending=False)

In [28]:
# Enrichr gene-set libraries to query.
# NOTE(review): the last entry carries a ".gmt" suffix that the other library
# names do not have -- confirm that Enrichr accepts it under this exact name.
sources = [
    "GO_Biological_Process_2021",
    "GO_Cellular_Component_2021",
    "GO_Molecular_Function_2021",
    "KEGG_2021_Human",
    "PanglaoDB_Augmented_2021",
    "Reactome_2016",
    "Azimuth_Cell_Types_2021.gmt",
]

# One entry per family of queries:
# (value of the "method" column, prefix of the "query" label,
#  gene selector, number of dimensions to iterate over).
# MOFA+ weights are queried from both ends (top and bottom genes).
query_families = [
    ("mowgli", "mowgli", top_mowgli, H_mowgli.shape[1]),
    ("mofa", "top_mofa", top_mofa, H_mofa.shape[1]),
    ("mofa", "bottom_mofa", bottom_mofa, H_mofa.shape[1]),
]

# Collect the per-query results in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies the accumulated table on every
# iteration.
enr_frames = []
for source in sources:
    for method, query_prefix, select_genes, n_dims in query_families:
        for dim in range(n_dims):
            enr = gget.enrichr(genes=select_genes(dim, n_genes), database=source)
            # Annotate each result with the query it came from.
            enr["dim"] = dim
            enr["source"] = source
            enr["method"] = method
            enr["query"] = f"{query_prefix} {dim}"
            enr_frames.append(enr)

# Same row order as before: for each source, all Mowgli dimensions, then the
# top MOFA+ genes, then the bottom MOFA+ genes. Per-query indices are kept.
enr_total = pd.concat(enr_frames) if enr_frames else pd.DataFrame({})


Wed Sep  7 11:30:18 2022 INFO Performing Enichr analysis using database GO_Biological_Process_2021.
Wed Sep  7 11:30:20 2022 INFO Performing Enichr analysis using database GO_Biological_Process_2021.
Wed Sep  7 11:30:22 2022 INFO Performing Enichr analysis using database GO_Biological_Process_2021.
Wed Sep  7 11:30:24 2022 INFO Performing Enichr analysis using database GO_Biological_Process_2021.
Wed Sep  7 11:30:25 2022 INFO Performing Enichr analysis using database GO_Biological_Process_2021.
Wed Sep  7 11:30:27 2022 INFO Performing Enichr analysis using database GO_Biological_Process_2021.
Wed Sep  7 11:30:29 2022 INFO Performing Enichr analysis using database GO_Biological_Process_2021.
Wed Sep  7 11:30:30 2022 INFO Performing Enichr analysis using database GO_Biological_Process_2021.
Wed Sep  7 11:30:32 2022 INFO Performing Enichr analysis using database GO_Biological_Process_2021.
Wed Sep  7 11:30:33 2022 INFO Performing Enichr analysis using database GO_Biological_Process_2021.


In [29]:
# Number of enrichment rows collected across all queries.
len(enr_total)

250619

In [30]:
# Keep only the rows whose adjusted p-value is below 0.05.
is_significant = enr_total["adj_p_val"] < 0.05
enr_total = enr_total.loc[is_significant]

In [31]:
# Number of rows left after the adjusted p-value filter.
len(enr_total)

17442

In [33]:
# Preview the first rows of the enrichment table.
enr_total.head()

Unnamed: 0,rank,path_name,p_val,z_score,combined_score,overlapping_genes,adj_p_val,database,dim,source,method,query
0,1,"positive regulation of transcription, DNA-temp...",5.17886e-10,3.69489,79.001419,"[PRKN, TXK, LEF1, RORA, PIK3R1, GABPB1, FOXO3,...",1e-06,GO_Biological_Process_2021,0,GO_Biological_Process_2021,mowgli,mowgli 0
1,2,positive regulation of transcription by RNA po...,8.219745e-09,3.803162,70.802432,"[PRKN, TXK, LEF1, RORA, PIK3R1, GABPB1, FOXO3,...",9e-06,GO_Biological_Process_2021,0,GO_Biological_Process_2021,mowgli,mowgli 0
2,3,regulation of transcription by RNA polymerase ...,5.228629e-08,2.654248,44.502525,"[RORA, GABPB1, LITAF, BACH2, NR3C2, PPP3CA, TR...",3.7e-05,GO_Biological_Process_2021,0,GO_Biological_Process_2021,mowgli,mowgli 0
3,4,protein autophosphorylation (GO:0046777),7.088338e-08,8.533652,140.482937,"[LYN, FER, CAMK2D, GRK5, TXK, CAMK4, STK39, EI...",3.8e-05,GO_Biological_Process_2021,0,GO_Biological_Process_2021,mowgli,mowgli 0
4,5,negative regulation of apoptotic process (GO:0...,9.620003e-08,4.620072,74.645741,"[PRKN, ITGB1, CD74, PRKCH, ANXA1, TFRC, KDM2B,...",4.1e-05,GO_Biological_Process_2021,0,GO_Biological_Process_2021,mowgli,mowgli 0


In [32]:
# Save the enrichment table as CSV.
# NOTE(review): absolute, user-specific output path.
enrichr_csv = (
    "/users/csb/huizing/Documents/PhD/Code/mowgli_reproducibility/enrich/enrichr.csv"
)
enr_total.to_csv(enrichr_csv)