In [9]:
# Imports
import numpy as np
import pandas as pd
import muon as mu
import mofax
import gget

In [10]:
# Input locations (absolute, machine-specific paths: edit these two lines to
# run the notebook on another machine).
# data_folder: root of the datasets; the preprocessed TEA data and the MOFA+
#              model are read from its "TEA/" subfolder.
# h_folder:    folder holding the saved Mowgli result files (.npy) from which
#              the RNA weight matrix is loaded.
data_folder = "/users/csb/huizing/Documents/PhD/Code/mowgli_reproducibility/data/"
h_folder = "/users/csb/huizing/Documents/PhD/Code/Mowgli/local_analysis/from_jz/h/"

In [11]:
# Load the preprocessed TEA dataset. Only its "rna" modality is used in this
# notebook: mdata["rna"].var_names supplies the gene names for each weight row.
mdata = mu.read_h5mu(data_folder + "TEA/tea_preprocessed.h5mu.gz")

In [12]:
# Load MOFA+'s weights for the "rna" view.
# H_mofa is indexed below as H_mofa[:, dim], i.e. as a 2-D array with one
# column per factor.
# NOTE(review): rows are assumed to follow the order of
# mdata["rna"].var_names -- confirm the model was trained on the same features.
mofa_model = mofax.mofa_model(data_folder + "TEA/tea_mofa_15.hdf5")
H_mofa = mofa_model.get_weights("rna")

In [13]:
# Load Mowgli's weights.
# The .npy file stores a single pickled mapping (hence .item()); only its
# "H_rna" entry is kept.
# SECURITY: allow_pickle=True unpickles the file, which can execute arbitrary
# code -- only load result files that come from a trusted source.
# NOTE(review): rows of H_rna are assumed to follow the order of
# mdata["rna"].var_names -- confirm.
H_mowgli = np.load(
    h_folder + "tea_mowgli_cosine_50_0_05_rna_0_01_atac_0_1_adt_0_01_0_001.npy",
    allow_pickle=True,
).item()["H_rna"]

In [14]:
# Number of top- (or bottom-) ranked genes per dimension sent to each
# enrichment query.
n_genes = 200

In [15]:
def _ranked_genes(weights, dim, n, descending=True):
    """
    Rank genes by their weight in one latent dimension and return n names.

    Shared implementation behind top_mowgli, top_mofa and bottom_mofa.

    Parameters
    ----------
    weights : 2-D array, indexed as ``weights[:, dim]``. Rows are assumed to
        be in the same order as ``mdata["rna"].var_names`` (TODO confirm).
    dim : index of the column (latent dimension / factor) to rank on.
    n : number of gene names to return.
    descending : if True, genes with the largest weights come first;
        otherwise genes with the smallest weights come first.

    Returns
    -------
    list of str
        Gene names taken from ``mdata["rna"].var_names`` with every
        occurrence of the "rna:" substring removed.
    """
    order = weights[:, dim].argsort()
    if descending:
        # Reverse the ascending order (instead of sorting negated weights) so
        # that ties come out exactly as in a reversed argsort.
        order = order[::-1]
    names = mdata["rna"].var_names[order[:n]]
    # "rna:" has no regex metacharacters; regex=False only makes the literal
    # replacement independent of the pandas version's default.
    return names.str.replace("rna:", "", regex=False).to_list()


def top_mowgli(dim, n):
    """
    Get the n genes with the largest Mowgli weights in dimension `dim`.

    Returns a list of gene names ordered by decreasing weight.
    """
    return _ranked_genes(H_mowgli, dim, n, descending=True)


def top_mofa(dim, n):
    """
    Get the n genes with the largest MOFA+ weights in factor `dim`.

    Returns a list of gene names ordered by decreasing weight.
    """
    return _ranked_genes(H_mofa, dim, n, descending=True)


def bottom_mofa(dim, n):
    """
    Get the n genes with the smallest MOFA+ weights in factor `dim`.

    Returns a list of gene names ordered by increasing weight.
    """
    return _ranked_genes(H_mofa, dim, n, descending=False)

In [16]:
# Enrichr gene-set libraries to query. Uncomment entries to widen the analysis.
sources = [
    # "GO_Biological_Process_2021",
    # "GO_Cellular_Component_2021",
    # "GO_Molecular_Function_2021",
    # "KEGG_2021_Human",
    # "PanglaoDB_Augmented_2021",
    # "Reactome_2016",
    "Azimuth_Cell_Types_2021",
]

# One entry per family of queries:
# (method label, query-name prefix, gene selector, number of dimensions).
# MOFA+ is queried twice per factor: once with its top-ranked genes and once
# with its bottom-ranked genes.
queries = [
    ("mowgli", "mowgli", top_mowgli, H_mowgli.shape[1]),
    ("mofa", "top_mofa", top_mofa, H_mofa.shape[1]),
    ("mofa", "bottom_mofa", bottom_mofa, H_mofa.shape[1]),
]

# Collect the per-query tables in a list and concatenate once at the end:
# concatenating inside the loop re-copies all accumulated rows on every
# iteration.
enr_frames = []
for source in sources:
    for method, prefix, select_genes, n_dims in queries:
        for dim in range(n_dims):
            enr = gget.enrichr(genes=select_genes(dim, n_genes), database=source)
            # Tag every row with the query it came from.
            enr = enr.assign(
                dim=dim,
                source=source,
                method=method,
                query=f"{prefix} {dim}",
            )
            enr_frames.append(enr)

# Each per-query index is kept as-is (no ignore_index), so index values repeat
# across queries. Fall back to an empty frame when nothing was queried.
enr_total = pd.concat(enr_frames) if enr_frames else pd.DataFrame({})


Thu Sep  8 23:23:47 2022 INFO Performing Enichr analysis using database Azimuth_Cell_Types_2021.
Thu Sep  8 23:23:54 2022 INFO Performing Enichr analysis using database Azimuth_Cell_Types_2021.
Thu Sep  8 23:23:55 2022 INFO Performing Enichr analysis using database Azimuth_Cell_Types_2021.
Thu Sep  8 23:23:56 2022 INFO Performing Enichr analysis using database Azimuth_Cell_Types_2021.
Thu Sep  8 23:23:57 2022 INFO Performing Enichr analysis using database Azimuth_Cell_Types_2021.
Thu Sep  8 23:23:58 2022 INFO Performing Enichr analysis using database Azimuth_Cell_Types_2021.
Thu Sep  8 23:23:59 2022 INFO Performing Enichr analysis using database Azimuth_Cell_Types_2021.
Thu Sep  8 23:24:00 2022 INFO Performing Enichr analysis using database Azimuth_Cell_Types_2021.
Thu Sep  8 23:24:01 2022 INFO Performing Enichr analysis using database Azimuth_Cell_Types_2021.
Thu Sep  8 23:24:03 2022 INFO Performing Enichr analysis using database Azimuth_Cell_Types_2021.
Thu Sep  8 23:24:04 2022 INFO 

In [17]:
# Number of enrichment rows collected over all queries, before any filtering.
len(enr_total)

6864

In [18]:
# Keep only the rows whose adjusted p-value ("adj_p_val") is below 0.05.
enr_total = enr_total.loc[enr_total["adj_p_val"].lt(.05)]

In [19]:
# Number of enrichment rows remaining after the adjusted p-value filter.
len(enr_total)

1659

In [20]:
# Preview the first rows of the filtered enrichment table.
enr_total.head()

Unnamed: 0,rank,path_name,p_val,z_score,combined_score,overlapping_genes,adj_p_val,database,dim,source,method,query
0,1,B Cell CL0000785,2e-06,67.326531,885.393278,"[CD74, BANK1, MS4A1, RALGPS2]",8.2e-05,Azimuth_Cell_Types_2021,0,Azimuth_Cell_Types_2021,mowgli,mowgli 0
1,2,CD4 T CL0000624,2e-06,67.326531,885.393278,"[BCL11B, CAMK4, LEF1, MALAT1]",8.2e-05,Azimuth_Cell_Types_2021,0,Azimuth_Cell_Types_2021,mowgli,mowgli 0
2,3,"Memory B Cell, Kappa Light Chain CL0000787",7.9e-05,50.238579,474.457917,"[BANK1, MS4A1, RALGPS2]",0.001571,Azimuth_Cell_Types_2021,0,Azimuth_Cell_Types_2021,mowgli,mowgli 0
3,4,Intermediate B Cell CL0000785,0.000112,43.059463,391.617756,"[BANK1, MS4A1, RALGPS2]",0.001571,Azimuth_Cell_Types_2021,0,Azimuth_Cell_Types_2021,mowgli,mowgli 0
4,5,B CL0000236,0.000112,43.059463,391.617756,"[ADAM28, BANK1, MS4A1]",0.001571,Azimuth_Cell_Types_2021,0,Azimuth_Cell_Types_2021,mowgli,mowgli 0


In [21]:
# Save the filtered enrichment table as CSV (absolute, machine-specific path:
# adjust it to run the notebook on another machine).
enr_total.to_csv("/users/csb/huizing/Documents/PhD/Code/mowgli_reproducibility/enrich/enrichr.csv")