In [190]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad

import mudata

import os
from tqdm import tqdm

import re

In [166]:
# MuData file that the cNMF results get attached to further down.
h5mu_data_path="/project/GCRB/Hon_lab/s223695/Data_project/jamboree_2025/data/work_99_304b545580e299107b2429dd55a968_inference_mudata.h5mu"

# Threshold tag that follows "dt_" in the consensus file names to pick up.
gene_program_threshold = "2_0"
# The patterns used to reference this name, which was never defined in the
# notebook (NameError on a fresh kernel); keep it as an alias of the setting.
gene_program_cutoff = gene_program_threshold

# File-name patterns; group 1 captures k. Dots are escaped so they only match
# a literal "." and the tag is escaped in case it contains regex characters.
spectra_match_reg = rf"spectra\.k_([0-9]+)\.dt_{re.escape(gene_program_threshold)}\.consensus\.txt"
usage_match_reg = rf"usages\.k_([0-9]+)\.dt_{re.escape(gene_program_threshold)}\.consensus\.txt"


In [181]:
def load_cNMF_path(base_path,run_name):
    """Collect the result file paths of one cNMF run.

    The run folder is ``<base_path>/<run_name>``. It is scanned once per
    pattern, using the module-level ``spectra_match_reg`` and
    ``usage_match_reg`` regular expressions against each entry name.

    Parameters
    ----------
    base_path : str
        Folder that contains the run sub-folder.
    run_name : str
        Name of the run sub-folder; also the prefix of the k-selection
        statistics file name.

    Returns
    -------
    tuple
        ``(spectra_path_list, usage_path_list, selection_stat_path)``. The two
        lists follow ``os.scandir`` order. The statistics path is only
        constructed here, not checked for existence.
    """
    run_dir = os.path.join(base_path, run_name)

    spectra_paths = []
    for entry in os.scandir(run_dir):
        if re.search(spectra_match_reg, entry.name):
            spectra_paths.append(os.path.join(run_dir, entry.name))

    usage_paths = []
    for entry in os.scandir(run_dir):
        if re.search(usage_match_reg, entry.name):
            usage_paths.append(os.path.join(run_dir, entry.name))

    stats_path = os.path.join(run_dir, f"{run_name}.k_selection_stats.df.npz")

    return spectra_paths, usage_paths, stats_path

In [182]:
# Folder that holds one sub-folder per cNMF run.
base_path="/project/GCRB/Hon_lab/s223695/Data_project/jamboree_2025/processing_Hon_benchmark/"

# NOTE: allow_pickle=True lets np.load unpickle arbitrary objects, so these
# .npz files must come from a trusted source.
# The archives are opened with `with` so the file handles are closed as soon
# as the "data" and "columns" arrays have been read.

# Process batch1
batch1_run_name = "Honlab_benchmark_batch1"
spectra_path_list_1, usage_path_list_1, selection_stat_path_1 = load_cNMF_path(base_path,batch1_run_name)

with np.load(selection_stat_path_1, allow_pickle=True) as selection_stat_1:
    selection_stat_df_1 = pd.DataFrame(selection_stat_1["data"],
                                       columns=selection_stat_1["columns"])


# Process batch2
batch2_run_name = "Honlab_benchmark_batch2"
spectra_path_list_2, usage_path_list_2, selection_stat_path_2 = load_cNMF_path(base_path,batch2_run_name)

with np.load(selection_stat_path_2, allow_pickle=True) as selection_stat_2:
    selection_stat_df_2 = pd.DataFrame(selection_stat_2["data"],
                                       columns=selection_stat_2["columns"])

# Integrate two batches
spectra_path_list = spectra_path_list_1 + spectra_path_list_2
usage_path_list = usage_path_list_1 + usage_path_list_2

In [183]:
# Stack the per-batch k-selection tables and order the rows by the "k" column.
# Row labels of the inputs are kept as they are, so labels may repeat.
selection_stat_df = pd.concat(
    [selection_stat_df_1, selection_stat_df_2]
).sort_values(by="k")

In [184]:
# Index every file path by the k value captured from its name (regex group 1).
# If the same k occurs more than once, the path listed last wins.
spectra_path_dict = dict(
    (int(re.findall(spectra_match_reg, path)[0]), path)
    for path in spectra_path_list
)
usage_path_dict = dict(
    (int(re.findall(usage_match_reg, path)[0]), path)
    for path in usage_path_list
)

In [185]:
# Sorted, unique k values that have both a spectra file and a usage file.
spectra_k_values = list(spectra_path_dict.keys())
usage_k_values = list(usage_path_dict.keys())
shared_k_list = np.intersect1d(spectra_k_values, usage_k_values)

### process_mudata

In [173]:
# Load the MuData container (.h5mu) located at h5mu_data_path.
mdata = mudata.read_h5mu(h5mu_data_path)

In [186]:
# Store the k-selection table in the container's unstructured annotations.
mdata.uns["k_selection_stat"] = selection_stat_df

In [215]:
# Attach one modality per k: the usage table becomes the AnnData matrix and
# the spectra table is stored next to it as a loading matrix.
# NOTE(review): MuData usually expects mdata.update() after modalities are
# added — confirm whether it is needed before writing.
for k in tqdm(shared_k_list):
    # Both consensus tables are tab-separated with labels in the first column.
    usage_df = pd.read_csv(usage_path_dict[k], sep="\t", index_col=0)
    spectra_df = pd.read_csv(spectra_path_dict[k], sep="\t", index_col=0)

    # Row labels of the spectra table are stored as strings.
    spectra_df.index = spectra_df.index.astype(str)

    # Usage rows/columns become obs/var of the new AnnData.
    usage_ad = ad.AnnData(usage_df)

    # One spectra row per var; the labels go to .uns because varm only
    # receives the bare array.
    usage_ad.varm["loading"] = spectra_df.values
    usage_ad.uns["loading_index"] = spectra_df.index.tolist()
    usage_ad.uns["loading_columns"] = spectra_df.columns.tolist()

    mdata.mod[f"cNMF_{k}"] = usage_ad

100%|██████████| 30/30 [00:12<00:00,  2.33it/s]


In [219]:
# Display the container summary to inspect the modalities it now holds.
mdata

In [217]:
# Spot-check the k=200 modality: the loading matrix should have one row per
# var of that modality and one column per spectra-table column.
mdata["cNMF_200"].varm["loading"].shape

(200, 5451)

In [218]:
# Create the output folder first; the write fails if the folder is missing.
output_h5mu_path = "./Output/Honlab_benchmark_cNMF.h5mu"
os.makedirs(os.path.dirname(output_h5mu_path), exist_ok=True)
mdata.write(output_h5mu_path)

