In [1]:

#################################################### 1. cell2location: map snRNA celltype to spatial cellbin #######################################################
# 1. import packages
# Imports and process-wide settings for the cell2location mapping cell.
import os
import sys
import scanpy as sc
import anndata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import cell2location
import scvi
import torch
from PIL import Image
# Set Pillow's pixel-count limit to None.
# NOTE(review): presumably to allow loading very large tissue images — confirm.
Image.MAX_IMAGE_PIXELS = None
# Ask PyTorch to release cached GPU memory before the run starts.
torch.cuda.empty_cache()
from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42  # enables correct plotting of text for PDFs
import time
# Print the local start time of the run.
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

# 2. define variables
# The working directory is switched before the star import.
# NOTE(review): presumably so that py_he_function_zy is importable from there;
# the names dest_dir, initial_path, snRNA_dict and the bin_*/plot_* helpers used
# below are expected to come from this star import — confirm.
os.chdir("/data/work/human_embryo")
from py_he_function_zy import *

deconv = "cell2loc"
cut = "spateo_cut"
scRNA_name = "snRNA_final"
# NOTE(review): both values are read from the command line. Inside an interactive
# Jupyter kernel sys.argv typically holds the kernel's own launch arguments, so
# confirm how this cell is invoked.
sample = sys.argv[1]
# The organ is optional: the code below treats "" as "map the whole section",
# so a missing second argument falls back to "" instead of raising IndexError.
organ = sys.argv[2] if len(sys.argv) > 2 else ""

# 3. initialize paths
chip, sample_name, bin50_folder, cut_folder = initial_path(sample, dest_dir, cut)
final_folder = cut_folder + sample + "." + scRNA_name + "." + deconv + "/"
if organ != "":
    # Organ-specific run: the organ name is embedded in the output prefix and model folder.
    final_cellbin_path = final_folder + sample_name + "." + organ + ".cellbin."
    run_name = final_folder + '/cell2location_map' + '.' + organ
else:
    # Whole-section run.
    final_cellbin_path = final_folder + sample_name + "." + "whole" + ".cellbin."
    run_name = final_folder + '/cell2location_map'

# exist_ok=True replaces the check-then-create pattern, which can fail if the
# directory appears between the check and the makedirs call.
os.makedirs(final_folder, exist_ok=True)
os.makedirs(run_name, exist_ok=True)

# 4. prepare spatial cellbin
# Load the cell-bin AnnData for this sample, use the raw counts layer as X,
# and tag every observation with the sample id (used later as the batch key).
adata_cellbin_path = cut_folder + "h5ad/" + sample_name + ".cellbin"
adata_st = sc.read(adata_cellbin_path + ".h5ad")
adata_st.X = adata_st.layers['counts']
adata_st.obs['sample'] = sample

# Restrict the spatial data by its "celltype" annotation.
# NOTE(review): any organ value other than "" or "Brain" leaves adata_st unfiltered.
if organ == "":
    adata_st = bin_remove_domain(adata_st, "celltype", ["not"])
elif organ == "Brain":  # was `else if`, which is a SyntaxError in Python
    adata_st = bin_subset_domain(adata_st, "celltype", ["Brain", "Spinal Cord"])

# 5. prepare scRNA data
# Reference file: "<scRNA_name>.h5ad" when no organ is given, otherwise
# "<scRNA_name>.brain.h5ad" (used for every non-empty organ value).
sc_dir = dest_dir + "snRNA/"
adata_sc_path = sc_dir + scRNA_name + (".h5ad" if organ == "" else ".brain.h5ad")
adata_sc = sc.read(adata_sc_path)
adata_sc.X = adata_sc.layers["counts"]

# Look up the stage registered for this sample and subset the reference on it.
stage = snRNA_dict[sample]
adata_sc = bin_subset_domain(adata_sc, "stage", [stage])

# Annotation column of the reference that provides the cell labels.
anno = "brain_celltype" if organ == "Brain" else "celltype"
    
# 6. prepare matrix
from cell2location.utils.filtering import filter_genes

# Copy the chosen annotation into a fixed column name and report label sizes.
cell_key = "cell_label"
adata_sc.obs[cell_key] = adata_sc.obs[anno]
print(adata_sc.obs[cell_key].value_counts())

# Gene selection: a very permissive filter rather than standard
# highly-variable-gene selection.
gene_filter_cutoffs = {
    "cell_count_cutoff": 5,
    "cell_percentage_cutoff2": 0.03,
    "nonz_mean_cutoff": 1.12,
}
selected = filter_genes(adata_sc, **gene_filter_cutoffs)
adata_sc = adata_sc[:, selected].copy()

# Reference signatures per label, built with the project helper bin_cluster_mean.
# Alternative kept for reference:
# inf_aver = cell2loc_NB_samples(adata_sc, anno, final_folder, "samples")
inf_aver = bin_cluster_mean(adata_sc, cell_key)
print(inf_aver.iloc[:5, :5])

# 7. Cell2location: spatial mapping
# Find shared genes and subset both the spatial AnnData and the reference signatures.
intersect = np.intersect1d(adata_st.var_names, inf_aver.index)
adata_st = adata_st[:, intersect].copy()
inf_aver = inf_aver.loc[intersect, :].copy()

# Prepare the AnnData for the cell2location model, with "sample" as the batch key.
cell2location.models.Cell2location.setup_anndata(adata=adata_st, batch_key="sample")

# Create the model.
mod = cell2location.models.Cell2location(
    adata_st,
    cell_state_df=inf_aver,
    # The expected average cell abundance: tissue-dependent hyper-prior which
    # can be estimated from paired histology.
    N_cells_per_location=1,
    # Hyperparameter controlling normalisation of within-experiment variation
    # in RNA detection (200 was the alternative noted alongside it).
    detection_alpha=20,  # 200
)
mod.view_anndata_setup()

# Training.
# The original wrapped this call in `torch.no_grad()`. Training is gradient-based,
# so it must not run with gradient tracking disabled; the wrapper is removed.
# NOTE(review): max_epochs=25 is far below typical values for this model —
# confirm it is intentional and not a leftover from a smoke test.
mod.train(
    max_epochs=25,
    batch_size=10000,  # mini-batches; None would train on the full data
    # Use all data points in training because cell abundance must be
    # estimated at all locations.
    train_size=1,
    use_gpu=False,
)
torch.cuda.empty_cache()

# Optional diagnostic: plot the ELBO loss history during training.
# mod.plot_history(20)
# plt.legend(labels=['full data training'])
# plt.savefig(final_cellbin_path + ".cell2loc.elbo_loss.png")

# Save the model.
mod.save(f"{run_name}", overwrite=True)

# To resume from a saved run instead of retraining:
# adata_st = sc.read(final_cellbin_path + ".h5ad")
# mod = cell2location.models.Cell2location.load(f"{run_name}", adata_st)

# Export the estimated cell abundance (summary of the posterior distribution).
adata_st = mod.export_posterior(
    adata_st,
    sample_kwargs={'num_samples': 3000, 'batch_size': mod.adata.n_obs, 'use_gpu': False},
)

# Save the AnnData object with results.
# final_cellbin_path already ends with ".", so the file name contains "..h5ad";
# kept as-is because the same path is reused elsewhere in this cell.
adata_st.write(final_cellbin_path + ".h5ad")

# Optional diagnostic: mapping quality.
# mod.plot_QC()
# plt.savefig(final_cellbin_path + ".cell2loc.mod_QC.png")

# 8. export spatial mapping label table and plot
cell_key = "cell_label"

# Derive per-cellbin labels from the deconvolution result with the project helper.
adata_cellbin_cell2loc, cells, score_list = deconv_assign_cell_label(adata_st, deconv)

# Mirror the assigned label (as plain strings) into the reference annotation column.
assigned_labels = adata_cellbin_cell2loc.obs[cell_key].astype(str)
adata_cellbin_cell2loc.obs[anno] = assigned_labels

# Write the label table and the spatial plot under the shared output prefix.
save_bin_clusters_tab(adata_cellbin_cell2loc, final_cellbin_path, [cell_key], cell_key)
plot_scanpy_spatial_cluster(adata_cellbin_cell2loc, final_cellbin_path, [cell_key], cell_key, sample)

# NOTE(review): this writes adata_st (not adata_cellbin_cell2loc) to the same
# file that was written right after export_posterior. Whether the labels end up
# in the file depends on deconv_assign_cell_label modifying adata_st in place — confirm.
adata_st.write(f"{final_cellbin_path}.h5ad")



  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 0


2024-08-06 12:46:04


In [None]:

#################################################### 2. integrate organ bin50 from multiple sections and re-cluster #######################################################
# 1. import packages
# Imports for the organ-level bin50 integration and re-clustering cell.
import warnings
import sys
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings from the
# analysis libraries — confirm the blanket filter is intended.
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import os
import time
import pandas as pd
import numpy as np
import scanpy as sc

# 2. define variables
# NOTE(review): dest_dir and the bin_*/scvi_*/plot_* helpers used later in this
# cell are expected to come from this star import — confirm. This cell imports
# py_he_function, whereas the cell2location cell imports py_he_function_zy.
os.chdir("/data/work/human_embryo")
from py_he_function import *
cut = "spateo_cut"
bin_type = "bin50"

# Organ names to process, read from the 'clusters' column of the reference table.
organ_table = pd.read_csv(dest_dir + "ref/all_sample.celltype.ordered_clusters_colors.tab", sep="\t", header=0)
organ_list = organ_table['clusters'].values
plot_cluster_list = ["organ_anno", "sample", "embryo"]
domain_name = "organ_anno"

# Output folder and file prefix for the merged per-organ objects.
final_bin_folder = dest_dir + "bin50/merge_organs/"
# exist_ok=True replaces the check-then-create pattern, which can fail if the
# directory appears between the check and the makedirs call.
os.makedirs(final_bin_folder, exist_ok=True)
final_bin_path = final_bin_folder + "all_26_samples.bin50"

# Organ -> samples table; its 'samples' column holds a ", "-separated string per organ.
organ_sample_table = pd.read_csv(dest_dir + "organ_samples.tab", sep="\t", header=0)
organs = organ_sample_table['organ'].values
# Paths and settings that are identical for every organ.
# NOTE(review): the merged objects go under "bin50/merge_organs/", while ranks
# and plots go under "bin50/merge_organs_final/" — confirm this split is intended.
cluster_folder = final_bin_folder + "scvi_cluster/"
rank_folder = dest_dir + "bin50/merge_organs_final/scvi_cluster_rank/"
plot_folder = dest_dir + "bin50/merge_organs_final/scvi_cluster_plot/"
color_tab_path = dest_dir + "ref/all_samples.louvain_clusters.palettes.tab"
select_cluster = 'louvain_1.0_clusters'

for organ in organ_list:
    organ_name = organ.replace("_", " ")

    # Look up the ", "-separated sample string for this organ.
    # The original indexed [0] unconditionally (IndexError when the organ has no
    # row) and then tested len(sample_list) > 0, which is always true after
    # str.split. Organs without a usable sample list are now skipped explicitly.
    samples = organ_sample_table[organ_sample_table["organ"] == organ]['samples'].values
    sample_entries = samples.tolist()
    if len(sample_entries) > 0 and isinstance(sample_entries[0], str):
        sample_list = [name for name in sample_entries[0].split(", ") if name != ""]
    else:
        sample_list = []
    if len(sample_list) == 0:
        print(organ + " has no samples listed, skipping!")
        continue

    domain_key_list = [organ]
    domain_file_name = organ_name
    organ_bin_path = final_bin_path + "." + organ_name + ".before_correct"

    # merge organ bin50
    print(organ + " merging!")
    adata_bin_organ_before = bin_merge_by_domain_cluster_denovo(sample_list, organ_bin_path, domain_name, domain_key_list, domain_file_name, plot_cluster_list, dest_dir, cut)
    # Drop the pairwise-observation matrices of the merged object before correction.
    del adata_bin_organ_before.obsp

    # scvi_correct
    print(organ + " scvi correction!")
    final_save_path = final_bin_path + "." + organ_name + ".scvi_correct"
    adata_bin_organ = scvi_bin_merge_tutorial(adata_bin_organ_before, "sample", final_save_path, plot_cluster_list)

    # scvi_correct cluster
    print(organ + " clustering!")
    os.makedirs(cluster_folder, exist_ok=True)
    organ_cluster_path = cluster_folder + "all_26_samples.bin50." + organ_name + ".scvi_correct"
    # Resolutions 1.0, 1.2 and 1.4; cluster_list names the matching columns.
    res_array = np.arange(1.0, 1.42, 0.2).round(1)
    adata_bin_organ = bin_louvain_cluster(adata_bin_organ, organ_cluster_path, "all", res_array)
    cluster_list = ['louvain_1.0_clusters', 'louvain_1.2_clusters', 'louvain_1.4_clusters']
    save_bin_clusters_tab(adata_bin_organ, organ_cluster_path, cluster_list, "louvain_clusters")

    # rank
    os.makedirs(rank_folder, exist_ok=True)
    organ_rank_path = rank_folder + "all_26_samples.bin50." + organ_name + ".scvi_correct" + "." + select_cluster
    adata_bin_organ_rank = bin_rank_genes(adata_bin_organ, organ_rank_path, select_cluster)
    bin_rank_genes_plot(adata_bin_organ_rank, organ_rank_path, "all", select_cluster)

    # map cluster to each section and spatial plot
    # Iterate over the samples actually present in the corrected object. A
    # separate name is used so the input sample_list is not overwritten, and the
    # former always-true `if sample in sample_list` check is dropped.
    merged_samples = np.unique(adata_bin_organ.obs['sample'].values)
    for sample in merged_samples:
        os.makedirs(plot_folder, exist_ok=True)
        plot_sample_path = plot_folder + "all_26_samples.bin50." + organ_name + ".scvi_correct" + "." + sample
        adata_bin50 = bin_subset_domain(adata_bin_organ, "sample", [sample])
        if len(adata_bin50.obs_names) > 0:
            plot_scanpy_spatial_cluster_palette(adata_bin50, plot_sample_path, cluster_list, "louvain_clusters", sample, color_tab_path)
            plot_scanpy_umap_cluster_palette(adata_bin50, plot_sample_path, cluster_list, "louvain_clusters", sample, color_tab_path)
            adata_bin50.write(plot_sample_path + ".louvain_clusters.h5ad")
