# pycisTopic analysis

Full dataset, using consensus peak regions.

In [1]:
import pycisTopic

%load_ext nb_black
import warnings

warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")
import pickle
import pandas as pd
import os

%load_ext lab_black

<IPython.core.display.Javascript object>

In [2]:
# Working directory for this analysis (hard-coded absolute cluster path).
# After the chdir, every relative path used further down ("cistopic_objects/...",
# "downstream_analysis/...", "plots_qc/...") is resolved against this directory.
wdir = "/lustre1/project/stg_00090/scatac_benchmark/fixedcells_6_merged_equalcells"
os.chdir(wdir)

<IPython.core.display.Javascript object>

In [3]:
# Create the directory that receives all downstream results.
# `exist_ok=True` makes the cell safe to re-run and removes the gap between
# the existence check and the creation that the check-then-create form had.
f_final_dir = os.path.join(wdir, "downstream_analysis")
os.makedirs(f_final_dir, exist_ok=True)

<IPython.core.display.Javascript object>

In [4]:
import glob

<IPython.core.display.Javascript object>

## Save/load cisTopic objects

In [5]:
# Locate the pickled cisTopic objects and key each path by its sample name,
# i.e. the part of the file name that precedes the first "__".
cto_consensus_paths = sorted(glob.glob("cistopic_objects/*consensus_harmony.pkl"))
cistopic_obj_path_dict = dict(
    (pkl_path.split("/")[-1].split("__")[0], pkl_path)
    for pkl_path in cto_consensus_paths
)
cistopic_obj_path_dict

{'master_sub_1.FIXEDCELLS': 'cistopic_objects/master_sub_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_20topics.dimreduc.consensus_harmony.pkl'}

<IPython.core.display.Javascript object>

## Topic binarization & QC

In [6]:
from pycisTopic.topic_binarization import binarize_topics
from pycisTopic.topic_qc import compute_topic_metrics, plot_topic_qc
from pycisTopic.utils import fig2img
import matplotlib.pyplot as plt
from pycisTopic.topic_qc import topic_annotation
from pycisTopic.diff_features import (
    impute_accessibility,
    normalize_scores,
    find_highly_variable_features,
    find_diff_features,
)
from pycisTopic.clust_vis import plot_imputed_features
import numpy as np
import copy

<IPython.core.display.Javascript object>

### Binarize the topic-region distributions

In [9]:
# Per-sample downstream pipeline. For every cisTopic object found above:
#   1. binarize topic-region and cell-topic distributions,
#   2. compute and plot topic QC metrics,
#   3. impute and normalize accessibility,
#   4. select highly variable regions (HVRs),
#   5. call differentially accessible regions (DARs) per consensus cell type
#      and export them as pickle + BED files.
# A sample is skipped when its DAR output directory already exists, so that
# directory acts as the "already processed" marker.

# Make sure every directory written to below exists; otherwise the first
# open(..., "wb") fails on a fresh working directory.
for out_dir in (
    "plots_qc",
    "downstream_analysis/binarized_topics",
    "downstream_analysis/imputed_acc_objs",
    "downstream_analysis/HVRs",
):
    os.makedirs(out_dir, exist_ok=True)

for sample, path in cistopic_obj_path_dict.items():
    dar_dir = f"downstream_analysis/DARs/{sample}"
    if os.path.exists(dar_dir):
        print(f"{dar_dir} already exists!")
        continue
    if not os.path.isfile(path):
        print(f"{path} does not exist!")
        continue

    print(f"Loading {path}")
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, "rb") as f:
        cto = pickle.load(f)

    # Use the cell_data index as cell names. When the model's cell-topic
    # columns match those names pairwise on the barcode part (text before "-"
    # vs. text before "___"), relabel the columns with the full cell names.
    cto.cell_names = cto.cell_data.index
    model_barcodes = [x.split("-")[0] for x in cto.selected_model.cell_topic.columns]
    cell_barcodes = [x.split("___")[0] for x in list(cto.cell_names)]
    if model_barcodes == cell_barcodes:
        cto.selected_model.cell_topic.columns = list(cto.cell_names)

    # --- Topic binarization -------------------------------------------------
    region_bin_topics = binarize_topics(
        cto,
        method="otsu",
        ntop=3000,
        plot=True,
        num_columns=6,
        save=f"plots_qc/{sample}__topic_region_binarized.png",
    )
    with open(
        f"downstream_analysis/binarized_topics/{sample}__topic_region_binarized.pkl",
        "wb",
    ) as f:
        pickle.dump(region_bin_topics, f, protocol=4)

    binarized_cell_topics = binarize_topics(
        cto,
        target="cell",
        method="li",
        plot=True,
        num_columns=5,
        nbins=100,
        save=f"plots_qc/{sample}__cells_topic_binarized.png",
    )
    with open(
        f"downstream_analysis/binarized_topics/{sample}__cells_topic_binarized.pkl",
        "wb",
    ) as f:
        # Fixed: this file previously received `region_bin_topics` again, so
        # the cell-topic binarization was never written to disk.
        pickle.dump(binarized_cell_topics, f, protocol=4)

    # --- Topic QC -----------------------------------------------------------
    topic_qc_metrics = compute_topic_metrics(cto)
    with open(
        f"downstream_analysis/binarized_topics/{sample}__topic_qc.pkl", "wb"
    ) as f:
        pickle.dump(topic_qc_metrics, f, protocol=4)

    fig_dict = {
        "CoherenceVSAssignments": plot_topic_qc(
            topic_qc_metrics,
            var_x="Coherence",
            var_y="Log10_Assignments",
            var_color="Gini_index",
            plot=False,
            return_fig=True,
        ),
        "AssignmentsVSCells_in_bin": plot_topic_qc(
            topic_qc_metrics,
            var_x="Log10_Assignments",
            var_y="Cells_in_binarized_topic",
            var_color="Gini_index",
            plot=False,
            return_fig=True,
        ),
        "CoherenceVSCells_in_bin": plot_topic_qc(
            topic_qc_metrics,
            var_x="Coherence",
            var_y="Cells_in_binarized_topic",
            var_color="Gini_index",
            plot=False,
            return_fig=True,
        ),
        "CoherenceVSRegions_in_bin": plot_topic_qc(
            topic_qc_metrics,
            var_x="Coherence",
            var_y="Regions_in_binarized_topic",
            var_color="Gini_index",
            plot=False,
            return_fig=True,
        ),
        "CoherenceVSMarginal_dist": plot_topic_qc(
            topic_qc_metrics,
            var_x="Coherence",
            var_y="Marginal_topic_dist",
            var_color="Gini_index",
            plot=False,
            return_fig=True,
        ),
        "CoherenceVSGini_index": plot_topic_qc(
            topic_qc_metrics,
            var_x="Coherence",
            var_y="Gini_index",
            var_color="Gini_index",
            plot=False,
            return_fig=True,
        ),
    }

    # Tile the six QC figures into one 2x3 panel by rendering each to an image.
    fig = plt.figure(figsize=(40, 43))
    for panel_idx, qc_fig in enumerate(fig_dict.values(), start=1):
        plt.subplot(2, 3, panel_idx)
        img = fig2img(qc_fig)
        plt.imshow(img)
        plt.axis("off")
    plt.subplots_adjust(wspace=0, hspace=-0.70)
    # Fixed: save BEFORE show. With the inline backend plt.show() closes the
    # figure, so a later plt.savefig() writes an empty canvas.
    fig.savefig(
        f"plots_qc/{sample}__topic_qc_metrics.png", facecolor="white", dpi=150
    )
    plt.show()

    topic_annot = topic_annotation(
        cto,
        annot_var="consensus_cell_type",
        binarized_cell_topic=binarized_cell_topics,
        general_topic_thr=0.2,
    )
    # NOTE(review): this annotated table is not written anywhere; the
    # __topic_qc.pkl saved above holds the metrics without annotation.
    topic_qc_metrics = pd.concat(
        [
            topic_annot[
                [
                    "consensus_cell_type",
                    "Ratio_cells_in_topic",
                    "Ratio_group_in_population",
                ]
            ],
            topic_qc_metrics,
        ],
        axis=1,
    )

    # --- Imputation and normalization ---------------------------------------
    imputed_acc_obj = impute_accessibility(
        cto, selected_cells=None, selected_regions=None, scale_factor=10**6
    )
    normalized_imputed_acc_obj = normalize_scores(
        imputed_acc_obj, scale_factor=10**4
    )
    with open(
        f"downstream_analysis/imputed_acc_objs/{sample}__imputed_acc_obs.pkl",
        "wb",
    ) as f:
        pickle.dump(normalized_imputed_acc_obj, f, protocol=4)

    # --- Highly variable regions --------------------------------------------
    variable_regions = find_highly_variable_features(
        normalized_imputed_acc_obj,
        min_disp=0.05,
        min_mean=0.0125,
        max_mean=3,
        max_disp=np.inf,
        n_bins=20,
        n_top_features=None,
        plot=True,
        save=f"plots_qc/{sample}__HVR.png",
    )
    print(f"Found {len(variable_regions)} variable regions")

    # BED export: "chr:start-end" -> tab-separated columns.
    with open(f"downstream_analysis/HVRs/{sample}__HVRs.bed", "w") as f:
        for region in variable_regions:
            bed_line = region.replace(":", "\t").replace("-", "\t")
            f.write(f"{bed_line}\n")
    with open(f"downstream_analysis/HVRs/{sample}__HVRs.pkl", "wb") as f:
        pickle.dump(variable_regions, f, protocol=4)

    # --- DARs per consensus cell type ---------------------------------------
    markers_dict = find_diff_features(
        cto,
        normalized_imputed_acc_obj,
        variable="consensus_cell_type",
        var_features=variable_regions,
        contrasts=None,
        adjpval_thr=0.05,
        log2fc_thr=np.log2(1.5),
        n_cpu=16,
    )

    # Create the marker directory only once the DAR computation has succeeded,
    # so a failed run is retried instead of being skipped next time.
    # makedirs also creates the "DARs" parent if it is missing.
    os.makedirs(dar_dir, exist_ok=True)
    with open(f"{dar_dir}/{sample}__DARs_dict.pkl", "wb") as f:
        pickle.dump(markers_dict, f, protocol=4)

    for cell_type, dars in markers_dict.items():
        if len(dars) == 0:
            print(f"no DARs found for {cell_type} in {sample}")
            continue

        label = cell_type.replace(" ", "_")
        # Columns: chrom, start, end, name, Log2FC, strand ("."), adjusted p-value.
        bed = pd.DataFrame(dars.index.tolist())
        bed[[0, 1]] = bed[0].str.split(":", expand=True)
        bed[[1, 2]] = bed[1].str.split("-", expand=True)
        bed[3] = label
        bed[4] = dars["Log2FC"].reset_index(drop=True)
        bed[5] = "."
        bed[6] = dars["Adjusted_pval"].reset_index(drop=True)

        bed.to_csv(
            f"{dar_dir}/{sample}__{label}__DARs.bed",
            sep="\t",
            header=False,
            index=False,
        )
        # First 2000 rows in the order returned by find_diff_features.
        bed.iloc[:2000].to_csv(
            f"{dar_dir}/{sample}__{label}__DARs.TOP2k.bed",
            sep="\t",
            header=False,
            index=False,
        )

downstream_analysis/DARs/master_sub_1.FIXEDCELLS already exists!


<IPython.core.display.Javascript object>

## Calculate DARs between male and female

In [12]:
# Map each sample name (file-name part before the first "__") to the path of
# its pickled, normalized imputed-accessibility object.
normalized_imputed_acc_obj_path_dict = dict(
    (pkl_path.split("/")[-1].split("__")[0], pkl_path)
    for pkl_path in sorted(
        glob.glob("downstream_analysis/imputed_acc_objs/*imputed_acc_obs.pkl")
    )
)
normalized_imputed_acc_obj_path_dict

{'master_sub_1.FIXEDCELLS': 'downstream_analysis/imputed_acc_objs/master_sub_1.FIXEDCELLS__imputed_acc_obs.pkl'}

<IPython.core.display.Javascript object>

In [13]:
# Map each sample name (file-name part before the first "__") to the BED file
# holding its highly variable regions.
hvr_path_dict = dict(
    (bed_path.split("/")[-1].split("__")[0], bed_path)
    for bed_path in sorted(glob.glob("downstream_analysis/HVRs/*bed"))
)
hvr_path_dict

{'master_sub_1.FIXEDCELLS': 'downstream_analysis/HVRs/master_sub_1.FIXEDCELLS__HVRs.bed'}

<IPython.core.display.Javascript object>

In [14]:
# Display the sample -> cisTopic object path mapping that the loops below iterate over.
cistopic_obj_path_dict

{'master_sub_1.FIXEDCELLS': 'cistopic_objects/master_sub_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_20topics.dimreduc.consensus_harmony.pkl'}

<IPython.core.display.Javascript object>

In [10]:
# DARs between the two `fmx_sample` groups within each consensus cell type.
# The section heading describes the groups as male and female; the code itself
# only relies on the labels "sampleA" and "sampleB".
# Results go to male_female_DARs/<sample>/; a sample is skipped when that
# directory already exists.
for sample, cto_path in cistopic_obj_path_dict.items():
    out_dir = f"male_female_DARs/{sample}"
    if os.path.exists(out_dir):
        print(f"{out_dir} already exists!")
        continue

    # Verify all three inputs before loading anything. Previously a missing
    # file fell through to whatever `cto`, accessibility object or HVR list
    # was left over from an earlier iteration or cell (or raised NameError).
    acc_path = normalized_imputed_acc_obj_path_dict.get(sample)
    hvr_pkl_path = f"downstream_analysis/HVRs/{sample}__HVRs.pkl"
    if not os.path.isfile(cto_path):
        print(f"{cto_path} does not exist!")
        continue
    if acc_path is None or not os.path.isfile(acc_path):
        print(f"No imputed accessibility object found for {sample}!")
        continue
    if not os.path.exists(hvr_pkl_path):
        print("HVRs do not exist!")
        continue

    print(f"Loading {cto_path}")
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(cto_path, "rb") as f:
        cto = pickle.load(f)

    # Use the cell_data index as cell names. When the model's cell-topic
    # columns match those names pairwise on the barcode part (text before "-"
    # vs. text before "___"), relabel the columns with the full cell names.
    cto.cell_names = cto.cell_data.index
    model_barcodes = [x.split("-")[0] for x in cto.selected_model.cell_topic.columns]
    cell_barcodes = [x.split("___")[0] for x in list(cto.cell_names)]
    if model_barcodes == cell_barcodes:
        cto.selected_model.cell_topic.columns = list(cto.cell_names)

    if "fmx_sample" not in cto.cell_data.columns:
        print(f"No fmx_sample column for {sample}, skipping")
        continue

    print(f"Loading {acc_path}")
    with open(acc_path, "rb") as f:
        normalized_imputed_acc_obj = pickle.load(f)

    print(f"Loading {hvr_pkl_path}")
    with open(hvr_pkl_path, "rb") as f:
        variable_regions = pickle.load(f)

    # Grouping variable "<fmx_sample>_<cell type>" used for the contrasts.
    cto.cell_data["fmx_sample_consensus_cell_type"] = (
        cto.cell_data["fmx_sample"] + "_" + cto.cell_data["consensus_cell_type"]
    )

    # Two contrasts per cell type: A vs B and B vs A.
    contrasts = []
    for cell_type in cto.cell_data["consensus_cell_type"].unique():
        print(cell_type)
        ct_a = "sampleA_" + cell_type
        ct_b = "sampleB_" + cell_type
        contrasts += [[[ct_a], [ct_b]], [[ct_b], [ct_a]]]

    markers_dict = find_diff_features(
        cto,
        normalized_imputed_acc_obj,
        variable="fmx_sample_consensus_cell_type",
        var_features=variable_regions,
        contrasts=contrasts,
        adjpval_thr=0.05,
        log2fc_thr=np.log2(1.2),
        n_cpu=10,
    )

    # makedirs also creates the "male_female_DARs" parent, which os.mkdir
    # required to exist already.
    os.makedirs(out_dir, exist_ok=True)
    with open(f"{out_dir}/{sample}__DARs_dict_1-2xfoldchange.pkl", "wb") as f:
        pickle.dump(markers_dict, f, protocol=4)

    for contrast, dars in markers_dict.items():
        if len(dars) == 0:
            print(f"no DARs found for {contrast} in {sample}")
            continue

        label = contrast.replace(" ", "_")
        # Columns: chrom, start, end, name, Log2FC, strand ("."), adjusted p-value.
        bed = pd.DataFrame(dars.index.tolist())
        bed[[0, 1]] = bed[0].str.split(":", expand=True)
        bed[[1, 2]] = bed[1].str.split("-", expand=True)
        bed[3] = label
        bed[4] = dars["Log2FC"].reset_index(drop=True)
        bed[5] = "."
        bed[6] = dars["Adjusted_pval"].reset_index(drop=True)

        bed.to_csv(
            f"{out_dir}/{sample}__{label}__1-2xfoldchange_DARs.bed",
            sep="\t",
            header=False,
            index=False,
        )

Loading cistopic_objects/master_sub_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_20topics.dimreduc.consensus.pkl
Loading downstream_analysis/imputed_acc_objs/master_sub_1.FIXEDCELLS.singlets__imputed_acc_obs.pkl
Loading downstream_analysis/HVRs/master_sub_1.FIXEDCELLS.singlets__HVRs.pkl
Cytotoxic T cell
B cell
CD4+ T cell
Natural killer cell
Dendritic cell
CD16+ monocyte
CD14+ monocyte


2023-01-01 21:35:52,723	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(markers_ray pid=3310943)[0m 2023-01-01 21:35:59,833 cisTopic     INFO     Formatting data for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=3310945)[0m 2023-01-01 21:36:03,360 cisTopic     INFO     Formatting data for sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell
[2m[36m(markers_ray pid=3310949)[0m 2023-01-01 21:36:06,774 cisTopic     INFO     Formatting data for sampleA_B cell_VS_sampleB_B cell
[2m[36m(markers_ray pid=3310943)[0m 2023-01-01 21:36:07,229 cisTopic     INFO     Computing p-value for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=3310942)[0m 2023-01-01 21:36:10,260 cisTopic     INFO     Formatting data for sampleB_B cell_VS_sampleA_B cell
[2m[36m(markers_ray pid=3310945)[0m 2023-01-01 21:36:11,208 cisTopic     INFO     Computing p-value for sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell
[2m[36m(markers_ray pid=3310948)[0m 2023-01-01 21:36:13,548 cisTopic     INFO     For

[2m[36m(raylet)[0m Spilled 15930 MiB, 1 objects, write throughput 785 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.


[2m[36m(markers_ray pid=3310945)[0m 2023-01-01 21:36:47,349 cisTopic     INFO     Computing log2FC for sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell
[2m[36m(markers_ray pid=3310944)[0m 2023-01-01 21:36:48,066 cisTopic     INFO     Computing log2FC for sampleA_Natural killer cell_VS_sampleB_Natural killer cell


[2m[36m(raylet)[0m Spilled 31861 MiB, 2 objects, write throughput 1409 MiB/s.


[2m[36m(markers_ray pid=3310945)[0m 2023-01-01 21:36:48,926 cisTopic     INFO     sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell done!
[2m[36m(markers_ray pid=3310944)[0m 2023-01-01 21:36:49,640 cisTopic     INFO     sampleA_Natural killer cell_VS_sampleB_Natural killer cell done!
[2m[36m(markers_ray pid=3310943)[0m 2023-01-01 21:36:49,744 cisTopic     INFO     Formatting data for sampleA_CD16+ monocyte_VS_sampleB_CD16+ monocyte
[2m[36m(markers_ray pid=3310945)[0m 2023-01-01 21:36:49,725 cisTopic     INFO     Formatting data for sampleB_Dendritic cell_VS_sampleA_Dendritic cell
[2m[36m(markers_ray pid=3310944)[0m 2023-01-01 21:36:49,685 cisTopic     INFO     Formatting data for sampleA_Dendritic cell_VS_sampleB_Dendritic cell
[2m[36m(markers_ray pid=3310944)[0m 2023-01-01 21:36:51,247 cisTopic     INFO     Computing p-value for sampleA_Dendritic cell_VS_sampleB_Dendritic cell
[2m[36m(markers_ray pid=3310945)[0m 2023-01-01 21:36:51,555 cisTopic     INFO     Com

[2m[36m(raylet)[0m Spilled 47792 MiB, 3 objects, write throughput 1784 MiB/s.


[2m[36m(markers_ray pid=3310943)[0m 2023-01-01 21:36:52,430 cisTopic     INFO     Computing p-value for sampleA_CD16+ monocyte_VS_sampleB_CD16+ monocyte
[2m[36m(markers_ray pid=3310946)[0m 2023-01-01 21:36:53,574 cisTopic     INFO     Computing log2FC for sampleB_Natural killer cell_VS_sampleA_Natural killer cell
[2m[36m(markers_ray pid=3310946)[0m 2023-01-01 21:36:55,142 cisTopic     INFO     sampleB_Natural killer cell_VS_sampleA_Natural killer cell done!
[2m[36m(markers_ray pid=3310946)[0m 2023-01-01 21:36:55,189 cisTopic     INFO     Formatting data for sampleB_CD16+ monocyte_VS_sampleA_CD16+ monocyte
[2m[36m(markers_ray pid=3310946)[0m 2023-01-01 21:36:58,114 cisTopic     INFO     Computing p-value for sampleB_CD16+ monocyte_VS_sampleA_CD16+ monocyte
[2m[36m(markers_ray pid=3310949)[0m 2023-01-01 21:37:03,326 cisTopic     INFO     Computing log2FC for sampleA_B cell_VS_sampleB_B cell
[2m[36m(markers_ray pid=3310949)[0m 2023-01-01 21:37:05,033 cisTopic     INFO

[2m[36m(raylet)[0m Spilled 63723 MiB, 4 objects, write throughput 1380 MiB/s.


[2m[36m(markers_ray pid=3310948)[0m 2023-01-01 21:37:16,737 cisTopic     INFO     Computing log2FC for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=3310949)[0m 2023-01-01 21:37:17,037 cisTopic     INFO     Computing p-value for sampleA_CD14+ monocyte_VS_sampleB_CD14+ monocyte
[2m[36m(markers_ray pid=3310948)[0m 2023-01-01 21:37:18,477 cisTopic     INFO     sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell done!
[2m[36m(markers_ray pid=3310946)[0m 2023-01-01 21:37:19,356 cisTopic     INFO     Computing log2FC for sampleB_CD16+ monocyte_VS_sampleA_CD16+ monocyte
[2m[36m(markers_ray pid=3310942)[0m 2023-01-01 21:37:20,242 cisTopic     INFO     Computing p-value for sampleB_CD14+ monocyte_VS_sampleA_CD14+ monocyte
[2m[36m(markers_ray pid=3310946)[0m 2023-01-01 21:37:20,889 cisTopic     INFO     sampleB_CD16+ monocyte_VS_sampleA_CD16+ monocyte done!
[2m[36m(markers_ray pid=3310947)[0m 2023-01-01 21:37:21,994 cisTopic     INFO     Computing log2FC for samp

[2m[36m(raylet)[0m Spilled 79653 MiB, 5 objects, write throughput 1458 MiB/s.


[2m[36m(markers_ray pid=3310947)[0m 2023-01-01 21:37:23,690 cisTopic     INFO     sampleB_CD4+ T cell_VS_sampleA_CD4+ T cell done!


[2m[36m(raylet)[0m Spilled 127448 MiB, 12 objects, write throughput 1651 MiB/s.


[2m[36m(markers_ray pid=3310949)[0m 2023-01-01 21:38:04,428 cisTopic     INFO     Computing log2FC for sampleA_CD14+ monocyte_VS_sampleB_CD14+ monocyte
[2m[36m(markers_ray pid=3310949)[0m 2023-01-01 21:38:06,075 cisTopic     INFO     sampleA_CD14+ monocyte_VS_sampleB_CD14+ monocyte done!
[2m[36m(markers_ray pid=3310942)[0m 2023-01-01 21:38:07,543 cisTopic     INFO     Computing log2FC for sampleB_CD14+ monocyte_VS_sampleA_CD14+ monocyte
[2m[36m(markers_ray pid=3310942)[0m 2023-01-01 21:38:09,193 cisTopic     INFO     sampleB_CD14+ monocyte_VS_sampleA_CD14+ monocyte done!
no DARs found for sampleA_Dendritic cell_VS_sampleB_Dendritic cell in master_sub_1.FIXEDCELLS.singlets
no DARs found for sampleB_Dendritic cell_VS_sampleA_Dendritic cell in master_sub_1.FIXEDCELLS.singlets


<IPython.core.display.Javascript object>

In [11]:
# DARs between the two `fmx_sample` groups within each Harmony-based consensus
# cell type (`harmony_consensus_cell_type`). The section heading describes the
# groups as male and female; the code itself only relies on the labels
# "sampleA" and "sampleB".
# Results go to harmony_male_female_DARs/<sample>/; a sample is skipped when
# that directory already exists.
for sample, cto_path in cistopic_obj_path_dict.items():
    out_dir = f"harmony_male_female_DARs/{sample}"
    if os.path.exists(out_dir):
        print(f"{out_dir} already exists!")
        continue

    # Verify all three inputs before loading anything. Previously a missing
    # file fell through to whatever `cto`, accessibility object or HVR list
    # was left over from an earlier iteration or cell (or raised NameError).
    acc_path = normalized_imputed_acc_obj_path_dict.get(sample)
    hvr_pkl_path = f"downstream_analysis/HVRs/{sample}__HVRs.pkl"
    if not os.path.isfile(cto_path):
        print(f"{cto_path} does not exist!")
        continue
    if acc_path is None or not os.path.isfile(acc_path):
        print(f"No imputed accessibility object found for {sample}!")
        continue
    if not os.path.exists(hvr_pkl_path):
        print("HVRs do not exist!")
        continue

    print(f"Loading {cto_path}")
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(cto_path, "rb") as f:
        cto = pickle.load(f)

    # Use the cell_data index as cell names. When the model's cell-topic
    # columns match those names pairwise on the barcode part (text before "-"
    # vs. text before "___"), relabel the columns with the full cell names.
    cto.cell_names = cto.cell_data.index
    model_barcodes = [x.split("-")[0] for x in cto.selected_model.cell_topic.columns]
    cell_barcodes = [x.split("___")[0] for x in list(cto.cell_names)]
    if model_barcodes == cell_barcodes:
        cto.selected_model.cell_topic.columns = list(cto.cell_names)

    if "fmx_sample" not in cto.cell_data.columns:
        print(f"No fmx_sample column for {sample}, skipping")
        continue

    print(f"Loading {acc_path}")
    with open(acc_path, "rb") as f:
        normalized_imputed_acc_obj = pickle.load(f)

    print(f"Loading {hvr_pkl_path}")
    with open(hvr_pkl_path, "rb") as f:
        variable_regions = pickle.load(f)

    # Grouping variable "<fmx_sample>_<cell type>" used for the contrasts.
    cto.cell_data["fmx_sample_harmony_consensus_cell_type"] = (
        cto.cell_data["fmx_sample"]
        + "_"
        + cto.cell_data["harmony_consensus_cell_type"]
    )

    # Two contrasts per cell type: A vs B and B vs A.
    contrasts = []
    for cell_type in cto.cell_data["harmony_consensus_cell_type"].unique():
        print(cell_type)
        ct_a = "sampleA_" + cell_type
        ct_b = "sampleB_" + cell_type
        contrasts += [[[ct_a], [ct_b]], [[ct_b], [ct_a]]]

    markers_dict = find_diff_features(
        cto,
        normalized_imputed_acc_obj,
        variable="fmx_sample_harmony_consensus_cell_type",
        var_features=variable_regions,
        contrasts=contrasts,
        adjpval_thr=0.05,
        log2fc_thr=np.log2(1.2),
        n_cpu=10,
    )

    # makedirs also creates the "harmony_male_female_DARs" parent, which
    # os.mkdir required to exist already.
    os.makedirs(out_dir, exist_ok=True)
    with open(f"{out_dir}/{sample}__DARs_dict_1-2xfoldchange.pkl", "wb") as f:
        pickle.dump(markers_dict, f, protocol=4)

    for contrast, dars in markers_dict.items():
        if len(dars) == 0:
            print(f"no DARs found for {contrast} in {sample}")
            continue

        label = contrast.replace(" ", "_")
        # Columns: chrom, start, end, name, Log2FC, strand ("."), adjusted p-value.
        bed = pd.DataFrame(dars.index.tolist())
        bed[[0, 1]] = bed[0].str.split(":", expand=True)
        bed[[1, 2]] = bed[1].str.split("-", expand=True)
        bed[3] = label
        bed[4] = dars["Log2FC"].reset_index(drop=True)
        bed[5] = "."
        bed[6] = dars["Adjusted_pval"].reset_index(drop=True)

        bed.to_csv(
            f"{out_dir}/{sample}__{label}__1-2xfoldchange_DARs.bed",
            sep="\t",
            header=False,
            index=False,
        )

Loading cistopic_objects/master_sub_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_20topics.dimreduc.consensus_harmony.pkl
Loading downstream_analysis/imputed_acc_objs/master_sub_1.FIXEDCELLS__imputed_acc_obs.pkl
Loading downstream_analysis/HVRs/master_sub_1.FIXEDCELLS__HVRs.pkl
CD4+ T cell
CD14+ monocyte
B cell
Cytotoxic T cell
Natural killer cell
CD16+ monocyte
Dendritic cell


2023-01-04 16:16:03,230	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(markers_ray pid=2820806)[0m 2023-01-04 16:16:10,464 cisTopic     INFO     Formatting data for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=2820814)[0m 2023-01-04 16:16:14,021 cisTopic     INFO     Formatting data for sampleB_CD4+ T cell_VS_sampleA_CD4+ T cell
[2m[36m(markers_ray pid=2820808)[0m 2023-01-04 16:16:17,459 cisTopic     INFO     Formatting data for sampleA_CD14+ monocyte_VS_sampleB_CD14+ monocyte
[2m[36m(markers_ray pid=2820811)[0m 2023-01-04 16:16:21,093 cisTopic     INFO     Formatting data for sampleB_CD14+ monocyte_VS_sampleA_CD14+ monocyte
[2m[36m(markers_ray pid=2820807)[0m 2023-01-04 16:16:24,558 cisTopic     INFO     Formatting data for sampleA_B cell_VS_sampleB_B cell
[2m[36m(markers_ray pid=2820806)[0m 2023-01-04 16:16:26,013 cisTopic     INFO     Computing p-value for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=2820808)[0m 2023-01-04 16:16:28,001 cisTopic     INFO     Computing p-value for s

[2m[36m(raylet)[0m Spilled 15930 MiB, 2 objects, write throughput 805 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.
[2m[36m(raylet)[0m Spilled 31861 MiB, 4 objects, write throughput 1285 MiB/s.
[2m[36m(raylet)[0m Spilled 47792 MiB, 5 objects, write throughput 1801 MiB/s.


[2m[36m(markers_ray pid=2820808)[0m 2023-01-04 16:17:13,694 cisTopic     INFO     Computing log2FC for sampleA_CD14+ monocyte_VS_sampleB_CD14+ monocyte
[2m[36m(markers_ray pid=2820807)[0m 2023-01-04 16:17:15,294 cisTopic     INFO     Computing log2FC for sampleA_B cell_VS_sampleB_B cell
[2m[36m(markers_ray pid=2820808)[0m 2023-01-04 16:17:15,336 cisTopic     INFO     sampleA_CD14+ monocyte_VS_sampleB_CD14+ monocyte done!
[2m[36m(markers_ray pid=2820808)[0m 2023-01-04 16:17:15,418 cisTopic     INFO     Formatting data for sampleA_Natural killer cell_VS_sampleB_Natural killer cell
[2m[36m(markers_ray pid=2820807)[0m 2023-01-04 16:17:16,938 cisTopic     INFO     sampleA_B cell_VS_sampleB_B cell done!
[2m[36m(markers_ray pid=2820807)[0m 2023-01-04 16:17:16,980 cisTopic     INFO     Formatting data for sampleB_Natural killer cell_VS_sampleA_Natural killer cell
[2m[36m(markers_ray pid=2820812)[0m 2023-01-04 16:17:16,912 cisTopic     INFO     Computing log2FC for sampleA_

[2m[36m(raylet)[0m Spilled 63723 MiB, 8 objects, write throughput 1359 MiB/s.
[2m[36m(raylet)[0m Spilled 79654 MiB, 11 objects, write throughput 1630 MiB/s.
[2m[36m(raylet)[0m Spilled 111517 MiB, 17 objects, write throughput 2203 MiB/s.


[2m[36m(markers_ray pid=2820812)[0m 2023-01-04 16:17:43,126 cisTopic     INFO     Computing log2FC for sampleA_CD16+ monocyte_VS_sampleB_CD16+ monocyte
[2m[36m(markers_ray pid=2820812)[0m 2023-01-04 16:17:44,641 cisTopic     INFO     sampleA_CD16+ monocyte_VS_sampleB_CD16+ monocyte done!
[2m[36m(markers_ray pid=2820808)[0m 2023-01-04 16:17:48,154 cisTopic     INFO     Computing log2FC for sampleA_Natural killer cell_VS_sampleB_Natural killer cell
[2m[36m(markers_ray pid=2820808)[0m 2023-01-04 16:17:49,703 cisTopic     INFO     sampleA_Natural killer cell_VS_sampleB_Natural killer cell done!
[2m[36m(markers_ray pid=2820807)[0m 2023-01-04 16:17:50,383 cisTopic     INFO     Computing log2FC for sampleB_Natural killer cell_VS_sampleA_Natural killer cell
[2m[36m(markers_ray pid=2820807)[0m 2023-01-04 16:17:51,939 cisTopic     INFO     sampleB_Natural killer cell_VS_sampleA_Natural killer cell done!
[2m[36m(markers_ray pid=2820807)[0m 2023-01-04 16:18:43,546 cisTopic    

<IPython.core.display.Javascript object>

In [17]:
# Pin the sample name and load its pickled cisTopic object into `cto` from a
# hard-coded relative path (resolved against the current working directory).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
sample = "master_sub_1.FIXEDCELLS"
with open(
    "cistopic_objects/master_sub_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_20topics.dimreduc.consensus_harmony.pkl",
    "rb",
) as f:
    cto = pickle.load(f)

<IPython.core.display.Javascript object>

In [18]:
# Load a second cisTopic object into `cto_all`. The path is absolute and points
# into a different project directory (fixedcells_4_merged, "master_all_1",
# 24-topic model) than the working directory used by the rest of the notebook.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(
    "/lustre1/project/stg_00090/scatac_benchmark/fixedcells_4_merged/cistopic_objects_master/master_all_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_24topics.dimreduc.consensus_harmony.pkl",
    "rb",
) as f:
    cto_all = pickle.load(f)

<IPython.core.display.Javascript object>

In [22]:
# Copy the `cell_type__mega` column of `cto_all` into `cto` under the name
# `seurat_cell_type__mega`.
# NOTE(review): presumably both cell_data tables are pandas DataFrames, in which
# case the assignment aligns on the index and cells of `cto` that are absent
# from `cto_all` end up as NaN - confirm before building labels from this column.
cto.cell_data["seurat_cell_type__mega"] = cto_all.cell_data["cell_type__mega"]

<IPython.core.display.Javascript object>

In [23]:
# Display the `seurat_cell_type__mega` column of `cto.cell_data` for inspection.
cto.cell_data["seurat_cell_type__mega"]

GGTTAGTTGGCTGCATTACGTACTCATA___OHS_s3atac_1.FIXEDCELLS    Cytotoxic T cell
GGTTAGTTGCCAAGGCAAGCTCATTG___OHS_s3atac_2.FIXEDCELLS      Cytotoxic T cell
GAAGAGTATTTTGGTTCTCAGCTTGTCA___OHS_s3atac_1.FIXEDCELLS    Cytotoxic T cell
GAAGAGTATTTGGCTCATATGTGTCGGA___OHS_s3atac_1.FIXEDCELLS              B cell
ATTGAGGATATATTGCAGCTCGCCGATC___OHS_s3atac_1.FIXEDCELLS    Cytotoxic T cell
                                                                ...       
TCACAGAGTGCCCGAT___VIB_10xv1_1.FIXEDCELLS                      CD4+ T cell
CAGTGCGAGAATCAAC___VIB_10xv1_2.FIXEDCELLS                           B cell
AATGCCAGTTTCGTTT___VIB_10xv1_2.FIXEDCELLS                      CD4+ T cell
TCTAGTTCAATGATGA___VIB_10xv1_1.FIXEDCELLS                      CD4+ T cell
CAATCCCCATACTGCA___VIB_10xv1_2.FIXEDCELLS                   CD14+ monocyte
Name: seurat_cell_type__mega, Length: 38648, dtype: object

<IPython.core.display.Javascript object>

In [24]:
# Differentially accessible regions (DARs) between "sampleA" and "sampleB",
# computed separately within every cell type of `sample`.
#
# Inputs from earlier cells: `sample`, `cto` (with the
# "seurat_cell_type__mega" column) and `normalized_imputed_acc_obj_path_dict`.
# Outputs: one pickle with the full result dict plus one BED-like TSV per
# contrast under seurat_male_female_DARs/<sample>/.
acc_path = normalized_imputed_acc_obj_path_dict[sample]
hvr_pkl_path = f"downstream_analysis/HVRs/{sample}__HVRs.pkl"

# Fail loudly when an input is missing: continuing would either raise a
# NameError further down or, worse, silently reuse objects left in the kernel
# by a previous cell.
if not os.path.isfile(acc_path):
    raise FileNotFoundError(f"Imputed accessibility object not found: {acc_path}")
if not os.path.exists(hvr_pkl_path):
    raise FileNotFoundError(f"HVRs do not exist! Expected {hvr_pkl_path}")

print(f"Loading {acc_path}")
with open(acc_path, "rb") as f:
    normalized_imputed_acc_obj = pickle.load(f)

print(f"Loading {hvr_pkl_path}")
with open(hvr_pkl_path, "rb") as f:
    variable_regions = pickle.load(f)

# Combined grouping variable "<fmx_sample>_<cell type>" used for the contrasts.
cto.cell_data["fmx_sample_seurat_cell_type"] = (
    cto.cell_data["fmx_sample"] + "_" + cto.cell_data["seurat_cell_type__mega"]
)

# Two contrasts per cell type: A vs B and B vs A.
# NOTE(review): assumes the values of "fmx_sample" are exactly "sampleA" and
# "sampleB" - confirm against cto.cell_data["fmx_sample"].unique().
contrasts = []
# dropna(): cells without a transferred label would otherwise yield NaN here
# and break the string concatenation below.
for cell_type in cto.cell_data["seurat_cell_type__mega"].dropna().unique():
    print(cell_type)
    ct_a = "sampleA_" + cell_type
    ct_b = "sampleB_" + cell_type
    contrasts.extend([[[ct_a], [ct_b]], [[ct_b], [ct_a]]])

markers_dict = find_diff_features(
    cto,
    normalized_imputed_acc_obj,
    variable="fmx_sample_seurat_cell_type",
    var_features=variable_regions,
    contrasts=contrasts,
    adjpval_thr=0.05,
    log2fc_thr=np.log2(1.2),
    n_cpu=10,
)

# makedirs (not mkdir) so the parent "seurat_male_female_DARs" directory is
# created as well on a fresh checkout.
dar_dir = f"seurat_male_female_DARs/{sample}"
os.makedirs(dar_dir, exist_ok=True)

with open(f"{dar_dir}/{sample}__DARs_dict_1-2xfoldchange.pkl", "wb") as f:
    pickle.dump(markers_dict, f, protocol=4)

# Write one headerless, tab-separated file per contrast with the columns
# chrom, start, end, contrast name, Log2FC, ".", adjusted p-value.
for contrast in markers_dict.keys():
    contrast_dars = markers_dict[contrast]
    if len(contrast_dars) == 0:
        print(f"no DARs found for {contrast} in {sample}")
        continue

    contrast_label = contrast.replace(" ", "_")
    # Region names have the form "chrom:start-end".
    chrom_and_range = pd.Series(contrast_dars.index.tolist()).str.split(
        ":", expand=True
    )
    start_end = chrom_and_range[1].str.split("-", expand=True)
    bed_df = pd.DataFrame(
        {
            0: chrom_and_range[0],
            1: start_end[0],
            2: start_end[1],
            3: contrast_label,
            4: contrast_dars["Log2FC"].to_numpy(),
            5: ".",
            6: contrast_dars["Adjusted_pval"].to_numpy(),
        }
    )
    bed_df.to_csv(
        f"{dar_dir}/{sample}__{contrast_label}__1-2xfoldchange_DARs.bed",
        sep="\t",
        header=False,
        index=False,
    )

Loading downstream_analysis/imputed_acc_objs/master_sub_1.FIXEDCELLS__imputed_acc_obs.pkl
Loading downstream_analysis/HVRs/master_sub_1.FIXEDCELLS__HVRs.pkl
Cytotoxic T cell
B cell
CD14+ monocyte
CD4+ T cell
Natural killer cell
Dendritic cell
CD16+ monocyte


2023-01-04 17:25:01,425	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(markers_ray pid=2827682)[0m 2023-01-04 17:25:09,284 cisTopic     INFO     Formatting data for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=2827690)[0m 2023-01-04 17:25:12,754 cisTopic     INFO     Formatting data for sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell
[2m[36m(markers_ray pid=2827689)[0m 2023-01-04 17:25:16,130 cisTopic     INFO     Formatting data for sampleA_B cell_VS_sampleB_B cell
[2m[36m(markers_ray pid=2827684)[0m 2023-01-04 17:25:19,655 cisTopic     INFO     Formatting data for sampleB_B cell_VS_sampleA_B cell
[2m[36m(markers_ray pid=2827689)[0m 2023-01-04 17:25:21,104 cisTopic     INFO     Computing p-value for sampleA_B cell_VS_sampleB_B cell
[2m[36m(markers_ray pid=2827686)[0m 2023-01-04 17:25:23,046 cisTopic     INFO     Formatting data for sampleA_CD14+ monocyte_VS_sampleB_CD14+ monocyte
[2m[36m(markers_ray pid=2827684)[0m 2023-01-04 17:25:24,765 cisTopic     INFO     Computing p-value for sampleB

[2m[36m(raylet)[0m Spilled 15930 MiB, 1 objects, write throughput 786 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.
[2m[36m(raylet)[0m Spilled 31861 MiB, 2 objects, write throughput 1414 MiB/s.
[2m[36m(raylet)[0m Spilled 47792 MiB, 3 objects, write throughput 1758 MiB/s.
[2m[36m(raylet)[0m Spilled 63723 MiB, 4 objects, write throughput 1926 MiB/s.


[2m[36m(markers_ray pid=2827684)[0m 2023-01-04 17:26:10,653 cisTopic     INFO     Formatting data for sampleB_Natural killer cell_VS_sampleA_Natural killer cell
[2m[36m(markers_ray pid=2827684)[0m 2023-01-04 17:26:13,600 cisTopic     INFO     Computing p-value for sampleB_Natural killer cell_VS_sampleA_Natural killer cell
[2m[36m(markers_ray pid=2827689)[0m 2023-01-04 17:26:14,831 cisTopic     INFO     Computing log2FC for sampleA_Natural killer cell_VS_sampleB_Natural killer cell
[2m[36m(markers_ray pid=2827689)[0m 2023-01-04 17:26:16,370 cisTopic     INFO     sampleA_Natural killer cell_VS_sampleB_Natural killer cell done!
[2m[36m(markers_ray pid=2827689)[0m 2023-01-04 17:26:16,424 cisTopic     INFO     Formatting data for sampleA_Dendritic cell_VS_sampleB_Dendritic cell
[2m[36m(markers_ray pid=2827689)[0m 2023-01-04 17:26:17,797 cisTopic     INFO     Computing p-value for sampleA_Dendritic cell_VS_sampleB_Dendritic cell
[2m[36m(markers_ray pid=2827686)[0m 2023-0

[2m[36m(raylet)[0m Spilled 79654 MiB, 7 objects, write throughput 1545 MiB/s.


[2m[36m(markers_ray pid=2827686)[0m 2023-01-04 17:26:42,497 cisTopic     INFO     sampleB_Dendritic cell_VS_sampleA_Dendritic cell done!
[2m[36m(markers_ray pid=2827690)[0m 2023-01-04 17:26:42,766 cisTopic     INFO     Computing log2FC for sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell
[2m[36m(markers_ray pid=2827690)[0m 2023-01-04 17:26:44,703 cisTopic     INFO     sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell done!


[2m[36m(raylet)[0m Spilled 95585 MiB, 10 objects, write throughput 1582 MiB/s.


[2m[36m(markers_ray pid=2827690)[0m 2023-01-04 17:28:04,835 cisTopic     INFO     Formatting data for sampleA_CD16+ monocyte_VS_sampleB_CD16+ monocyte
[2m[36m(markers_ray pid=2827686)[0m 2023-01-04 17:28:05,297 cisTopic     INFO     Formatting data for sampleB_CD16+ monocyte_VS_sampleA_CD16+ monocyte
[2m[36m(markers_ray pid=2827686)[0m 2023-01-04 17:28:06,116 cisTopic     INFO     Computing p-value for sampleB_CD16+ monocyte_VS_sampleA_CD16+ monocyte
[2m[36m(markers_ray pid=2827690)[0m 2023-01-04 17:28:14,309 cisTopic     INFO     Computing p-value for sampleA_CD16+ monocyte_VS_sampleB_CD16+ monocyte
[2m[36m(markers_ray pid=2827686)[0m 2023-01-04 17:28:24,060 cisTopic     INFO     Computing log2FC for sampleB_CD16+ monocyte_VS_sampleA_CD16+ monocyte
[2m[36m(markers_ray pid=2827686)[0m 2023-01-04 17:28:25,545 cisTopic     INFO     sampleB_CD16+ monocyte_VS_sampleA_CD16+ monocyte done!
[2m[36m(markers_ray pid=2827690)[0m 2023-01-04 17:28:32,127 cisTopic     INFO     

<IPython.core.display.Javascript object>

# Calculate DARs with Harmony

In [14]:
# Per-cell-type DARs ("consensus_cell_type", each type against the rest since
# contrasts=None is passed) for every cisTopic object in
# `cistopic_obj_path_dict`. For each sample this writes a pickle with the full
# result dict and, per cell type, a BED-like TSV plus a file holding its first
# 2000 rows.
for sample in cistopic_obj_path_dict.keys():
    dar_dir = f"downstream_analysis/harmony_DARs/{sample}"
    # NOTE(review): only samples whose output directory ALREADY exists are
    # processed; the binarization cell earlier in the notebook uses the
    # opposite test (`if not os.path.exists`). Confirm which one is intended.
    if not os.path.exists(dar_dir):
        continue

    cto_path = cistopic_obj_path_dict[sample]
    if not os.path.isfile(cto_path):
        continue

    # Check the remaining inputs before loading anything: previously a missing
    # file let the loop carry on with the objects of the previous sample.
    acc_path = normalized_imputed_acc_obj_path_dict[sample]
    hvr_pkl_path = f"downstream_analysis/HVRs/{sample}__HVRs.pkl"
    if not os.path.isfile(acc_path):
        print(f"{acc_path} does not exist, skipping {sample}")
        continue
    if not os.path.exists(hvr_pkl_path):
        print("HVRs do not exist!")
        continue

    print(f"Loading {cto_path}")
    with open(cto_path, "rb") as f:
        cto = pickle.load(f)

    # Make the cell names of the object and of the selected model agree: when
    # the barcode parts match one-to-one, the model columns are renamed to the
    # full cell names.
    cto.cell_names = cto.cell_data.index
    model_barcodes = [
        x.split("-")[0] for x in cto.selected_model.cell_topic.columns
    ]
    object_barcodes = [x.split("___")[0] for x in list(cto.cell_names)]
    if model_barcodes == object_barcodes:
        cto.selected_model.cell_topic.columns = list(cto.cell_names)

    print(f"Loading {acc_path}")
    with open(acc_path, "rb") as f:
        normalized_imputed_acc_obj = pickle.load(f)

    print(f"Loading {hvr_pkl_path}")
    with open(hvr_pkl_path, "rb") as f:
        variable_regions = pickle.load(f)

    markers_dict = find_diff_features(
        cto,
        normalized_imputed_acc_obj,
        variable="consensus_cell_type",
        var_features=variable_regions,
        contrasts=None,
        adjpval_thr=0.05,
        log2fc_thr=np.log2(1.5),
        n_cpu=16,
    )

    # Ensure the output directory exists before the first write (a no-op under
    # the guard above, but keeps the cell working if the guard is changed).
    os.makedirs(dar_dir, exist_ok=True)
    with open(f"{dar_dir}/{sample}__DARs_dict.pkl", "wb") as f:
        pickle.dump(markers_dict, f, protocol=4)

    # Headerless, tab-separated output with the columns
    # chrom, start, end, cell type, Log2FC, ".", adjusted p-value.
    for cell_type in markers_dict.keys():
        cell_type_dars = markers_dict[cell_type]
        if len(cell_type_dars) == 0:
            print(f"no DARs found for {cell_type} in {sample}")
            continue

        cell_type_label = cell_type.replace(" ", "_")
        # Region names have the form "chrom:start-end".
        chrom_and_range = pd.Series(cell_type_dars.index.tolist()).str.split(
            ":", expand=True
        )
        start_end = chrom_and_range[1].str.split("-", expand=True)
        bed_df = pd.DataFrame(
            {
                0: chrom_and_range[0],
                1: start_end[0],
                2: start_end[1],
                3: cell_type_label,
                4: cell_type_dars["Log2FC"].to_numpy(),
                5: ".",
                6: cell_type_dars["Adjusted_pval"].to_numpy(),
            }
        )

        bed_df.to_csv(
            f"{dar_dir}/{sample}__{cell_type_label}__DARs.bed",
            sep="\t",
            header=False,
            index=False,
        )
        # First 2000 rows, in the order returned by find_diff_features.
        bed_df.iloc[:2000].to_csv(
            f"{dar_dir}/{sample}__{cell_type_label}__DARs.TOP2k.bed",
            sep="\t",
            header=False,
            index=False,
        )

Loading cistopic_objects/master_sub_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_20topics.dimreduc.consensus_harmony.pkl
Loading downstream_analysis/imputed_acc_objs/master_sub_1.FIXEDCELLS__imputed_acc_obs.pkl
Loading downstream_analysis/HVRs/master_sub_1.FIXEDCELLS__HVRs.pkl


2023-01-02 20:49:16,368	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(markers_ray pid=86873)[0m 2023-01-02 20:49:23,640 cisTopic     INFO     Formatting data for B cell
[2m[36m(markers_ray pid=86872)[0m 2023-01-02 20:49:27,099 cisTopic     INFO     Formatting data for CD14+ monocyte
[2m[36m(markers_ray pid=86876)[0m 2023-01-02 20:49:30,691 cisTopic     INFO     Formatting data for CD16+ monocyte
[2m[36m(markers_ray pid=86878)[0m 2023-01-02 20:49:34,086 cisTopic     INFO     Formatting data for CD4+ T cell
[2m[36m(markers_ray pid=86879)[0m 2023-01-02 20:49:37,385 cisTopic     INFO     Formatting data for Cytotoxic T cell
[2m[36m(markers_ray pid=86880)[0m 2023-01-02 20:49:40,979 cisTopic     INFO     Formatting data for Dendritic cell
[2m[36m(markers_ray pid=86884)[0m 2023-01-02 20:49:44,466 cisTopic     INFO     Formatting data for Natural killer cell
[2m[36m(markers_ray pid=86873)[0m 2023-01-02 20:50:35,409 cisTopic     INFO     Computing p-value for B cell
[2m[36m(markers_ray pid=86872)[0m 2023-01-02 20:50:37,054 cisTo

<IPython.core.display.Javascript object>

In [1]:
# NOTE(review): looks like a leftover scratch assignment; `a` is not used in
# the visible part of the notebook - confirm and remove this cell.
a = 1