# pycisTopic analysis

Full dataset, using consensus peak regions.

In [2]:
import pycisTopic

%load_ext nb_black
import warnings

warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")
import pickle
import pandas as pd
import os

<IPython.core.display.Javascript object>

In [3]:
!pwd

/lustre1/project/stg_00090/scatac_benchmark/fixedcells_7_merged_equalcells_celltypefair


<IPython.core.display.Javascript object>

In [4]:
wdir = "/lustre1/project/stg_00090/scatac_benchmark/fixedcells_7_merged_equalcells_celltypefair"
os.chdir(wdir)

<IPython.core.display.Javascript object>

In [5]:
# create output directory:
f_final_dir = os.path.join(wdir, "downstream_analysis")
if not os.path.exists(f_final_dir):
    os.makedirs(f_final_dir)

<IPython.core.display.Javascript object>

In [6]:
import glob

<IPython.core.display.Javascript object>

## Save/load cisTopic objects

In [7]:
cto_consensus_paths = sorted(glob.glob(f"cistopic_objects/*topics.pkl"))
cistopic_obj_path_dict = {
    x.split("/")[-1].split(f"__")[0]: x for x in cto_consensus_paths
}
cistopic_obj_path_dict

{'master_celltypefair_1.FIXEDCELLS': 'cistopic_objects/master_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_28topics.pkl'}

<IPython.core.display.Javascript object>

## Topic binarization & QC

In [8]:
from pycisTopic.topic_binarization import binarize_topics
from pycisTopic.topic_qc import compute_topic_metrics, plot_topic_qc
from pycisTopic.utils import fig2img
import matplotlib.pyplot as plt
from pycisTopic.topic_qc import topic_annotation
from pycisTopic.diff_features import (
    impute_accessibility,
    normalize_scores,
    find_highly_variable_features,
    find_diff_features,
)
from pycisTopic.clust_vis import plot_imputed_features
import numpy as np
import copy

<IPython.core.display.Javascript object>

### Binarize the topic-region distributions

In [8]:
for sample in cistopic_obj_path_dict.keys():
    # for sample  in ['SAN_10xmultiome_1.FIXEDCELLS']:
    path = cistopic_obj_path_dict[sample]
    if not os.path.exists(f"downstream_analysis/DARs/{sample}"):
        if os.path.isfile(path):
            print(f"Loading {path}")
            with open(path, "rb") as f:
                cto = pickle.load(f)

            cto.cell_names = cto.cell_data.index
            if [x.split("-")[0] for x in cto.selected_model.cell_topic.columns] == [
                x.split("___")[0] for x in list(cto.cell_names)
            ]:
                cto.selected_model.cell_topic.columns = list(cto.cell_names)

            region_bin_topics = binarize_topics(
                cto,
                method="otsu",
                ntop=3000,
                plot=True,
                num_columns=6,
                save=f"plots_qc/{sample}__topic_region_binarized.png",
            )

            with open(
                f"downstream_analysis/binarized_topics/{sample}__topic_region_binarized.pkl",
                "wb",
            ) as f:
                pickle.dump(region_bin_topics, f, protocol=4)

            binarized_cell_topics = binarize_topics(
                cto,
                target="cell",
                method="li",
                plot=True,
                num_columns=5,
                nbins=100,
                save=f"plots_qc/{sample}__cells_topic_binarized.png",
            )

            with open(
                f"downstream_analysis/binarized_topics/{sample}__cells_topic_binarized.pkl",
                "wb",
            ) as f:
                pickle.dump(region_bin_topics, f, protocol=4)

            topic_qc_metrics = compute_topic_metrics(cto)

            with open(
                f"downstream_analysis/binarized_topics/{sample}__topic_qc.pkl", "wb"
            ) as f:
                pickle.dump(topic_qc_metrics, f, protocol=4)

            fig_dict = {
                "CoherenceVSAssignments": plot_topic_qc(
                    topic_qc_metrics,
                    var_x="Coherence",
                    var_y="Log10_Assignments",
                    var_color="Gini_index",
                    plot=False,
                    return_fig=True,
                ),
                "AssignmentsVSCells_in_bin": plot_topic_qc(
                    topic_qc_metrics,
                    var_x="Log10_Assignments",
                    var_y="Cells_in_binarized_topic",
                    var_color="Gini_index",
                    plot=False,
                    return_fig=True,
                ),
                "CoherenceVSCells_in_bin": plot_topic_qc(
                    topic_qc_metrics,
                    var_x="Coherence",
                    var_y="Cells_in_binarized_topic",
                    var_color="Gini_index",
                    plot=False,
                    return_fig=True,
                ),
                "CoherenceVSRegions_in_bin": plot_topic_qc(
                    topic_qc_metrics,
                    var_x="Coherence",
                    var_y="Regions_in_binarized_topic",
                    var_color="Gini_index",
                    plot=False,
                    return_fig=True,
                ),
                "CoherenceVSMarginal_dist": plot_topic_qc(
                    topic_qc_metrics,
                    var_x="Coherence",
                    var_y="Marginal_topic_dist",
                    var_color="Gini_index",
                    plot=False,
                    return_fig=True,
                ),
                "CoherenceVSGini_index": plot_topic_qc(
                    topic_qc_metrics,
                    var_x="Coherence",
                    var_y="Gini_index",
                    var_color="Gini_index",
                    plot=False,
                    return_fig=True,
                ),
            }

            fig = plt.figure(figsize=(40, 43))
            i = 1
            for fig_ in fig_dict.keys():
                plt.subplot(2, 3, i)
                img = fig2img(
                    fig_dict[fig_]
                )  # To convert figures to png to plot together, see .utils.py. This converts the figure to png.
                plt.imshow(img)
                plt.axis("off")
                i += 1
            plt.subplots_adjust(wspace=0, hspace=-0.70)
            # plt.tight_layout()
            plt.show()
            plt.savefig(
                f"plots_qc/{sample}__topic_qc_metrics.png", facecolor="white", dpi=150
            )

            topic_annot = topic_annotation(
                cto,
                annot_var="harmony_consensus_cell_type__mega",
                binarized_cell_topic=binarized_cell_topics,
                general_topic_thr=0.2,
            )
            topic_qc_metrics = pd.concat(
                [
                    topic_annot[
                        [
                            "harmony_consensus_cell_type__mega",
                            "Ratio_cells_in_topic",
                            "Ratio_group_in_population",
                        ]
                    ],
                    topic_qc_metrics,
                ],
                axis=1,
            )

            imputed_acc_obj = impute_accessibility(
                cto, selected_cells=None, selected_regions=None, scale_factor=10**6
            )

            normalized_imputed_acc_obj = normalize_scores(
                imputed_acc_obj, scale_factor=10**4
            )

            with open(
                f"downstream_analysis/imputed_acc_objs/{sample}__imputed_acc_obs.pkl",
                "wb",
            ) as f:
                pickle.dump(normalized_imputed_acc_obj, f, protocol=4)

            variable_regions = find_highly_variable_features(
                normalized_imputed_acc_obj,
                min_disp=0.05,
                min_mean=0.0125,
                max_mean=3,
                max_disp=np.inf,
                n_bins=20,
                n_top_features=None,
                plot=True,
                save=f"plots_qc/{sample}__HVR.png",
            )

            print(f"Found {len(variable_regions)} variable regions")
            with open(f"downstream_analysis/HVRs/{sample}__HVRs.bed", "w") as f:
                for line in [
                    x.replace(":", "\t").replace("-", "\t") for x in variable_regions
                ]:
                    f.write(f"{line}\n")

            with open(f"downstream_analysis/HVRs/{sample}__HVRs.pkl", "wb") as f:
                pickle.dump(variable_regions, f, protocol=4)

            if not os.path.exists(f"downstream_analysis/DARs/{sample}"):
                os.mkdir(f"downstream_analysis/DARs/{sample}")

            markers_dict = find_diff_features(
                cto,
                normalized_imputed_acc_obj,
                variable="harmony_consensus_cell_type__mega",
                var_features=variable_regions,
                contrasts=None,
                adjpval_thr=0.05,
                log2fc_thr=np.log2(1.5),
                n_cpu=16,
            )
            with open(
                f"downstream_analysis/DARs/{sample}/{sample}__DARs_dict.pkl", "wb"
            ) as f:
                pickle.dump(markers_dict, f, protocol=4)

            for cell_type in markers_dict.keys():
                markers = markers_dict[cell_type].index.tolist()
                df = pd.DataFrame(markers)
                if len(df) == 0:
                    print(f"no DARs found for {cell_type} in {sample}")
                else:
                    df[[0, 1]] = df[0].str.split(":", expand=True)
                    df[[1, 2]] = df[1].str.split("-", expand=True)
                    df[3] = markers_dict[cell_type].index.tolist()
                    df[3] = cell_type.replace(" ", "_")  # + '_' + df[3].astype(str)
                    score = markers_dict[cell_type]["Log2FC"]
                    score = score.reset_index(drop=True)
                    df[4] = score
                    df[5] = "."
                    pval = markers_dict[cell_type]["Adjusted_pval"]
                    pval = pval.reset_index(drop=True)
                    df[6] = pval
                    name = cell_type.replace(" ", "_")
                    # name = name.replace('/', '-')
                    if not os.path.exists(f"downstream_analysis/DARs/{sample}"):
                        os.mkdir(f"downstream_analysis/DARs/{sample}")

                    df.to_csv(
                        f'downstream_analysis/DARs/{sample}/{sample}__{cell_type.replace(" ", "_")}__DARs.bed',
                        sep="\t",
                        header=False,
                        index=False,
                    )
                    df[0:2000].to_csv(
                        f'downstream_analysis/DARs/{sample}/{sample}__{cell_type.replace(" ", "_")}__DARs.TOP2k.bed',
                        sep="\t",
                        header=False,
                        index=False,
                    )

        else:
            print(f"{path} does not exist!")
    else:
        print(f"downstream_analysis/DARs/{sample} already exists!")

downstream_analysis/DARs/master_celltypefair_1.FIXEDCELLS already exists!


<IPython.core.display.Javascript object>

# Calculate DARs between male and female

In [10]:
normalized_imputed_acc_obj_path_dict = {
    x.split("/")[-1].split(f"__")[0]: x
    for x in sorted(
        glob.glob(f"downstream_analysis/imputed_acc_objs/*imputed_acc_obs.pkl")
    )
}
normalized_imputed_acc_obj_path_dict

{'master_celltypefair_1.FIXEDCELLS': 'downstream_analysis/imputed_acc_objs/master_celltypefair_1.FIXEDCELLS__imputed_acc_obs.pkl'}

<IPython.core.display.Javascript object>

In [11]:
hvr_path_dict = {
    x.split("/")[-1].split(f"__")[0]: x
    for x in sorted(glob.glob(f"downstream_analysis/HVRs/*bed"))
}
hvr_path_dict

{'master_celltypefair_1.FIXEDCELLS': 'downstream_analysis/HVRs/master_celltypefair_1.FIXEDCELLS__HVRs.bed'}

<IPython.core.display.Javascript object>

In [12]:
cto_consensus_paths = sorted(glob.glob(f"cistopic_objects/*topics.pkl"))
cistopic_obj_path_dict = {
    x.split("/")[-1].split(f"__")[0]: x for x in cto_consensus_paths
}
cistopic_obj_path_dict

{'master_celltypefair_1.FIXEDCELLS': 'cistopic_objects/master_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_28topics.pkl'}

<IPython.core.display.Javascript object>

In [13]:
sample = "master_celltypefair_1.FIXEDCELLS"
with open(
    cistopic_obj_path_dict[sample],
    "rb",
) as f:
    cto = pickle.load(f)

<IPython.core.display.Javascript object>

In [None]:
acc_path = normalized_imputed_acc_obj_path_dict[sample]

if os.path.isfile(acc_path):
    print(f"Loading {acc_path}")
    with open(acc_path, "rb") as f:
        normalized_imputed_acc_obj = pickle.load(f)

hvr_pkl_path = f"downstream_analysis/HVRs/{sample}__HVRs.pkl"

if os.path.exists(hvr_pkl_path):
    print(f"Loading {hvr_pkl_path}")
    with open(hvr_pkl_path, "rb") as f:
        variable_regions = pickle.load(f)
else:
    print("HVRs do not exist!")

cto.cell_data["fmx_sample_harmony_consensus_cell_type__mega"] = (
    cto.cell_data["fmx_sample"] + "_" + cto.cell_data["harmony_consensus_cell_type__mega"]
)

contrasts = []
for cell_type in cto.cell_data["harmony_consensus_cell_type__mega"].unique():
    print(cell_type)
    ct_a = "sampleA_" + cell_type
    ct_b = "sampleB_" + cell_type
    contrast = [[[ct_a], [ct_b]], [[ct_b], [ct_a]]]
    contrasts = contrasts + contrast

markers_dict = find_diff_features(
    cto,
    normalized_imputed_acc_obj,
    variable="fmx_sample_harmony_consensus_cell_type__mega",
    var_features=variable_regions,
    contrasts=contrasts,
    adjpval_thr=0.05,
    log2fc_thr=np.log2(1.2),
    n_cpu=10,
)

if not os.path.exists(f"harmony_consensus_cell_type__mega_male_female_DARs/{sample}"):
    os.mkdir(f"harmony_consensus_cell_type__mega_male_female_DARs/{sample}")

with open(
    f"harmony_consensus_cell_type__mega_male_female_DARs/{sample}/{sample}__DARs_dict_1-2xfoldchange.pkl",
    "wb",
) as f:
    pickle.dump(markers_dict, f, protocol=4)

for contrast in markers_dict.keys():
    markers = markers_dict[contrast].index.tolist()
    df = pd.DataFrame(markers)
    if len(df) == 0:
        print(f"no DARs found for {contrast} in {sample}")
    else:
        df[[0, 1]] = df[0].str.split(":", expand=True)
        df[[1, 2]] = df[1].str.split("-", expand=True)
        df[3] = markers_dict[contrast].index.tolist()
        df[3] = contrast.replace(" ", "_")  # + '_' + df[3].astype(str)
        score = markers_dict[contrast]["Log2FC"]
        score = score.reset_index(drop=True)
        df[4] = score
        df[5] = "."
        pval = markers_dict[contrast]["Adjusted_pval"]
        pval = pval.reset_index(drop=True)
        df[6] = pval
        name = contrast.replace(" ", "_")

        df.to_csv(
            f'harmony_consensus_cell_type__mega_male_female_DARs/{sample}/{sample}__{contrast.replace(" ", "_")}__1-2xfoldchange_DARs.bed',
            sep="\t",
            header=False,
            index=False,
        )

In [None]:
if not os.path.exists(f"harmony_consensus_cell_type__mega_male_female_DARs/{sample}"):
    os.mkdir(f"harmony_consensus_cell_type__mega_male_female_DARs/{sample}")

with open(
    f"harmony_consensus_cell_type__mega_male_female_DARs/{sample}/{sample}__DARs_dict_1-2xfoldchange.pkl",
    "wb",
) as f:
    pickle.dump(markers_dict, f, protocol=4)

for contrast in markers_dict.keys():
    markers = markers_dict[contrast].index.tolist()
    df = pd.DataFrame(markers)
    if len(df) == 0:
        print(f"no DARs found for {contrast} in {sample}")
    else:
        df[[0, 1]] = df[0].str.split(":", expand=True)
        df[[1, 2]] = df[1].str.split("-", expand=True)
        df[3] = markers_dict[contrast].index.tolist()
        df[3] = contrast.replace(" ", "_")  # + '_' + df[3].astype(str)
        score = markers_dict[contrast]["Log2FC"]
        score = score.reset_index(drop=True)
        df[4] = score
        df[5] = "."
        pval = markers_dict[contrast]["Adjusted_pval"]
        pval = pval.reset_index(drop=True)
        df[6] = pval
        name = contrast.replace(" ", "_")

        df.to_csv(
            f'harmony_consensus_cell_type__mega_male_female_DARs/{sample}/{sample}__{contrast.replace(" ", "_")}__1-2xfoldchange_DARs.bed',
            sep="\t",
            header=False,
            index=False,
        )

no DARs found for sampleA_Natural killer cell_VS_sampleB_Natural killer cell in master_celltypefair_1.FIXEDCELLS
no DARs found for sampleB_Natural killer cell_VS_sampleA_Natural killer cell in master_celltypefair_1.FIXEDCELLS
no DARs found for sampleA_CD16+ monocyte_VS_sampleB_CD16+ monocyte in master_celltypefair_1.FIXEDCELLS
no DARs found for sampleA_Dendritic cell_VS_sampleB_Dendritic cell in master_celltypefair_1.FIXEDCELLS
no DARs found for sampleB_Dendritic cell_VS_sampleA_Dendritic cell in master_celltypefair_1.FIXEDCELLS


<IPython.core.display.Javascript object>

## Calculate DARs for each technology here

In [15]:
acc_path = normalized_imputed_acc_obj_path_dict[sample]

if os.path.isfile(acc_path):
    print(f"Loading {acc_path}")
    with open(acc_path, "rb") as f:
        normalized_imputed_acc_obj = pickle.load(f)

hvr_pkl_path = f"downstream_analysis/HVRs/{sample}__HVRs.pkl"

if os.path.exists(hvr_pkl_path):
    print(f"Loading {hvr_pkl_path}")
    with open(hvr_pkl_path, "rb") as f:
        variable_regions = pickle.load(f)
else:
    print("HVRs do not exist!")

Loading downstream_analysis/imputed_acc_objs/master_celltypefair_1.FIXEDCELLS__imputed_acc_obs.pkl
Loading downstream_analysis/HVRs/master_celltypefair_1.FIXEDCELLS__HVRs.pkl


<IPython.core.display.Javascript object>

In [19]:
cto.cell_data["harmony_consensus_cell_type__mega_bytech"] = cto.cell_data["harmony_consensus_cell_type__mega"]  + '__' +  cto.cell_data["tech"]

<IPython.core.display.Javascript object>

In [20]:
cto.cell_data["harmony_consensus_cell_type__mega_bytech"]

GAAGCAGCGGAATGATGCTCGTTCCAAT___OHS_s3atac_1.FIXEDCELLS    Cytotoxic T cell__s3atac
AGGCAGAATGGTATTATTGGTCACGA___OHS_s3atac_2.FIXEDCELLS      Cytotoxic T cell__s3atac
ATTGAGGAGCCAAGGCAAAGTACTCC___OHS_s3atac_2.FIXEDCELLS      Cytotoxic T cell__s3atac
CTCTCTACTGAACGGCGTATCTGCCA___OHS_s3atac_2.FIXEDCELLS      Cytotoxic T cell__s3atac
GAAGAGTATTGGCATAACCGCGCCGATC___OHS_s3atac_1.FIXEDCELLS    Cytotoxic T cell__s3atac
                                                                    ...           
TGATGCACAAGATTAG___VIB_10xv1_2.FIXEDCELLS                    Dendritic cell__10xv1
AAAGGATTCCACACCT___VIB_10xv1_2.FIXEDCELLS                    Dendritic cell__10xv1
TGAGCCGGTTTGACCA___VIB_10xv1_2.FIXEDCELLS                    Dendritic cell__10xv1
AGACAAAGTCACAGGA___VIB_10xv1_2.FIXEDCELLS                    Dendritic cell__10xv1
CACCTGTTCTCTATTG___VIB_10xv1_2.FIXEDCELLS                    Dendritic cell__10xv1
Name: harmony_consensus_cell_type__mega_bytech, Length: 35456, dtype: object

<IPython.core.display.Javascript object>

In [43]:
list(set(cto.cell_data[cto.cell_data["tech"] == tech]["harmony_consensus_cell_type__mega_bytech"].unique()) - set([cell_type]))

['B cell__10xv1',
 'CD16+ monocyte__10xv1',
 'Cytotoxic T cell__10xv1',
 'Natural killer cell__10xv1',
 'CD14+ monocyte__10xv1',
 'CD4+ T cell__10xv1']

<IPython.core.display.Javascript object>

In [49]:
contrasts = []
for tech in cto.cell_data["tech"].unique():
    print(tech)
    for cell_type in cto.cell_data[cto.cell_data["tech"] == tech]["harmony_consensus_cell_type__mega_bytech"].unique():
        print("\t" + cell_type)
        contrast_fg = cell_type
        contrast_bg = list(set(cto.cell_data[cto.cell_data["tech"] == tech]["harmony_consensus_cell_type__mega_bytech"].unique()) - set([cell_type]))
        contrast = [[[contrast_fg], contrast_bg]]
        contrasts = contrasts + contrast

s3atac
	Cytotoxic T cell__s3atac
	CD4+ T cell__s3atac
	Natural killer cell__s3atac
	CD14+ monocyte__s3atac
	B cell__s3atac
	CD16+ monocyte__s3atac
	Dendritic cell__s3atac
10xmultiome
	Cytotoxic T cell__10xmultiome
	CD4+ T cell__10xmultiome
	Natural killer cell__10xmultiome
	CD14+ monocyte__10xmultiome
	B cell__10xmultiome
	CD16+ monocyte__10xmultiome
	Dendritic cell__10xmultiome
10xv11
	Cytotoxic T cell__10xv11
	CD4+ T cell__10xv11
	Natural killer cell__10xv11
	CD14+ monocyte__10xv11
	B cell__10xv11
	CD16+ monocyte__10xv11
	Dendritic cell__10xv11
hydrop
	Cytotoxic T cell__hydrop
	CD4+ T cell__hydrop
	Natural killer cell__hydrop
	CD14+ monocyte__hydrop
	B cell__hydrop
	CD16+ monocyte__hydrop
	Dendritic cell__hydrop
10xv2
	Cytotoxic T cell__10xv2
	CD4+ T cell__10xv2
	Natural killer cell__10xv2
	CD14+ monocyte__10xv2
	B cell__10xv2
	CD16+ monocyte__10xv2
	Dendritic cell__10xv2
ddseq
	Cytotoxic T cell__ddseq
	CD4+ T cell__ddseq
	Natural killer cell__ddseq
	CD14+ monocyte__ddseq
	B cell__dd

<IPython.core.display.Javascript object>

In [50]:
markers_dict = find_diff_features(
    cto,
    normalized_imputed_acc_obj,
    variable="harmony_consensus_cell_type__mega_bytech",
    var_features=variable_regions,
    contrasts=contrasts,
    adjpval_thr=0.05,
    log2fc_thr=np.log2(1.2),
    n_cpu=10,
)

2023-01-30 14:11:30,563	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(markers_ray pid=1306574)[0m 2023-01-30 14:11:36,846 cisTopic     INFO     Formatting data for Cytotoxic T cell__s3atac_VS_CD4+ T cell__s3atac_B cell__s3atac_Natural killer cell__s3atac_Dendritic cell__s3atac_CD14+ monocyte__s3atac_CD16+ monocyte__s3atac
[2m[36m(markers_ray pid=1306578)[0m 2023-01-30 14:11:39,164 cisTopic     INFO     Formatting data for CD4+ T cell__s3atac_VS_B cell__s3atac_Natural killer cell__s3atac_Cytotoxic T cell__s3atac_Dendritic cell__s3atac_CD14+ monocyte__s3atac_CD16+ monocyte__s3atac
[2m[36m(markers_ray pid=1306570)[0m 2023-01-30 14:11:41,544 cisTopic     INFO     Formatting data for Natural killer cell__s3atac_VS_CD4+ T cell__s3atac_B cell__s3atac_Cytotoxic T cell__s3atac_Dendritic cell__s3atac_CD14+ monocyte__s3atac_CD16+ monocyte__s3atac
[2m[36m(markers_ray pid=1306574)[0m 2023-01-30 14:11:42,739 cisTopic     INFO     Computing p-value for Cytotoxic T cell__s3atac_VS_CD4+ T cell__s3atac_B cell__s3atac_Natural killer cell__s3atac_Dendrit

[2m[36m(raylet)[0m Spilled 14148 MiB, 2 objects, write throughput 824 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.
[2m[36m(raylet)[0m Spilled 28297 MiB, 3 objects, write throughput 1542 MiB/s.
[2m[36m(raylet)[0m Spilled 42445 MiB, 5 objects, write throughput 2024 MiB/s.
[2m[36m(raylet)[0m Spilled 56594 MiB, 7 objects, write throughput 2468 MiB/s.


[2m[36m(markers_ray pid=1306574)[0m 2023-01-30 14:12:21,699 cisTopic     INFO     Computing log2FC for Cytotoxic T cell__s3atac_VS_CD4+ T cell__s3atac_B cell__s3atac_Natural killer cell__s3atac_Dendritic cell__s3atac_CD14+ monocyte__s3atac_CD16+ monocyte__s3atac
[2m[36m(markers_ray pid=1306574)[0m 2023-01-30 14:12:23,327 cisTopic     INFO     Cytotoxic T cell__s3atac_VS_CD4+ T cell__s3atac_B cell__s3atac_Natural killer cell__s3atac_Dendritic cell__s3atac_CD14+ monocyte__s3atac_CD16+ monocyte__s3atac done!
[2m[36m(markers_ray pid=1306574)[0m 2023-01-30 14:12:23,423 cisTopic     INFO     Formatting data for Natural killer cell__10xmultiome_VS_CD16+ monocyte__10xmultiome_B cell__10xmultiome_CD4+ T cell__10xmultiome_CD14+ monocyte__10xmultiome_Cytotoxic T cell__10xmultiome_Dendritic cell__10xmultiome
[2m[36m(markers_ray pid=1306578)[0m 2023-01-30 14:12:24,681 cisTopic     INFO     Computing log2FC for CD4+ T cell__s3atac_VS_B cell__s3atac_Natural killer cell__s3atac_Cytotoxic T

[2m[36m(raylet)[0m Spilled 70744 MiB, 11 objects, write throughput 1832 MiB/s.
[2m[36m(raylet)[0m Spilled 84893 MiB, 14 objects, write throughput 2065 MiB/s.
[2m[36m(raylet)[0m Spilled 141492 MiB, 26 objects, write throughput 2498 MiB/s.


[2m[36m(markers_ray pid=1306572)[0m 2023-01-30 14:13:01,538 cisTopic     INFO     Formatting data for Dendritic cell__10xmultiome_VS_CD16+ monocyte__10xmultiome_B cell__10xmultiome_CD4+ T cell__10xmultiome_CD14+ monocyte__10xmultiome_Natural killer cell__10xmultiome_Cytotoxic T cell__10xmultiome
[2m[36m(markers_ray pid=1306571)[0m 2023-01-30 14:13:01,866 cisTopic     INFO     Formatting data for Cytotoxic T cell__10xv11_VS_B cell__10xv11_Dendritic cell__10xv11_CD16+ monocyte__10xv11_CD4+ T cell__10xv11_CD14+ monocyte__10xv11_Natural killer cell__10xv11
[2m[36m(markers_ray pid=1306575)[0m 2023-01-30 14:13:02,219 cisTopic     INFO     Formatting data for CD4+ T cell__10xv11_VS_B cell__10xv11_Dendritic cell__10xv11_CD16+ monocyte__10xv11_CD14+ monocyte__10xv11_Natural killer cell__10xv11_Cytotoxic T cell__10xv11
[2m[36m(markers_ray pid=1306573)[0m 2023-01-30 14:13:02,554 cisTopic     INFO     Formatting data for Natural killer cell__10xv11_VS_B cell__10xv11_Dendritic cell__10x

[2m[36m(raylet)[0m Spilled 297129 MiB, 45 objects, write throughput 2788 MiB/s.


[2m[36m(markers_ray pid=1306576)[0m 2023-01-30 14:13:50,257 cisTopic     INFO     Computing p-value for Cytotoxic T cell__hydrop_VS_B cell__hydrop_CD4+ T cell__hydrop_CD14+ monocyte__hydrop_CD16+ monocyte__hydrop_Dendritic cell__hydrop_Natural killer cell__hydrop
[2m[36m(markers_ray pid=1306573)[0m 2023-01-30 14:13:50,399 cisTopic     INFO     Natural killer cell__10xv11_VS_B cell__10xv11_Dendritic cell__10xv11_CD16+ monocyte__10xv11_CD4+ T cell__10xv11_CD14+ monocyte__10xv11_Cytotoxic T cell__10xv11 done!
[2m[36m(markers_ray pid=1306572)[0m 2023-01-30 14:13:51,053 cisTopic     INFO     Dendritic cell__10xmultiome_VS_CD16+ monocyte__10xmultiome_B cell__10xmultiome_CD4+ T cell__10xmultiome_CD14+ monocyte__10xmultiome_Natural killer cell__10xmultiome_Cytotoxic T cell__10xmultiome done!
[2m[36m(markers_ray pid=1306574)[0m 2023-01-30 14:13:51,513 cisTopic     INFO     Computing log2FC for B cell__10xv11_VS_Dendritic cell__10xv11_CD16+ monocyte__10xv11_CD4+ T cell__10xv11_CD14+ 

[2m[36m(raylet)[0m Spilled 537676 MiB, 101 objects, write throughput 2738 MiB/s.


[2m[36m(markers_ray pid=1306578)[0m 2023-01-30 14:15:21,140 cisTopic     INFO     Computing log2FC for B cell__10xv2_VS_Natural killer cell__10xv2_CD4+ T cell__10xv2_Cytotoxic T cell__10xv2_CD14+ monocyte__10xv2_CD16+ monocyte__10xv2_Dendritic cell__10xv2
[2m[36m(markers_ray pid=1306576)[0m 2023-01-30 14:15:21,511 cisTopic     INFO     CD14+ monocyte__10xv2_VS_Natural killer cell__10xv2_B cell__10xv2_CD4+ T cell__10xv2_Cytotoxic T cell__10xv2_CD16+ monocyte__10xv2_Dendritic cell__10xv2 done!
[2m[36m(markers_ray pid=1306570)[0m 2023-01-30 14:15:21,829 cisTopic     INFO     Computing log2FC for CD16+ monocyte__10xv2_VS_Natural killer cell__10xv2_B cell__10xv2_CD4+ T cell__10xv2_Cytotoxic T cell__10xv2_CD14+ monocyte__10xv2_Dendritic cell__10xv2
[2m[36m(markers_ray pid=1306578)[0m 2023-01-30 14:15:22,793 cisTopic     INFO     B cell__10xv2_VS_Natural killer cell__10xv2_CD4+ T cell__10xv2_Cytotoxic T cell__10xv2_CD14+ monocyte__10xv2_CD16+ monocyte__10xv2_Dendritic cell__10xv2 

<IPython.core.display.Javascript object>

In [54]:
if not os.path.exists(f"harmony_consensus_cell_type__mega_dars_bytech/{sample}"):
    os.mkdir(f"harmony_consensus_cell_type__mega_dars_bytech/{sample}")

with open(
    f"harmony_consensus_cell_type__mega_dars_bytech/{sample}/{sample}__DARs_dict_1-2xfoldchange.pkl",
    "wb",
) as f:
    pickle.dump(markers_dict, f, protocol=4)

<IPython.core.display.Javascript object>

In [56]:
markers_dict["Cytotoxic T cell__s3atac_VS_CD4+ T cell__s3atac_B cell__s3atac_Natural killer cell__s3atac_Dendritic cell__s3atac_CD14+ monocyte__s3atac_CD16+ monocyte__s3atac"].index.tolist()

['chr1:160738300-160738800',
 'chrX:107480215-107480715',
 'chr1:26384028-26384528',
 'chr2:144328951-144329451',
 'chr19:5086203-5086703',
 'chr16:15389801-15390301',
 'chr1:91753815-91754315',
 'chr1:18828143-18828643',
 'chr16:82452186-82452686',
 'chr22:21780254-21780754',
 'chr8:58961776-58962276',
 'chr1:227816992-227817492',
 'chr17:30655938-30656438',
 'chr7:3119560-3120060',
 'chr1:235885848-235886348',
 'chr4:230967-231467',
 'chr11:36408389-36408889',
 'chr17:79556257-79556757',
 'chr16:31414645-31415145',
 'chr20:35080132-35080632',
 'chr15:49036812-49037312',
 'chr2:159145665-159146165',
 'chr6:45895539-45896039',
 'chr11:76495033-76495533',
 'chr1:235697461-235697961',
 'chr2:88269676-88270176',
 'chr12:10307574-10308074',
 'chr2:113679433-113679933',
 'chr16:89020264-89020764',
 'chr20:51409301-51409801',
 'chr6:144304428-144304928',
 'chr12:10602582-10603082',
 'chr3:106033652-106034152',
 'chr9:6439216-6439716',
 'chr10:72941683-72942183',
 'chr8:101967771-101968271',


<IPython.core.display.Javascript object>

In [57]:
for contrast in markers_dict.keys():
    markers = markers_dict[contrast].index.tolist()
    name = contrast.split('_VS')[0].replace(' ', '_')
    df = pd.DataFrame(markers)
    if len(df) == 0:
        print(f"no DARs found for {contrast} in {sample}")
    else:
        df[[0, 1]] = df[0].str.split(":", expand=True)
        df[[1, 2]] = df[1].str.split("-", expand=True)
        df[3] = markers_dict[contrast].index.tolist()
        df[3] = contrast.replace(" ", "_")  # + '_' + df[3].astype(str)
        score = markers_dict[contrast]["Log2FC"]
        score = score.reset_index(drop=True)
        df[4] = score
        df[5] = "."
        pval = markers_dict[contrast]["Adjusted_pval"]
        pval = pval.reset_index(drop=True)
        df[6] = pval
        

        df.to_csv(
            f'harmony_consensus_cell_type__mega_dars_bytech/{sample}/{sample}__{name}__1-2xfoldchange_DARs.bed',
            sep="\t",
            header=False,
            index=False,
        )

<IPython.core.display.Javascript object>