# pycisTopic analysis

Full dataset, using consensus peak regions.

In [2]:
import pycisTopic

%load_ext nb_black
import warnings

warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")
import pickle
import pandas as pd
import os

<IPython.core.display.Javascript object>

In [3]:
!pwd

/lustre1/project/stg_00090/scatac_benchmark/fixedcells_8_individual_tech_cistopic_objects


<IPython.core.display.Javascript object>

In [4]:
wdir = "/lustre1/project/stg_00090/scatac_benchmark/fixedcells_8_individual_tech_cistopic_objects"
os.chdir(wdir)

<IPython.core.display.Javascript object>

In [5]:
# create output directory:
f_final_dir = os.path.join(wdir, "downstream_analysis")
if not os.path.exists(f_final_dir):
    os.makedirs(f_final_dir)

<IPython.core.display.Javascript object>

In [6]:
import glob

<IPython.core.display.Javascript object>

## Save/load cisTopic objects

In [18]:
cto_consensus_paths = sorted(glob.glob(f"cistopic_objects/*topics.pkl"))
cistopic_obj_path_dict = {
    x.split("/")[-1].split(f"__")[0]: x for x in cto_consensus_paths
}
cistopic_obj_path_dict

{'10xmultiome_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xmultiome_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_19topics.pkl',
 '10xv11_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xv11_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_26topics.pkl',
 '10xv1_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xv1_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_21topics.pkl',
 '10xv2_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xv2_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_25topics.pkl',
 'ddseq_celltypefair_1.FIXEDCELLS': 'cistopic_objects/ddseq_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_17topics.pkl',
 'hydrop_celltypefair_1.FIXEDCELLS': 'cistopic_objects/hydrop_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_22topics.pkl',
 'mtscatac_celltypefair_1.FIXEDCELLS': 'cistopic_objects/mtscatac_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_25topics.pkl',
 's3atac_celltypefa

<IPython.core.display.Javascript object>

## Topic binarization & QC

In [19]:
from pycisTopic.topic_binarization import binarize_topics
from pycisTopic.topic_qc import compute_topic_metrics, plot_topic_qc
from pycisTopic.utils import fig2img
import matplotlib.pyplot as plt
from pycisTopic.topic_qc import topic_annotation
from pycisTopic.diff_features import (
    impute_accessibility,
    normalize_scores,
    find_highly_variable_features,
    find_diff_features,
)
from pycisTopic.clust_vis import plot_imputed_features
import numpy as np
import copy

<IPython.core.display.Javascript object>

### Binarize the topic-region distributions

In [20]:
for sample in cistopic_obj_path_dict.keys():
    # for sample  in ['SAN_10xmultiome_1.FIXEDCELLS']:
    path = cistopic_obj_path_dict[sample]
    if not os.path.exists(f"downstream_analysis/DARs/{sample}"):
        if os.path.isfile(path):
            print(f"Loading {path}")
            with open(path, "rb") as f:
                cto = pickle.load(f)

            cto.cell_names = cto.cell_data.index
            if [x.split("-")[0] for x in cto.selected_model.cell_topic.columns] == [
                x.split("___")[0] for x in list(cto.cell_names)
            ]:
                cto.selected_model.cell_topic.columns = list(cto.cell_names)

            region_bin_topics = binarize_topics(
                cto,
                method="otsu",
                ntop=3000,
                plot=True,
                num_columns=6,
                save=f"plots_qc/{sample}__topic_region_binarized.png",
            )

            with open(
                f"downstream_analysis/binarized_topics/{sample}__topic_region_binarized.pkl",
                "wb",
            ) as f:
                pickle.dump(region_bin_topics, f, protocol=4)

            binarized_cell_topics = binarize_topics(
                cto,
                target="cell",
                method="li",
                plot=True,
                num_columns=5,
                nbins=100,
                save=f"plots_qc/{sample}__cells_topic_binarized.png",
            )

            with open(
                f"downstream_analysis/binarized_topics/{sample}__cells_topic_binarized.pkl",
                "wb",
            ) as f:
                pickle.dump(region_bin_topics, f, protocol=4)

            topic_qc_metrics = compute_topic_metrics(cto)

            with open(
                f"downstream_analysis/binarized_topics/{sample}__topic_qc.pkl", "wb"
            ) as f:
                pickle.dump(topic_qc_metrics, f, protocol=4)

            fig_dict = {
                "CoherenceVSAssignments": plot_topic_qc(
                    topic_qc_metrics,
                    var_x="Coherence",
                    var_y="Log10_Assignments",
                    var_color="Gini_index",
                    plot=False,
                    return_fig=True,
                ),
                "AssignmentsVSCells_in_bin": plot_topic_qc(
                    topic_qc_metrics,
                    var_x="Log10_Assignments",
                    var_y="Cells_in_binarized_topic",
                    var_color="Gini_index",
                    plot=False,
                    return_fig=True,
                ),
                "CoherenceVSCells_in_bin": plot_topic_qc(
                    topic_qc_metrics,
                    var_x="Coherence",
                    var_y="Cells_in_binarized_topic",
                    var_color="Gini_index",
                    plot=False,
                    return_fig=True,
                ),
                "CoherenceVSRegions_in_bin": plot_topic_qc(
                    topic_qc_metrics,
                    var_x="Coherence",
                    var_y="Regions_in_binarized_topic",
                    var_color="Gini_index",
                    plot=False,
                    return_fig=True,
                ),
                "CoherenceVSMarginal_dist": plot_topic_qc(
                    topic_qc_metrics,
                    var_x="Coherence",
                    var_y="Marginal_topic_dist",
                    var_color="Gini_index",
                    plot=False,
                    return_fig=True,
                ),
                "CoherenceVSGini_index": plot_topic_qc(
                    topic_qc_metrics,
                    var_x="Coherence",
                    var_y="Gini_index",
                    var_color="Gini_index",
                    plot=False,
                    return_fig=True,
                ),
            }

            fig = plt.figure(figsize=(40, 43))
            i = 1
            for fig_ in fig_dict.keys():
                plt.subplot(2, 3, i)
                img = fig2img(
                    fig_dict[fig_]
                )  # To convert figures to png to plot together, see .utils.py. This converts the figure to png.
                plt.imshow(img)
                plt.axis("off")
                i += 1
            plt.subplots_adjust(wspace=0, hspace=-0.70)
            # plt.tight_layout()
            plt.show()
            plt.savefig(
                f"plots_qc/{sample}__topic_qc_metrics.png", facecolor="white", dpi=150
            )

            topic_annot = topic_annotation(
                cto,
                annot_var="harmony_consensus_cell_type__mega",
                binarized_cell_topic=binarized_cell_topics,
                general_topic_thr=0.2,
            )
            topic_qc_metrics = pd.concat(
                [
                    topic_annot[
                        [
                            "harmony_consensus_cell_type__mega",
                            "Ratio_cells_in_topic",
                            "Ratio_group_in_population",
                        ]
                    ],
                    topic_qc_metrics,
                ],
                axis=1,
            )

            imputed_acc_obj = impute_accessibility(
                cto, selected_cells=None, selected_regions=None, scale_factor=10**6
            )

            normalized_imputed_acc_obj = normalize_scores(
                imputed_acc_obj, scale_factor=10**4
            )

            with open(
                f"downstream_analysis/imputed_acc_objs/{sample}__imputed_acc_obs.pkl",
                "wb",
            ) as f:
                pickle.dump(normalized_imputed_acc_obj, f, protocol=4)

            variable_regions = find_highly_variable_features(
                normalized_imputed_acc_obj,
                min_disp=0.05,
                min_mean=0.0125,
                max_mean=3,
                max_disp=np.inf,
                n_bins=20,
                n_top_features=None,
                plot=True,
                save=f"plots_qc/{sample}__HVR.png",
            )

            print(f"Found {len(variable_regions)} variable regions")
            with open(f"downstream_analysis/HVRs/{sample}__HVRs.bed", "w") as f:
                for line in [
                    x.replace(":", "\t").replace("-", "\t") for x in variable_regions
                ]:
                    f.write(f"{line}\n")

            with open(f"downstream_analysis/HVRs/{sample}__HVRs.pkl", "wb") as f:
                pickle.dump(variable_regions, f, protocol=4)

            if not os.path.exists(f"downstream_analysis/DARs/{sample}"):
                os.mkdir(f"downstream_analysis/DARs/{sample}")

            markers_dict = find_diff_features(
                cto,
                normalized_imputed_acc_obj,
                variable="harmony_consensus_cell_type__mega",
                var_features=variable_regions,
                contrasts=None,
                adjpval_thr=0.05,
                log2fc_thr=np.log2(1.5),
                n_cpu=16,
            )
            with open(
                f"downstream_analysis/DARs/{sample}/{sample}__DARs_dict.pkl", "wb"
            ) as f:
                pickle.dump(markers_dict, f, protocol=4)

            for cell_type in markers_dict.keys():
                markers = markers_dict[cell_type].index.tolist()
                df = pd.DataFrame(markers)
                if len(df) == 0:
                    print(f"no DARs found for {cell_type} in {sample}")
                else:
                    df[[0, 1]] = df[0].str.split(":", expand=True)
                    df[[1, 2]] = df[1].str.split("-", expand=True)
                    df[3] = markers_dict[cell_type].index.tolist()
                    df[3] = cell_type.replace(" ", "_")  # + '_' + df[3].astype(str)
                    score = markers_dict[cell_type]["Log2FC"]
                    score = score.reset_index(drop=True)
                    df[4] = score
                    df[5] = "."
                    pval = markers_dict[cell_type]["Adjusted_pval"]
                    pval = pval.reset_index(drop=True)
                    df[6] = pval
                    name = cell_type.replace(" ", "_")
                    # name = name.replace('/', '-')
                    if not os.path.exists(f"downstream_analysis/DARs/{sample}"):
                        os.mkdir(f"downstream_analysis/DARs/{sample}")

                    df.to_csv(
                        f'downstream_analysis/DARs/{sample}/{sample}__{cell_type.replace(" ", "_")}__DARs.bed',
                        sep="\t",
                        header=False,
                        index=False,
                    )
                    df[0:2000].to_csv(
                        f'downstream_analysis/DARs/{sample}/{sample}__{cell_type.replace(" ", "_")}__DARs.TOP2k.bed',
                        sep="\t",
                        header=False,
                        index=False,
                    )

        else:
            print(f"{path} does not exist!")
    else:
        print(f"downstream_analysis/DARs/{sample} already exists!")

downstream_analysis/DARs/10xmultiome_celltypefair_1.FIXEDCELLS already exists!
downstream_analysis/DARs/10xv11_celltypefair_1.FIXEDCELLS already exists!
downstream_analysis/DARs/10xv1_celltypefair_1.FIXEDCELLS already exists!
downstream_analysis/DARs/10xv2_celltypefair_1.FIXEDCELLS already exists!
downstream_analysis/DARs/ddseq_celltypefair_1.FIXEDCELLS already exists!
downstream_analysis/DARs/hydrop_celltypefair_1.FIXEDCELLS already exists!
downstream_analysis/DARs/mtscatac_celltypefair_1.FIXEDCELLS already exists!
downstream_analysis/DARs/s3atac_celltypefair_1.FIXEDCELLS already exists!


<IPython.core.display.Javascript object>

# Calculate DARs between male and female

In [1]:
normalized_imputed_acc_obj_path_dict = {
    x.split("/")[-1].split(f"__")[0]: x
    for x in sorted(
        glob.glob(f"downstream_analysis/imputed_acc_objs/*imputed_acc_obs.pkl")
    )
}
normalized_imputed_acc_obj_path_dict

NameError: name 'glob' is not defined

In [22]:
hvr_path_dict = {
    x.split("/")[-1].split(f"__")[0]: x
    for x in sorted(glob.glob(f"downstream_analysis/HVRs/*bed"))
}
hvr_path_dict

{'10xmultiome_celltypefair_1.FIXEDCELLS': 'downstream_analysis/HVRs/10xmultiome_celltypefair_1.FIXEDCELLS__HVRs.bed',
 '10xv11_celltypefair_1.FIXEDCELLS': 'downstream_analysis/HVRs/10xv11_celltypefair_1.FIXEDCELLS__HVRs.bed',
 '10xv1_celltypefair_1.FIXEDCELLS': 'downstream_analysis/HVRs/10xv1_celltypefair_1.FIXEDCELLS__HVRs.bed',
 '10xv2_celltypefair_1.FIXEDCELLS': 'downstream_analysis/HVRs/10xv2_celltypefair_1.FIXEDCELLS__HVRs.bed',
 'ddseq_celltypefair_1.FIXEDCELLS': 'downstream_analysis/HVRs/ddseq_celltypefair_1.FIXEDCELLS__HVRs.bed',
 'hydrop_celltypefair_1.FIXEDCELLS': 'downstream_analysis/HVRs/hydrop_celltypefair_1.FIXEDCELLS__HVRs.bed',
 'mtscatac_celltypefair_1.FIXEDCELLS': 'downstream_analysis/HVRs/mtscatac_celltypefair_1.FIXEDCELLS__HVRs.bed',
 's3atac_celltypefair_1.FIXEDCELLS': 'downstream_analysis/HVRs/s3atac_celltypefair_1.FIXEDCELLS__HVRs.bed'}

<IPython.core.display.Javascript object>

In [23]:
cto_consensus_paths = sorted(glob.glob(f"cistopic_objects/*topics.pkl"))
cistopic_obj_path_dict = {
    x.split("/")[-1].split(f"__")[0]: x for x in cto_consensus_paths
}
cistopic_obj_path_dict

{'10xmultiome_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xmultiome_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_19topics.pkl',
 '10xv11_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xv11_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_26topics.pkl',
 '10xv1_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xv1_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_21topics.pkl',
 '10xv2_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xv2_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_25topics.pkl',
 'ddseq_celltypefair_1.FIXEDCELLS': 'cistopic_objects/ddseq_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_17topics.pkl',
 'hydrop_celltypefair_1.FIXEDCELLS': 'cistopic_objects/hydrop_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_22topics.pkl',
 'mtscatac_celltypefair_1.FIXEDCELLS': 'cistopic_objects/mtscatac_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_25topics.pkl',
 's3atac_celltypefa

<IPython.core.display.Javascript object>

In [26]:
for sample in cistopic_obj_path_dict.keys():
    with open(
        cistopic_obj_path_dict[sample],
        "rb",
    ) as f:
        cto = pickle.load(f)

    acc_path = normalized_imputed_acc_obj_path_dict[sample]

    if os.path.isfile(acc_path):
        print(f"Loading {acc_path}")
        with open(acc_path, "rb") as f:
            normalized_imputed_acc_obj = pickle.load(f)

    hvr_pkl_path = f"downstream_analysis/HVRs/{sample}__HVRs.pkl"

    if os.path.exists(hvr_pkl_path):
        print(f"Loading {hvr_pkl_path}")
        with open(hvr_pkl_path, "rb") as f:
            variable_regions = pickle.load(f)
    else:
        print("HVRs do not exist!")

    cto.cell_data["fmx_sample_harmony_consensus_cell_type__mega"] = (
        cto.cell_data["fmx_sample"] + "_" + cto.cell_data["harmony_consensus_cell_type__mega"]
    )

    contrasts = []
    for cell_type in cto.cell_data["harmony_consensus_cell_type__mega"].unique():
        print(cell_type)
        ct_a = "sampleA_" + cell_type
        ct_b = "sampleB_" + cell_type
        contrast = [[[ct_a], [ct_b]], [[ct_b], [ct_a]]]
        contrasts = contrasts + contrast

    markers_dict = find_diff_features(
        cto,
        normalized_imputed_acc_obj,
        variable="fmx_sample_harmony_consensus_cell_type__mega",
        var_features=variable_regions,
        contrasts=contrasts,
        adjpval_thr=0.05,
        log2fc_thr=np.log2(1.2),
        n_cpu=10,
    )

    if not os.path.exists(f"harmony_consensus_cell_type__mega_male_female_DARs/{sample}"):
        os.mkdir(f"harmony_consensus_cell_type__mega_male_female_DARs/{sample}")

    with open(
        f"harmony_consensus_cell_type__mega_male_female_DARs/{sample}/{sample}__DARs_dict_1-2xfoldchange.pkl",
        "wb",
    ) as f:
        pickle.dump(markers_dict, f, protocol=4)

    for contrast in markers_dict.keys():
        markers = markers_dict[contrast].index.tolist()
        df = pd.DataFrame(markers)
        if len(df) == 0:
            print(f"no DARs found for {contrast} in {sample}")
        else:
            df[[0, 1]] = df[0].str.split(":", expand=True)
            df[[1, 2]] = df[1].str.split("-", expand=True)
            df[3] = markers_dict[contrast].index.tolist()
            df[3] = contrast.replace(" ", "_")  # + '_' + df[3].astype(str)
            score = markers_dict[contrast]["Log2FC"]
            score = score.reset_index(drop=True)
            df[4] = score
            df[5] = "."
            pval = markers_dict[contrast]["Adjusted_pval"]
            pval = pval.reset_index(drop=True)
            df[6] = pval
            name = contrast.replace(" ", "_")

            df.to_csv(
                f'harmony_consensus_cell_type__mega_male_female_DARs/{sample}/{sample}__{contrast.replace(" ", "_")}__1-2xfoldchange_DARs.bed',
                sep="\t",
                header=False,
                index=False,
            )

Loading downstream_analysis/imputed_acc_objs/10xmultiome_celltypefair_1.FIXEDCELLS__imputed_acc_obs.pkl
Loading downstream_analysis/HVRs/10xmultiome_celltypefair_1.FIXEDCELLS__HVRs.pkl
Cytotoxic T cell
CD4+ T cell
Natural killer cell
CD14+ monocyte
B cell
CD16+ monocyte
Dendritic cell


2023-02-15 15:50:38,756	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(markers_ray pid=996606)[0m 2023-02-15 15:50:42,920 cisTopic     INFO     Formatting data for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=996607)[0m 2023-02-15 15:50:43,459 cisTopic     INFO     Formatting data for sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell
[2m[36m(markers_ray pid=996604)[0m 2023-02-15 15:50:43,768 cisTopic     INFO     Formatting data for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=996601)[0m 2023-02-15 15:50:44,253 cisTopic     INFO     Formatting data for sampleB_CD4+ T cell_VS_sampleA_CD4+ T cell
[2m[36m(markers_ray pid=996606)[0m 2023-02-15 15:50:44,398 cisTopic     INFO     Computing p-value for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=996604)[0m 2023-02-15 15:50:44,624 cisTopic     INFO     Computing p-value for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=996598)[0m 2023-02-15 15:50:44,701 cisTopic     INFO     

2023-02-15 15:51:30,855	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(markers_ray pid=997288)[0m 2023-02-15 15:51:34,906 cisTopic     INFO     Formatting data for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=997292)[0m 2023-02-15 15:51:35,289 cisTopic     INFO     Formatting data for sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell
[2m[36m(markers_ray pid=997296)[0m 2023-02-15 15:51:35,649 cisTopic     INFO     Formatting data for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=997288)[0m 2023-02-15 15:51:35,992 cisTopic     INFO     Computing p-value for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=997294)[0m 2023-02-15 15:51:36,046 cisTopic     INFO     Formatting data for sampleB_CD4+ T cell_VS_sampleA_CD4+ T cell
[2m[36m(markers_ray pid=997296)[0m 2023-02-15 15:51:36,202 cisTopic     INFO     Computing p-value for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=997287)[0m 2023-02-15 15:51:36,282 cisTopic     INFO     

2023-02-15 15:52:22,993	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(markers_ray pid=998194)[0m 2023-02-15 15:52:27,185 cisTopic     INFO     Formatting data for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=998186)[0m 2023-02-15 15:52:27,459 cisTopic     INFO     Formatting data for sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell
[2m[36m(markers_ray pid=998188)[0m 2023-02-15 15:52:27,867 cisTopic     INFO     Formatting data for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=998193)[0m 2023-02-15 15:52:28,239 cisTopic     INFO     Formatting data for sampleB_CD4+ T cell_VS_sampleA_CD4+ T cell
[2m[36m(markers_ray pid=998189)[0m 2023-02-15 15:52:28,469 cisTopic     INFO     Formatting data for sampleA_Natural killer cell_VS_sampleB_Natural killer cell
[2m[36m(markers_ray pid=998194)[0m 2023-02-15 15:52:28,634 cisTopic     INFO     Computing p-value for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=998189)[0m 2023-02-15 15:52:28,667 cisTopic

2023-02-15 15:53:13,205	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(markers_ray pid=998909)[0m 2023-02-15 15:53:17,198 cisTopic     INFO     Formatting data for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=998904)[0m 2023-02-15 15:53:17,629 cisTopic     INFO     Formatting data for sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell
[2m[36m(markers_ray pid=998906)[0m 2023-02-15 15:53:17,856 cisTopic     INFO     Formatting data for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=998909)[0m 2023-02-15 15:53:17,933 cisTopic     INFO     Computing p-value for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=998906)[0m 2023-02-15 15:53:18,270 cisTopic     INFO     Computing p-value for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=998910)[0m 2023-02-15 15:53:18,361 cisTopic     INFO     Formatting data for sampleB_CD4+ T cell_VS_sampleA_CD4+ T cell
[2m[36m(markers_ray pid=998904)[0m 2023-02-15 15:53:18,404 cisTopic     INFO     

2023-02-15 15:54:02,852	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(markers_ray pid=999597)[0m 2023-02-15 15:54:06,753 cisTopic     INFO     Formatting data for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=999605)[0m 2023-02-15 15:54:07,012 cisTopic     INFO     Formatting data for sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell
[2m[36m(markers_ray pid=999597)[0m 2023-02-15 15:54:07,104 cisTopic     INFO     Computing p-value for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=999600)[0m 2023-02-15 15:54:07,336 cisTopic     INFO     Formatting data for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=999605)[0m 2023-02-15 15:54:07,421 cisTopic     INFO     Computing p-value for sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell
[2m[36m(markers_ray pid=999600)[0m 2023-02-15 15:54:07,551 cisTopic     INFO     Computing p-value for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=999599)[0m 2023-02-15 15:54:07,627 cisTopic  

2023-02-15 15:54:41,135	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(markers_ray pid=1000257)[0m 2023-02-15 15:54:45,178 cisTopic     INFO     Formatting data for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=1000263)[0m 2023-02-15 15:54:45,410 cisTopic     INFO     Formatting data for sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell
[2m[36m(markers_ray pid=1000258)[0m 2023-02-15 15:54:45,818 cisTopic     INFO     Formatting data for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=1000262)[0m 2023-02-15 15:54:46,118 cisTopic     INFO     Formatting data for sampleB_CD4+ T cell_VS_sampleA_CD4+ T cell
[2m[36m(markers_ray pid=1000257)[0m 2023-02-15 15:54:46,459 cisTopic     INFO     Computing p-value for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=1000256)[0m 2023-02-15 15:54:46,412 cisTopic     INFO     Formatting data for sampleA_Natural killer cell_VS_sampleB_Natural killer cell
[2m[36m(markers_ray pid=1000258)[0m 2023-02-15 15:54:46,546 c

2023-02-15 15:55:29,834	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(markers_ray pid=1001590)[0m 2023-02-15 15:55:33,826 cisTopic     INFO     Formatting data for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=1001586)[0m 2023-02-15 15:55:34,340 cisTopic     INFO     Formatting data for sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell
[2m[36m(markers_ray pid=1001593)[0m 2023-02-15 15:55:34,675 cisTopic     INFO     Formatting data for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=1001589)[0m 2023-02-15 15:55:35,120 cisTopic     INFO     Formatting data for sampleB_CD4+ T cell_VS_sampleA_CD4+ T cell
[2m[36m(markers_ray pid=1001592)[0m 2023-02-15 15:55:35,482 cisTopic     INFO     Formatting data for sampleA_Natural killer cell_VS_sampleB_Natural killer cell
[2m[36m(markers_ray pid=1001592)[0m 2023-02-15 15:55:35,735 cisTopic     INFO     Computing p-value for sampleA_Natural killer cell_VS_sampleB_Natural killer cell
[2m[36m(markers_ray pid=1001593)[0m 2023-02-15 15:55:35

2023-02-15 15:56:32,973	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(markers_ray pid=1002266)[0m 2023-02-15 15:56:37,359 cisTopic     INFO     Formatting data for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=1002269)[0m 2023-02-15 15:56:37,851 cisTopic     INFO     Formatting data for sampleB_Cytotoxic T cell_VS_sampleA_Cytotoxic T cell
[2m[36m(markers_ray pid=1002272)[0m 2023-02-15 15:56:38,408 cisTopic     INFO     Formatting data for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=1002266)[0m 2023-02-15 15:56:38,644 cisTopic     INFO     Computing p-value for sampleA_Cytotoxic T cell_VS_sampleB_Cytotoxic T cell
[2m[36m(markers_ray pid=1002272)[0m 2023-02-15 15:56:39,024 cisTopic     INFO     Computing p-value for sampleA_CD4+ T cell_VS_sampleB_CD4+ T cell
[2m[36m(markers_ray pid=1002267)[0m 2023-02-15 15:56:39,023 cisTopic     INFO     Formatting data for sampleB_CD4+ T cell_VS_sampleA_CD4+ T cell
[2m[36m(markers_ray pid=1002269)[0m 2023-02-15 15:56:39,252 cisTopic     IN

<IPython.core.display.Javascript object>

In [4]:
import glob

<IPython.core.display.Javascript object>

In [6]:
for file_path in sorted(glob.glob("downstream_analysis/DARs/*/*__DARs.bed")):
    print(file_path)
    df = pd.read_csv(file_path, sep="\t", header=None)
    df[3] = df[3] + "_" + [str(x) for x in pd.DataFrame(range(len(df)))[0]]
    df.to_csv(
        file_path.replace(".bed", ".unique.bed"), header=None, index=False, sep="\t"
    )
    df[:2000].to_csv(
        file_path.replace(".bed", ".unique.TOP2K.bed"), header=None, index=False, sep="\t"
    )

downstream_analysis/DARs/10xmultiome_celltypefair_1.FIXEDCELLS/10xmultiome_celltypefair_1.FIXEDCELLS__B_cell__DARs.bed
downstream_analysis/DARs/10xmultiome_celltypefair_1.FIXEDCELLS/10xmultiome_celltypefair_1.FIXEDCELLS__CD14+_monocyte__DARs.bed
downstream_analysis/DARs/10xmultiome_celltypefair_1.FIXEDCELLS/10xmultiome_celltypefair_1.FIXEDCELLS__CD16+_monocyte__DARs.bed
downstream_analysis/DARs/10xmultiome_celltypefair_1.FIXEDCELLS/10xmultiome_celltypefair_1.FIXEDCELLS__CD4+_T_cell__DARs.bed
downstream_analysis/DARs/10xmultiome_celltypefair_1.FIXEDCELLS/10xmultiome_celltypefair_1.FIXEDCELLS__Cytotoxic_T_cell__DARs.bed
downstream_analysis/DARs/10xmultiome_celltypefair_1.FIXEDCELLS/10xmultiome_celltypefair_1.FIXEDCELLS__Dendritic_cell__DARs.bed
downstream_analysis/DARs/10xmultiome_celltypefair_1.FIXEDCELLS/10xmultiome_celltypefair_1.FIXEDCELLS__Natural_killer_cell__DARs.bed
downstream_analysis/DARs/10xv11_celltypefair_1.FIXEDCELLS/10xv11_celltypefair_1.FIXEDCELLS__B_cell__DARs.bed
downst

<IPython.core.display.Javascript object>

In [7]:
for file_path in sorted(glob.glob("harmony_consensus_cell_type__mega_male_female_DARs/10xmultiome_celltypefair_1.FIXEDCELLS/*/*__DARs.bed")):
    print(file_path)
    df = pd.read_csv(file_path, sep="\t", header=None)
    df[3] = df[3] + "_" + [str(x) for x in pd.DataFrame(range(len(df)))[0]]
    df.to_csv(
        file_path.replace(".bed", ".unique.bed"), header=None, index=False, sep="\t"
    )

<IPython.core.display.Javascript object>