In [1]:
from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling
import pyranges as pr
import requests
import os
import pandas as pd
import glob
import pickle

In [2]:
# Load the lab_black IPython extension for this kernel session
# (presumably auto-formats executed cells with black — confirm it is installed).
%load_ext lab_black

In [3]:
# Get chromosome sizes (hg38), cached on disk as "chromsizes.txt".
# Both branches produce a pr.PyRanges with Chromosome/Start/End columns, so
# downstream cells see the same type whether or not the cache already exists.
if not os.path.exists("chromsizes.txt"):
    target_url = (
        "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes"
    )
    # The UCSC file is a headerless two-column TSV: chromosome name, length.
    chromsizes_df = pd.read_csv(
        target_url, sep="\t", header=None, names=["Chromosome", "End"]
    )
    chromsizes_df["Start"] = 0
    chromsizes = pr.PyRanges(chromsizes_df.loc[:, ["Chromosome", "Start", "End"]])
    chromsizes.to_csv("chromsizes.txt")
else:
    # The cache was written by PyRanges.to_csv above (comma-separated, with header).
    chromsizes = pr.PyRanges(pd.read_csv("chromsizes.txt"))

In [4]:
# Collect the consensus cisTopic pickles and key them by sample name,
# i.e. the file name up to (not including) the first "__".
cto_consensus_paths = glob.glob("cistopic_objects/*consensus.pkl")
cto_consensus_paths.sort()
cto_consensus_path_dict = {
    path.rsplit("/", 1)[-1].partition("__")[0]: path for path in cto_consensus_paths
}
cto_consensus_path_dict

{'master_sub_1.FIXEDCELLS': 'cistopic_objects/master_sub_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_20topics.dimreduc.consensus.pkl'}

In [5]:
# Sample name -> path of the merged fragments file for that sample.
# NOTE(review): this is an absolute, cluster-specific path. The export cells
# below pass fragments_path_dict[supersample] as `path_to_fragments`, but
# export_pseudobulk replaces that argument with input_data.path_to_fragments
# when given a CistopicObject — confirm whether this dict is still needed.
fragments_path_dict = {
    "master_sub_1.FIXEDCELLS": "/lustre1/project/stg_00090/scatac_benchmark/fixedcells_4_merged/merged_all_1.fragments.ID.sorted.tsv.gz"
}

In [6]:
import gc
import logging
import os
import re
import subprocess
import sys
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd
import pyBigWig
import pyranges as pr
import ray

from pycisTopic.cistopic_class import *
from pycisTopic.utils import *


def export_pseudobulk(
    input_data: Union["CistopicObject", pd.DataFrame, Dict[str, pd.DataFrame]],
    variable: str,
    chromsizes: Union[pd.DataFrame, pr.PyRanges],
    bed_path: str,
    bigwig_path: str,
    path_to_fragments: Optional[Dict[str, str]] = None,
    sample_id_col: Optional[str] = "sample_id",
    n_cpu: Optional[int] = 1,
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
    use_polars: Optional[bool] = True,
    **kwargs
):
    """
    Create pseudobulks as bed and bigwig from single cell fragments file given a barcode annotation.
    Parameters
    ---------
    input_data: CistopicObject or pd.DataFrame
            A :class:`CistopicObject` containing the specified `variable` as a column in :class:`CistopicObject.cell_data` or a cell metadata
            :class:`pd.DataFrame` containing barcode as rows, containing the specified `variable` as a column (additional columns are
            possible) and a `sample_id` column. Index names must contain the BARCODE (e.g. ATGTCGTC-1), additional tags are possible separating with -
            (e.g. ATGCTGTGCG-1-Sample_1). The levels in the sample_id column must agree with the keys in the path_to_fragments dictionary.
            Alternatively, if the cell metadata contains a column named barcode it will be used instead of the index names.
    variable: str
            A character string indicating the column that will be used to create the different group pseudobulk. It must be included in
            the cell metadata provided as input_data. Spaces are removed and any other run of non-alphanumeric characters is
            replaced by '_' before the values are used as group (and file) names.
    chromsizes: pd.DataFrame or pr.PyRanges
            A data frame or :class:`pr.PyRanges` containing size of each chromosome, containing 'Chromosome', 'Start' and 'End' columns.
    bed_path: str
            Path to folder where the fragments bed files per group will be saved. If None, files will not be generated.
    bigwig_path: str
            Path to folder where the bigwig files per group will be saved. If None, files will not be generated.
    path_to_fragments: dict, optional
            A dictionary of character strings, with sample name as names indicating the path to the fragments file/s from which pseudobulk profiles have to
            be created. If a :class:`CistopicObject` is provided as input it will be ignored, but if a cell metadata :class:`pd.DataFrame` is provided it
            is necessary to provide it. The keys of the dictionary need to match the values in the `sample_id_col` column; other entries are skipped.
    sample_id_col: str, optional
            Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'.
    n_cpu: int, optional
            Number of cores to use. Values above 1 run one ray task per group. Default: 1.
    normalize_bigwig: bool, optional
            Whether bigwig files should be CPM normalized. Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig.
    split_pattern: str, optional
            Pattern to split cell barcode from sample id. Default: ___ .
    use_polars: bool, optional
            Whether to use polars to read fragments files. Default: True.
    **kwargs
            Additional parameters for ray.init()
    Return
    ------
    tuple of (dict, dict)
            `(bw_paths, bed_paths)`: the first dictionary maps each group to its bigwig file path, the second maps each group to its
            bed fragments file path. A dictionary is empty when the corresponding output folder argument is not a string.
    Raises
    ------
    TypeError
            If `input_data` is neither a :class:`CistopicObject` nor a :class:`pd.DataFrame`.
    ValueError
            If no fragments paths are available, if `sample_id_col` or `variable` is missing from the cell metadata, or if none of
            the `path_to_fragments` keys occurs in `sample_id_col`.
    """
    # Create logger
    level = logging.INFO
    log_format = "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger("cisTopic")

    # Resolve the cell metadata and the fragments files to read
    if isinstance(input_data, CistopicObject):
        # A CistopicObject carries its own fragments paths; the argument is ignored.
        path_to_fragments = input_data.path_to_fragments
        cell_data = input_data.cell_data
        missing_paths_msg = "No path_to_fragments in this cisTopic object."
    elif isinstance(input_data, pd.DataFrame):
        cell_data = input_data
        missing_paths_msg = "Please, provide path_to_fragments."
    else:
        raise TypeError(
            "input_data must be a CistopicObject or a pd.DataFrame, got "
            + type(input_data).__name__
        )
    if path_to_fragments is None:
        # Fail here rather than with an AttributeError on `.keys()` further down.
        log.error(missing_paths_msg)
        raise ValueError(missing_paths_msg)

    # Check for the required metadata columns
    if sample_id_col not in cell_data:
        raise ValueError(
            'Please, include a sample identification column (e.g. "sample_id") in your cell metadata!'
        )
    if variable not in cell_data:
        raise ValueError(
            "Column '" + str(variable) + "' was not found in the cell metadata."
        )
    sample_ids = set(cell_data[sample_id_col])

    # Barcodes to keep are the same for every sample, so compute them once.
    if "barcode" in cell_data:
        selected_barcodes = cell_data["barcode"].tolist()
    else:
        selected_barcodes = prepare_tag_cells(cell_data.index.tolist(), split_pattern)

    # Get fragments
    fragments_df_dict = {}
    for sample_id in path_to_fragments.keys():
        if sample_id not in sample_ids:
            log.info(
                "The following path_to_fragments entry is not found in the cell metadata sample_id_col: %s. It will be ignored.",
                sample_id,
            )
            continue
        log.info("Reading fragments from " + path_to_fragments[sample_id])
        fragments_df = read_fragments_from_file(
            path_to_fragments[sample_id], use_polars=use_polars
        ).df
        # Convert to int32 for memory efficiency
        fragments_df["Start"] = np.int32(fragments_df["Start"])
        fragments_df["End"] = np.int32(fragments_df["End"])
        if "Score" in fragments_df:
            fragments_df["Score"] = np.int32(fragments_df["Score"])
        fragments_df_dict[sample_id] = fragments_df.loc[
            fragments_df["Name"].isin(selected_barcodes)
        ]
    if not fragments_df_dict:
        raise ValueError(
            "None of the path_to_fragments keys was found in the '"
            + str(sample_id_col)
            + "' column of the cell metadata."
        )

    # Set groups. `variable` must stay the first column: the per-group export
    # reads the grouping through `iloc[:, 0]`. Copy so that the caller's
    # metadata is not modified by the clean-up below.
    kept_columns = [variable, sample_id_col]
    if "barcode" in cell_data:
        kept_columns.append("barcode")
    cell_data = cell_data.loc[:, kept_columns].copy()
    cell_data[variable] = cell_data[variable].replace(" ", "", regex=True)
    cell_data[variable] = cell_data[variable].replace("[^A-Za-z0-9]+", "_", regex=True)
    groups = sorted(set(cell_data[variable]))
    # Check chromosome sizes
    if isinstance(chromsizes, pd.DataFrame):
        chromsizes = chromsizes.loc[:, ["Chromosome", "Start", "End"]]
        chromsizes = pr.PyRanges(chromsizes)
    # Check that output dir exist and generate output paths
    if isinstance(bed_path, str):
        os.makedirs(bed_path, exist_ok=True)
        bed_paths = {
            group: os.path.join(bed_path, str(group) + ".bed.gz") for group in groups
        }
    else:
        bed_paths = {}
    if isinstance(bigwig_path, str):
        os.makedirs(bigwig_path, exist_ok=True)
        bw_paths = {
            group: os.path.join(bigwig_path, str(group) + ".bw") for group in groups
        }
    else:
        bw_paths = {}
    # Create pseudobulks
    if n_cpu > 1:
        ray.init(num_cpus=n_cpu, **kwargs)
        try:
            # Store the (large) fragments once in the object store and hand the
            # reference to every task, instead of serialising one copy per group.
            fragments_ref = ray.put(fragments_df_dict)
            # NOTE(review): ray.wait only blocks until the tasks finish; it does
            # not re-raise exceptions from failed tasks. Consider ray.get if
            # worker failures should propagate to the caller.
            ray.wait(
                [
                    export_pseudobulk_ray.remote(
                        cell_data,
                        group,
                        fragments_ref,
                        chromsizes,
                        bigwig_path,
                        bed_path,
                        sample_id_col,
                        normalize_bigwig,
                        remove_duplicates,
                        split_pattern,
                    )
                    for group in groups
                ],
                num_returns=len(groups),
            )
        finally:
            # Always release the local ray instance, also when scheduling fails.
            ray.shutdown()
    else:
        for group in groups:
            export_pseudobulk_one_sample(
                cell_data,
                group,
                fragments_df_dict,
                chromsizes,
                bigwig_path,
                bed_path,
                sample_id_col,
                normalize_bigwig,
                remove_duplicates,
                split_pattern,
            )

    return bw_paths, bed_paths


def export_pseudobulk_one_sample(
    cell_data: pd.DataFrame,
    group: str,
    fragments_df_dict: Dict[str, pd.DataFrame],
    chromsizes: pr.PyRanges,
    bigwig_path: str,
    bed_path: str,
    sample_id_col: Optional[str] = "sample_id",
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
):
    """
    Create pseudobulk as bed and bigwig from single cell fragments file given a barcode annotation and a group.
    Parameters
    ---------
    cell_data: pd.DataFrame
            A cell metadata :class:`pd.Dataframe` containing barcodes, their annotation and their sample of origin.
            The grouping variable must be the first column; a 'barcode' column, when present, is used instead of the index.
    group: str
            A character string indicating the group for which pseudobulks will be created.
    fragments_df_dict: dict
            A dictionary containing data frames as values with 'Chromosome', 'Start', 'End', 'Name', and 'Score' as columns; and sample label
            as keys. 'Score' indicates the number of times that a fragments is found assigned to that barcode.
    chromsizes: pr.PyRanges
            A :class:`pr.PyRanges` containing size of each chromosome, containing 'Chromosome', 'Start' and 'End' columns.
            Fragments on chromosomes not listed here are dropped.
    bigwig_path: str
            Path to folder where the bigwig file will be saved. If not a string, no bigwig is written.
    bed_path: str
            Path to folder where the fragments bed file will be saved. If not a string, no bed file is written.
    sample_id_col: str, optional
            Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'.
    normalize_bigwig: bool, optional
            Whether bigwig files should be CPM normalized. Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig. If False, the 'Score' column is used
            as the bigwig value column.
    split_pattern: str
            Pattern to split cell barcode from sample id. Default: ___ .
    Raises
    ------
    ValueError
            If `fragments_df_dict` is empty.
    """
    # Create logger
    level = logging.INFO
    log_format = "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger("cisTopic")

    if not fragments_df_dict:
        raise ValueError("fragments_df_dict is empty; there are no fragments to export.")

    log.info("Creating pseudobulk for " + str(group))
    # Select, per sample, the fragments whose barcode belongs to `group`
    per_sample_fragments = []
    for sample_id, fragments_df in fragments_df_dict.items():
        sample_data = cell_data[cell_data.loc[:, sample_id_col].isin([sample_id])]
        if "barcode" in sample_data:
            sample_data.index = sample_data["barcode"].tolist()
        else:
            sample_data.index = prepare_tag_cells(
                sample_data.index.tolist(), split_pattern
            )
        # The grouping variable is expected in the first column.
        group_var = sample_data.iloc[:, 0]
        barcodes = group_var[group_var.isin([group])].index.tolist()
        per_sample_fragments.append(
            fragments_df.loc[fragments_df["Name"].isin(barcodes)]
        )

    if len(per_sample_fragments) > 1:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
        # row order and index are the same as with the former append call.
        group_fragments = pd.concat(per_sample_fragments)
    else:
        # Single sample: avoid copying the selection.
        group_fragments = per_sample_fragments[0]

    group_fragments = group_fragments[
        group_fragments["Chromosome"].isin(chromsizes.Chromosome)
    ]

    # Drop the per-sample selections before building the PyRanges
    del per_sample_fragments
    gc.collect()

    group_pr = pr.PyRanges(group_fragments)
    if isinstance(bigwig_path, str):
        bigwig_kwargs = {
            "path": os.path.join(bigwig_path, str(group) + ".bw"),
            "chromosome_sizes": chromsizes,
            "rpm": normalize_bigwig,
        }
        if not remove_duplicates:
            # Use the per-fragment duplicate count as the value column.
            bigwig_kwargs["value_col"] = "Score"
        group_pr.to_bigwig(**bigwig_kwargs)
    if isinstance(bed_path, str):
        bed_path_group = os.path.join(bed_path, str(group) + ".bed.gz")
        group_pr.to_bed(
            path=bed_path_group, keep=False, compression="infer", chain=False
        )

    log.info(str(group) + " done!")


@ray.remote
def export_pseudobulk_ray(
    cell_data: pd.DataFrame,
    group: str,
    fragments_df_dict: Dict[str, pd.DataFrame],
    chromsizes: pr.PyRanges,
    bigwig_path: str,
    bed_path: str,
    sample_id_col: Optional[str] = "sample_id",
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
):
    """
    Ray remote wrapper that forwards all arguments unchanged to `export_pseudobulk_one_sample`.
    Parameters
    ---------
    cell_data: pd.DataFrame
            A cell metadata :class:`pd.Dataframe` containing barcodes, their annotation and their sample of origin.
    group: str
            A character string indicating the group for which pseudobulks will be created.
    fragments_df_dict: dict
            A dictionary containing data frames as values with 'Chromosome', 'Start', 'End', 'Name', and 'Score' as columns; and sample label
            as keys. 'Score' indicates the number of times that a fragments is found assigned to that barcode.
    chromsizes: pr.PyRanges
            A :class:`pr.PyRanges` containing size of each chromosome, containing 'Chromosome', 'Start' and 'End' columns.
    bigwig_path: str
            Path to folder where the bigwig file will be saved.
    bed_path: str
            Path to folder where the fragments bed file will be saved.
    sample_id_col: str, optional
            Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'.
    normalize_bigwig: bool, optional
            Whether bigwig files should be CPM normalized. Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig.
    split_pattern: str
            Pattern to split cell barcode from sample id. Default: ___ .
    """
    export_pseudobulk_one_sample(
        cell_data=cell_data,
        group=group,
        fragments_df_dict=fragments_df_dict,
        chromsizes=chromsizes,
        bigwig_path=bigwig_path,
        bed_path=bed_path,
        sample_id_col=sample_id_col,
        normalize_bigwig=normalize_bigwig,
        remove_duplicates=remove_duplicates,
        split_pattern=split_pattern,
    )

In [14]:
# Export one bed + bigwig pseudobulk per consensus cell type for every sample.
# The returned output paths are collected per sample in these two dicts.
bw_paths_dict = {}
bed_paths_dict = {}

import ray

# export_pseudobulk calls ray.init itself, so no instance may be running yet.
if ray.is_initialized():
    print("Shutting down Ray")
    ray.shutdown()

for sample, cto_path in cto_consensus_path_dict.items():
    supersample = ".".join(sample.split(".")[:2])

    bed_path = os.path.join(
        "final_consensus_peaks", f"{sample}__SCREEN_pseudobulk_bed_files"
    )
    bw_path = os.path.join(
        "final_consensus_peaks", f"{sample}__SCREEN_pseudobulk_bw_files"
    )
    # NOTE: export_pseudobulk creates bed_path before writing any file, so an
    # interrupted run leaves the folder behind and is skipped here on re-run;
    # delete the folder manually to redo the export.
    if os.path.exists(bed_path):
        print(f"{bed_path} exists, skipping...")
        continue

    # Close the pickle as soon as it is loaded rather than keeping the handle
    # open for the whole export.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(cto_path, "rb") as f:
        cto = pickle.load(f)

    bw_paths, bed_paths = export_pseudobulk(
        input_data=cto,
        variable="consensus_cell_type",
        sample_id_col="sample_id",
        chromsizes=chromsizes,
        bed_path=bed_path,
        bigwig_path=bw_path,
        path_to_fragments=fragments_path_dict[supersample],
        n_cpu=16,
        normalize_bigwig=True,
        remove_duplicates=True,
    )
    bw_paths_dict[sample] = bw_paths
    bed_paths_dict[sample] = bed_paths

    if ray.is_initialized():
        print("Shutting down Ray")
        ray.shutdown()

2022-12-27 13:40:44,166 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/OHS_s3atac_2.FIXEDCELLS.fragments.tsv.gz
2022-12-27 13:41:34,437 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/SAN_10xmultiome_2.FIXEDCELLS.fragments.tsv.gz
2022-12-27 13:43:19,830 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/OHS_s3atac_1.FIXEDCELLS.fragments.tsv.gz
2022-12-27 13:44:25,126 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/TXG_10xv11_1.FIXEDCELLS.fragments.tsv.gz
2022-12-27 13:47:27,091 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/CNA_10xv11_5.FIXEDCELLS.fragments.tsv.gz
2022-12-27 13:47:45,873 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/STA_10xv11_1.FIXEDCELLS.fragments.tsv.gz
2022-12-27 13:48:07,018 cisTopic     INFO     Reading fragments from ../1_data_repo

2022-12-27 14:31:01,562	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=2532517)[0m 2022-12-27 14:31:31,365 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=2532521)[0m 2022-12-27 14:31:50,638 cisTopic     INFO     Creating pseudobulk for CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=2532523)[0m 2022-12-27 14:32:09,041 cisTopic     INFO     Creating pseudobulk for CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=2532514)[0m 2022-12-27 14:32:27,747 cisTopic     INFO     Creating pseudobulk for CD4_Tcell




[2m[36m(export_pseudobulk_ray pid=2532509)[0m 2022-12-27 14:32:49,101 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell




[2m[36m(export_pseudobulk_ray pid=2532515)[0m 2022-12-27 14:33:06,593 cisTopic     INFO     Creating pseudobulk for Dendriticcell




[2m[36m(export_pseudobulk_ray pid=2532513)[0m 2022-12-27 14:33:25,064 cisTopic     INFO     Creating pseudobulk for Naturalkillercell




[2m[36m(export_pseudobulk_ray pid=2532515)[0m 2022-12-27 14:35:17,921 cisTopic     INFO     Dendriticcell done!
[2m[36m(export_pseudobulk_ray pid=2532523)[0m 2022-12-27 14:36:10,760 cisTopic     INFO     CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=2532513)[0m 2022-12-27 14:39:20,890 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=2532509)[0m 2022-12-27 14:45:36,100 cisTopic     INFO     CytotoxicTcell done!
[2m[36m(export_pseudobulk_ray pid=2532517)[0m 2022-12-27 14:50:14,895 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=2532521)[0m 2022-12-27 14:53:59,706 cisTopic     INFO     CD14_monocyte done!


# Write per-tech pseudobulks

In [7]:
# Display the sample -> consensus cisTopic object path mapping used by the export loops.
cto_consensus_path_dict

{'master_sub_1.FIXEDCELLS': 'cistopic_objects/master_sub_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_20topics.dimreduc.consensus.pkl'}

In [None]:
# Export one bed + bigwig pseudobulk per technology ("tech" column) for every sample.
# The returned output paths are collected per sample in these two dicts.
bw_paths_dict = {}
bed_paths_dict = {}

import ray

# export_pseudobulk calls ray.init itself, so no instance may be running yet.
if ray.is_initialized():
    print("Shutting down Ray")
    ray.shutdown()

for sample, cto_path in cto_consensus_path_dict.items():
    supersample = ".".join(sample.split(".")[:2])

    bed_path = os.path.join(
        "per_tech_bigwigs", f"{sample}__CONSENSUS_pseudobulk_bed_files"
    )
    bw_path = os.path.join(
        "per_tech_bigwigs", f"{sample}__CONSENSUS_pseudobulk_bw_files"
    )
    # NOTE: export_pseudobulk creates bed_path before writing any file, so an
    # interrupted run leaves the folder behind and is skipped here on re-run;
    # delete the folder manually to redo the export.
    if os.path.exists(bed_path):
        print(f"{bed_path} exists, skipping...")
        continue

    # Close the pickle as soon as it is loaded rather than keeping the handle
    # open for the whole export.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(cto_path, "rb") as f:
        cto = pickle.load(f)

    bw_paths, bed_paths = export_pseudobulk(
        input_data=cto,
        variable="tech",
        sample_id_col="sample_id",
        chromsizes=chromsizes,
        bed_path=bed_path,
        bigwig_path=bw_path,
        path_to_fragments=fragments_path_dict[supersample],
        n_cpu=16,
        normalize_bigwig=True,
        remove_duplicates=True,
    )
    bw_paths_dict[sample] = bw_paths
    bed_paths_dict[sample] = bed_paths

    if ray.is_initialized():
        print("Shutting down Ray")
        ray.shutdown()

2023-01-01 20:12:10,692 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/OHS_s3atac_2.FIXEDCELLS.fragments.tsv.gz
2023-01-01 20:13:01,237 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/SAN_10xmultiome_2.FIXEDCELLS.fragments.tsv.gz
2023-01-01 20:14:45,969 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/OHS_s3atac_1.FIXEDCELLS.fragments.tsv.gz
2023-01-01 20:15:53,996 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/TXG_10xv11_1.FIXEDCELLS.fragments.tsv.gz
2023-01-01 20:19:05,060 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/CNA_10xv11_5.FIXEDCELLS.fragments.tsv.gz
2023-01-01 20:19:23,949 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/STA_10xv11_1.FIXEDCELLS.fragments.tsv.gz
2023-01-01 20:19:45,912 cisTopic     INFO     Reading fragments from ../1_data_repo

2023-01-01 21:04:16,261	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=3293102)[0m 2023-01-01 21:04:44,624 cisTopic     INFO     Creating pseudobulk for 10xmultiome




[2m[36m(export_pseudobulk_ray pid=3293099)[0m 2023-01-01 21:05:04,280 cisTopic     INFO     Creating pseudobulk for 10xv1




[2m[36m(export_pseudobulk_ray pid=3293107)[0m 2023-01-01 21:05:21,826 cisTopic     INFO     Creating pseudobulk for 10xv11




[2m[36m(export_pseudobulk_ray pid=3293109)[0m 2023-01-01 21:05:42,767 cisTopic     INFO     Creating pseudobulk for 10xv2




[2m[36m(export_pseudobulk_ray pid=3293108)[0m 2023-01-01 21:05:59,621 cisTopic     INFO     Creating pseudobulk for ddseq




[2m[36m(export_pseudobulk_ray pid=3293110)[0m 2023-01-01 21:06:17,613 cisTopic     INFO     Creating pseudobulk for hydrop




[2m[36m(export_pseudobulk_ray pid=3293111)[0m 2023-01-01 21:06:36,898 cisTopic     INFO     Creating pseudobulk for mtscatac




[2m[36m(export_pseudobulk_ray pid=3293097)[0m 2023-01-01 21:06:54,230 cisTopic     INFO     Creating pseudobulk for s3atac




[2m[36m(export_pseudobulk_ray pid=3293110)[0m 2023-01-01 21:11:12,218 cisTopic     INFO     hydrop done!
[2m[36m(export_pseudobulk_ray pid=3293102)[0m 2023-01-01 21:17:30,949 cisTopic     INFO     10xmultiome done!


# Write per-tech, per-cell-type pseudobulks

In [7]:
# Export one bed + bigwig pseudobulk per (technology, consensus cell type) pair for every sample.
# The returned output paths are collected per sample in these two dicts.
bw_paths_dict = {}
bed_paths_dict = {}

import ray

# export_pseudobulk calls ray.init itself, so no instance may be running yet.
if ray.is_initialized():
    print("Shutting down Ray")
    ray.shutdown()

for sample, cto_path in cto_consensus_path_dict.items():
    supersample = ".".join(sample.split(".")[:2])

    bed_path = os.path.join(
        "per_tech_cell_type_bigwigs", f"{sample}__CONSENSUS_pseudobulk_bed_files"
    )
    bw_path = os.path.join(
        "per_tech_cell_type_bigwigs", f"{sample}__CONSENSUS_pseudobulk_bw_files"
    )
    # NOTE: export_pseudobulk creates bed_path before writing any file, so an
    # interrupted run leaves the folder behind and is skipped here on re-run;
    # delete the folder manually to redo the export.
    if os.path.exists(bed_path):
        print(f"{bed_path} exists, skipping...")
        continue

    # Close the pickle as soon as it is loaded rather than keeping the handle
    # open for the whole export.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(cto_path, "rb") as f:
        cto = pickle.load(f)

    # Combined grouping label "<tech>__<cell_type>". export_pseudobulk later
    # collapses every run of non-alphanumeric characters to a single "_".
    cto.cell_data["tech_consensus_cell_type"] = (
        cto.cell_data["tech"]
        + "__"
        + [x.replace(" ", "_") for x in cto.cell_data["consensus_cell_type"]]
    )

    bw_paths, bed_paths = export_pseudobulk(
        input_data=cto,
        variable="tech_consensus_cell_type",
        sample_id_col="sample_id",
        chromsizes=chromsizes,
        bed_path=bed_path,
        bigwig_path=bw_path,
        path_to_fragments=fragments_path_dict[supersample],
        n_cpu=8,
        normalize_bigwig=True,
        remove_duplicates=True,
    )
    bw_paths_dict[sample] = bw_paths
    bed_paths_dict[sample] = bed_paths

    if ray.is_initialized():
        print("Shutting down Ray")
        ray.shutdown()

2023-01-02 13:31:52,660 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/OHS_s3atac_2.FIXEDCELLS.fragments.tsv.gz
2023-01-02 13:32:40,237 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/SAN_10xmultiome_2.FIXEDCELLS.fragments.tsv.gz
2023-01-02 13:34:20,990 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/OHS_s3atac_1.FIXEDCELLS.fragments.tsv.gz
2023-01-02 13:35:26,649 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/TXG_10xv11_1.FIXEDCELLS.fragments.tsv.gz
2023-01-02 13:38:32,516 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/CNA_10xv11_5.FIXEDCELLS.fragments.tsv.gz
2023-01-02 13:38:51,712 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/STA_10xv11_1.FIXEDCELLS.fragments.tsv.gz
2023-01-02 13:39:13,318 cisTopic     INFO     Reading fragments from ../1_data_repo

2023-01-02 14:22:00,038	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=3531939)[0m 2023-01-02 14:22:29,295 cisTopic     INFO     Creating pseudobulk for 10xmultiome_B_cell




[2m[36m(export_pseudobulk_ray pid=3531931)[0m 2023-01-02 14:22:47,766 cisTopic     INFO     Creating pseudobulk for 10xmultiome_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=3531934)[0m 2023-01-02 14:23:07,435 cisTopic     INFO     Creating pseudobulk for 10xmultiome_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:23:32,781 cisTopic     INFO     Creating pseudobulk for 10xmultiome_CD4_T_cell
[2m[36m(export_pseudobulk_ray pid=3531939)[0m 2023-01-02 14:23:45,466 cisTopic     INFO     10xmultiome_B_cell done!




[2m[36m(export_pseudobulk_ray pid=3531939)[0m 2023-01-02 14:23:52,135 cisTopic     INFO     Creating pseudobulk for 10xmultiome_Cytotoxic_T_cell
[2m[36m(export_pseudobulk_ray pid=3531934)[0m 2023-01-02 14:23:57,902 cisTopic     INFO     10xmultiome_CD16_monocyte done!




[2m[36m(export_pseudobulk_ray pid=3531934)[0m 2023-01-02 14:24:11,597 cisTopic     INFO     Creating pseudobulk for 10xmultiome_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=3531937)[0m 2023-01-02 14:24:32,640 cisTopic     INFO     Creating pseudobulk for 10xmultiome_Natural_killer_cell
[2m[36m(export_pseudobulk_ray pid=3531934)[0m 2023-01-02 14:24:42,135 cisTopic     INFO     10xmultiome_Dendritic_cell done!




[2m[36m(export_pseudobulk_ray pid=3531934)[0m 2023-01-02 14:24:51,838 cisTopic     INFO     Creating pseudobulk for 10xv11_B_cell




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 14:25:12,698 cisTopic     INFO     Creating pseudobulk for 10xv11_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=3531936)[0m 2023-01-02 14:25:31,444 cisTopic     INFO     Creating pseudobulk for 10xv11_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:25:51,013 cisTopic     INFO     Creating pseudobulk for 10xv11_CD4_T_cell
[2m[36m(export_pseudobulk_ray pid=3531939)[0m 2023-01-02 14:25:52,984 cisTopic     INFO     10xmultiome_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=3531937)[0m 2023-01-02 14:25:56,979 cisTopic     INFO     10xmultiome_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=3531931)[0m 2023-01-02 14:26:00,739 cisTopic     INFO     10xmultiome_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=3531934)[0m 2023-01-02 14:26:04,468 cisTopic     INFO     10xv11_B_cell done!




[2m[36m(export_pseudobulk_ray pid=3531931)[0m 2023-01-02 14:26:08,567 cisTopic     INFO     Creating pseudobulk for 10xv11_Cytotoxic_T_cell
[2m[36m(export_pseudobulk_ray pid=3531936)[0m 2023-01-02 14:26:20,099 cisTopic     INFO     10xv11_CD16_monocyte done!




[2m[36m(export_pseudobulk_ray pid=3531936)[0m 2023-01-02 14:26:29,002 cisTopic     INFO     Creating pseudobulk for 10xv11_Dendritic_cell


[2m[36m(raylet)[0m Spilled 12189 MiB, 2 objects, write throughput 848 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.


[2m[36m(export_pseudobulk_ray pid=3531934)[0m 2023-01-02 14:26:47,947 cisTopic     INFO     Creating pseudobulk for 10xv11_Natural_killer_cell
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:26:55,346 cisTopic     INFO     10xmultiome_CD4_T_cell done!


[2m[36m(raylet)[0m Spilled 24379 MiB, 4 objects, write throughput 856 MiB/s.


[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:27:06,077 cisTopic     INFO     Creating pseudobulk for 10xv1_B_cell
[2m[36m(export_pseudobulk_ray pid=3531936)[0m 2023-01-02 14:27:07,180 cisTopic     INFO     10xv11_Dendritic_cell done!


[2m[36m(raylet)[0m Spilled 36569 MiB, 6 objects, write throughput 864 MiB/s.


[2m[36m(export_pseudobulk_ray pid=3531931)[0m 2023-01-02 14:28:02,902 cisTopic     INFO     10xv11_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=3531934)[0m 2023-01-02 14:28:18,575 cisTopic     INFO     10xv11_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:28:33,565 cisTopic     INFO     10xv1_B_cell done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:28:43,920 cisTopic     INFO     Creating pseudobulk for 10xv1_CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 14:28:51,144 cisTopic     INFO     10xv11_CD14_monocyte done!


[2m[36m(raylet)[0m Spilled 48759 MiB, 8 objects, write throughput 824 MiB/s.


[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:30:23,899 cisTopic     INFO     10xv11_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:30:36,267 cisTopic     INFO     Creating pseudobulk for 10xv1_CD16_monocyte


[2m[36m(raylet)[0m Spilled 60949 MiB, 10 objects, write throughput 797 MiB/s.


[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:31:28,011 cisTopic     INFO     10xv1_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:32:18,152 cisTopic     INFO     Creating pseudobulk for 10xv1_CD4_T_cell


[2m[36m(raylet)[0m Spilled 73139 MiB, 12 objects, write throughput 786 MiB/s.


[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:32:47,838 cisTopic     INFO     10xv1_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:35:42,214 cisTopic     INFO     Creating pseudobulk for 10xv1_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:36:29,108 cisTopic     INFO     10xv1_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:37:16,073 cisTopic     INFO     10xv1_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:37:23,187 cisTopic     INFO     Creating pseudobulk for 10xv1_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:37:56,863 cisTopic     INFO     10xv1_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:39:00,480 cisTopic     INFO     Creating pseudobulk for 10xv1_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:40:23,116 cisTopic     INFO     10xv1_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:40:37,700 cisTopic     INFO     Creating pseudobulk for 10xv2_B_cell




[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:42:13,409 cisTopic     INFO     Creating pseudobulk for 10xv2_CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:42:17,123 cisTopic     INFO     10xv2_B_cell done!


[2m[36m(raylet)[0m Spilled 134089 MiB, 22 objects, write throughput 759 MiB/s.


[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:43:49,631 cisTopic     INFO     Creating pseudobulk for 10xv2_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:44:54,351 cisTopic     INFO     10xv2_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:45:35,264 cisTopic     INFO     Creating pseudobulk for 10xv2_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 14:47:14,424 cisTopic     INFO     Creating pseudobulk for 10xv2_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:47:33,455 cisTopic     INFO     10xv2_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:48:49,331 cisTopic     INFO     Creating pseudobulk for 10xv2_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:49:25,185 cisTopic     INFO     10xv2_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 14:49:49,590 cisTopic     INFO     10xv2_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 14:50:31,125 cisTopic     INFO     Creating pseudobulk for 10xv2_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:52:27,637 cisTopic     INFO     Creating pseudobulk for ddseq_B_cell
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 14:52:32,535 cisTopic     INFO     10xv2_Natural_killer_cell done!




[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:53:39,744 cisTopic     INFO     10xv2_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:53:44,444 cisTopic     INFO     ddseq_B_cell done!
[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:54:06,966 cisTopic     INFO     Creating pseudobulk for ddseq_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:55:54,272 cisTopic     INFO     Creating pseudobulk for ddseq_CD16_monocyte
[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:56:04,899 cisTopic     INFO     ddseq_CD14_monocyte done!




[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:56:36,601 cisTopic     INFO     ddseq_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 14:57:37,268 cisTopic     INFO     Creating pseudobulk for ddseq_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 14:59:18,467 cisTopic     INFO     Creating pseudobulk for ddseq_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:00:58,362 cisTopic     INFO     Creating pseudobulk for ddseq_Dendritic_cell
[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 15:00:58,477 cisTopic     INFO     ddseq_Cytotoxic_T_cell done!


[2m[36m(raylet)[0m Spilled 268178 MiB, 44 objects, write throughput 743 MiB/s.


[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:01:26,348 cisTopic     INFO     ddseq_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:02:58,158 cisTopic     INFO     Creating pseudobulk for ddseq_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:03:17,387 cisTopic     INFO     ddseq_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 15:04:07,073 cisTopic     INFO     ddseq_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 15:04:36,636 cisTopic     INFO     Creating pseudobulk for hydrop_B_cell




[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 15:05:22,274 cisTopic     INFO     hydrop_B_cell done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 15:06:18,541 cisTopic     INFO     Creating pseudobulk for hydrop_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:08:09,267 cisTopic     INFO     Creating pseudobulk for hydrop_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 15:08:20,969 cisTopic     INFO     hydrop_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:08:21,884 cisTopic     INFO     hydrop_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:09:53,329 cisTopic     INFO     Creating pseudobulk for hydrop_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:10:41,797 cisTopic     INFO     hydrop_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:11:36,403 cisTopic     INFO     Creating pseudobulk for hydrop_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:13:08,495 cisTopic     INFO     hydrop_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:13:16,515 cisTopic     INFO     Creating pseudobulk for hydrop_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:13:33,891 cisTopic     INFO     hydrop_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:15:00,117 cisTopic     INFO     Creating pseudobulk for hydrop_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:15:13,752 cisTopic     INFO     hydrop_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:16:42,707 cisTopic     INFO     Creating pseudobulk for mtscatac_B_cell




[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 15:18:30,298 cisTopic     INFO     Creating pseudobulk for mtscatac_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:18:54,100 cisTopic     INFO     mtscatac_B_cell done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:20:16,785 cisTopic     INFO     Creating pseudobulk for mtscatac_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:21:27,724 cisTopic     INFO     mtscatac_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:21:57,681 cisTopic     INFO     Creating pseudobulk for mtscatac_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 15:23:02,008 cisTopic     INFO     mtscatac_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 15:23:37,435 cisTopic     INFO     Creating pseudobulk for mtscatac_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 15:25:17,585 cisTopic     INFO     Creating pseudobulk for mtscatac_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=3531933)[0m 2023-01-02 15:26:01,282 cisTopic     INFO     mtscatac_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:26:34,892 cisTopic     INFO     mtscatac_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 15:26:37,060 cisTopic     INFO     mtscatac_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 15:26:55,355 cisTopic     INFO     Creating pseudobulk for mtscatac_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 15:27:48,581 cisTopic     INFO     mtscatac_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=3531938)[0m 2023-01-02 15:28:32,888 cisTopic     INFO     Creating pseudobulk for s3atac_B_cell




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:30:12,252 cisTopic     INFO     Creating pseudobulk for s3atac_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:30:29,064 cisTopic     INFO     s3atac_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:31:45,782 cisTopic     INFO     Creating pseudobulk for s3atac_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:32:00,292 cisTopic     INFO     s3atac_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:33:34,258 cisTopic     INFO     Creating pseudobulk for s3atac_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:33:50,854 cisTopic     INFO     s3atac_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:35:16,598 cisTopic     INFO     Creating pseudobulk for s3atac_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:36:03,953 cisTopic     INFO     s3atac_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:36:54,321 cisTopic     INFO     Creating pseudobulk for s3atac_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:37:08,677 cisTopic     INFO     s3atac_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:38:37,406 cisTopic     INFO     Creating pseudobulk for s3atac_Natural_killer_cell


[2m[36m(raylet)[0m Spilled 536357 MiB, 88 objects, write throughput 713 MiB/s.


[2m[36m(export_pseudobulk_ray pid=3531935)[0m 2023-01-02 15:38:51,918 cisTopic     INFO     s3atac_Natural_killer_cell done!


In [7]:
# Export non-normalised pseudobulk BED and bigWig files for every
# "<tech>__<consensus cell type>" group of each consensus cisTopic object.
import ray

# Per-sample return values of export_pseudobulk, keyed by sample name.
bw_paths_dict = {}
bed_paths_dict = {}

# Make sure no Ray instance from an earlier (possibly interrupted) run is
# still alive before starting a new export.
if ray.is_initialized():
    print("Shutting down Ray")
    ray.shutdown()

for sample, cto_path in cto_consensus_path_dict.items():
    # Key used to look up the fragments file: the first two dot-separated
    # fields of the sample name.
    supersample = ".".join(sample.split(".")[:2])

    bed_path = os.path.join(
        "per_tech_cell_type_bigwigs_nonnorm",
        f"{sample}__CONSENSUS_pseudobulk_bed_files_nonnorm",
    )
    bw_path = os.path.join(
        "per_tech_cell_type_bigwigs_nonnorm",
        f"{sample}__CONSENSUS_pseudobulk_bw_files_nonnorm",
    )

    # An existing bed_path is treated as "already exported" for this sample.
    if os.path.exists(bed_path):
        print(f"{bed_path} exists, skipping...")
        continue

    # NOTE: pickle.load can execute arbitrary code; only load cisTopic objects
    # that were written by this project. The file handle is released right
    # after loading instead of being held open during the long export.
    with open(cto_path, "rb") as f:
        cto = pickle.load(f)

    # Grouping variable: technology and consensus cell type joined by "__",
    # with spaces in the cell type replaced by underscores.
    cto.cell_data["tech_consensus_cell_type"] = (
        cto.cell_data["tech"]
        + "__"
        + [x.replace(" ", "_") for x in cto.cell_data["consensus_cell_type"]]
    )

    try:
        bw_paths, bed_paths = export_pseudobulk(
            input_data=cto,
            variable="tech_consensus_cell_type",
            sample_id_col="sample_id",
            chromsizes=chromsizes,
            bed_path=bed_path,
            bigwig_path=bw_path,
            path_to_fragments=fragments_path_dict[supersample],
            n_cpu=8,
            normalize_bigwig=False,
            remove_duplicates=True,
        )
        # Keep the results; previously they were overwritten on every iteration
        # and the two dictionaries above stayed empty.
        bw_paths_dict[sample] = bw_paths
        bed_paths_dict[sample] = bed_paths
    finally:
        # Always release the Ray instance, including when the export fails.
        if ray.is_initialized():
            print("Shutting down Ray")
            ray.shutdown()

2023-01-02 17:03:44,806 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/OHS_s3atac_2.FIXEDCELLS.fragments.tsv.gz
2023-01-02 17:04:30,698 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/SAN_10xmultiome_2.FIXEDCELLS.fragments.tsv.gz
2023-01-02 17:06:07,932 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/OHS_s3atac_1.FIXEDCELLS.fragments.tsv.gz
2023-01-02 17:07:11,091 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/TXG_10xv11_1.FIXEDCELLS.fragments.tsv.gz
2023-01-02 17:10:13,323 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/CNA_10xv11_5.FIXEDCELLS.fragments.tsv.gz
2023-01-02 17:10:32,018 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/STA_10xv11_1.FIXEDCELLS.fragments.tsv.gz
2023-01-02 17:10:52,957 cisTopic     INFO     Reading fragments from ../1_data_repo

2023-01-02 17:54:28,778	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=5161)[0m 2023-01-02 17:54:59,240 cisTopic     INFO     Creating pseudobulk for 10xmultiome_B_cell




[2m[36m(export_pseudobulk_ray pid=5156)[0m 2023-01-02 17:55:16,229 cisTopic     INFO     Creating pseudobulk for 10xmultiome_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=5155)[0m 2023-01-02 17:55:34,443 cisTopic     INFO     Creating pseudobulk for 10xmultiome_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 17:55:54,386 cisTopic     INFO     Creating pseudobulk for 10xmultiome_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=5158)[0m 2023-01-02 17:56:12,214 cisTopic     INFO     Creating pseudobulk for 10xmultiome_Cytotoxic_T_cell
[2m[36m(export_pseudobulk_ray pid=5161)[0m 2023-01-02 17:56:21,132 cisTopic     INFO     10xmultiome_B_cell done!




[2m[36m(export_pseudobulk_ray pid=5155)[0m 2023-01-02 17:56:27,125 cisTopic     INFO     10xmultiome_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=5155)[0m 2023-01-02 17:56:32,611 cisTopic     INFO     Creating pseudobulk for 10xmultiome_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=5161)[0m 2023-01-02 17:56:51,465 cisTopic     INFO     Creating pseudobulk for 10xmultiome_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=5155)[0m 2023-01-02 17:57:04,277 cisTopic     INFO     10xmultiome_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=5155)[0m 2023-01-02 17:57:11,533 cisTopic     INFO     Creating pseudobulk for 10xv11_B_cell




[2m[36m(export_pseudobulk_ray pid=5160)[0m 2023-01-02 17:57:31,702 cisTopic     INFO     Creating pseudobulk for 10xv11_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=5162)[0m 2023-01-02 17:57:51,195 cisTopic     INFO     Creating pseudobulk for 10xv11_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 17:58:10,053 cisTopic     INFO     Creating pseudobulk for 10xv11_CD4_T_cell
[2m[36m(export_pseudobulk_ray pid=5158)[0m 2023-01-02 17:58:13,327 cisTopic     INFO     10xmultiome_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=5161)[0m 2023-01-02 17:58:14,190 cisTopic     INFO     10xmultiome_Natural_killer_cell done!




[2m[36m(export_pseudobulk_ray pid=5155)[0m 2023-01-02 17:58:27,856 cisTopic     INFO     10xv11_B_cell done!
[2m[36m(export_pseudobulk_ray pid=5161)[0m 2023-01-02 17:58:27,990 cisTopic     INFO     Creating pseudobulk for 10xv11_Cytotoxic_T_cell
[2m[36m(export_pseudobulk_ray pid=5156)[0m 2023-01-02 17:58:31,359 cisTopic     INFO     10xmultiome_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=5162)[0m 2023-01-02 17:58:40,054 cisTopic     INFO     10xv11_CD16_monocyte done!




[2m[36m(export_pseudobulk_ray pid=5162)[0m 2023-01-02 17:58:47,295 cisTopic     INFO     Creating pseudobulk for 10xv11_Dendritic_cell


[2m[36m(raylet)[0m Spilled 12189 MiB, 2 objects, write throughput 850 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.
[2m[36m(raylet)[0m Spilled 12193 MiB, 3 objects, write throughput 849 MiB/s.


[2m[36m(export_pseudobulk_ray pid=5156)[0m 2023-01-02 17:59:06,381 cisTopic     INFO     Creating pseudobulk for 10xv11_Natural_killer_cell


[2m[36m(raylet)[0m Spilled 24379 MiB, 4 objects, write throughput 870 MiB/s.


[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 17:59:16,451 cisTopic     INFO     10xmultiome_CD4_T_cell done!


[2m[36m(raylet)[0m Spilled 24383 MiB, 5 objects, write throughput 870 MiB/s.


[2m[36m(export_pseudobulk_ray pid=5162)[0m 2023-01-02 17:59:22,407 cisTopic     INFO     10xv11_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 17:59:27,314 cisTopic     INFO     Creating pseudobulk for 10xv1_B_cell


[2m[36m(raylet)[0m Spilled 36569 MiB, 6 objects, write throughput 898 MiB/s.


[2m[36m(export_pseudobulk_ray pid=5161)[0m 2023-01-02 18:00:24,469 cisTopic     INFO     10xv11_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=5156)[0m 2023-01-02 18:00:36,646 cisTopic     INFO     10xv11_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:00:58,083 cisTopic     INFO     10xv1_B_cell done!
[2m[36m(export_pseudobulk_ray pid=5160)[0m 2023-01-02 18:01:15,634 cisTopic     INFO     10xv11_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:01:18,531 cisTopic     INFO     Creating pseudobulk for 10xv1_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:02:52,248 cisTopic     INFO     10xv11_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:03:05,667 cisTopic     INFO     Creating pseudobulk for 10xv1_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:04:00,324 cisTopic     INFO     10xv1_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:05:18,369 cisTopic     INFO     Creating pseudobulk for 10xv1_CD4_T_cell
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:05:31,339 cisTopic     INFO     10xv1_CD14_monocyte done!


[2m[36m(raylet)[0m Spilled 73139 MiB, 12 objects, write throughput 585 MiB/s.


[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:07:24,149 cisTopic     INFO     Creating pseudobulk for 10xv1_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=5160)[0m 2023-01-02 18:08:56,876 cisTopic     INFO     Creating pseudobulk for 10xv1_Dendritic_cell
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:08:58,769 cisTopic     INFO     10xv1_Cytotoxic_T_cell done!




[2m[36m(export_pseudobulk_ray pid=5160)[0m 2023-01-02 18:09:30,709 cisTopic     INFO     10xv1_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:09:34,861 cisTopic     INFO     10xv1_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:10:39,379 cisTopic     INFO     Creating pseudobulk for 10xv1_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:12:01,880 cisTopic     INFO     10xv1_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:12:19,621 cisTopic     INFO     Creating pseudobulk for 10xv2_B_cell




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:14:01,693 cisTopic     INFO     10xv2_B_cell done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:14:25,647 cisTopic     INFO     Creating pseudobulk for 10xv2_CD14_monocyte


[2m[36m(raylet)[0m Spilled 134089 MiB, 22 objects, write throughput 592 MiB/s.


[2m[36m(export_pseudobulk_ray pid=5160)[0m 2023-01-02 18:16:09,782 cisTopic     INFO     Creating pseudobulk for 10xv2_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=5160)[0m 2023-01-02 18:17:13,348 cisTopic     INFO     10xv2_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=5160)[0m 2023-01-02 18:18:12,151 cisTopic     INFO     Creating pseudobulk for 10xv2_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:19:45,567 cisTopic     INFO     10xv2_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:19:45,703 cisTopic     INFO     Creating pseudobulk for 10xv2_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:21:28,650 cisTopic     INFO     Creating pseudobulk for 10xv2_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:22:02,624 cisTopic     INFO     10xv2_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:22:24,219 cisTopic     INFO     10xv2_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:23:15,665 cisTopic     INFO     Creating pseudobulk for 10xv2_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:24:54,336 cisTopic     INFO     Creating pseudobulk for ddseq_B_cell




[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:25:17,852 cisTopic     INFO     10xv2_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:26:10,612 cisTopic     INFO     ddseq_B_cell done!
[2m[36m(export_pseudobulk_ray pid=5160)[0m 2023-01-02 18:26:24,811 cisTopic     INFO     10xv2_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=5160)[0m 2023-01-02 18:26:34,143 cisTopic     INFO     Creating pseudobulk for ddseq_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:28:16,133 cisTopic     INFO     Creating pseudobulk for ddseq_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=5160)[0m 2023-01-02 18:28:35,856 cisTopic     INFO     ddseq_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:28:58,763 cisTopic     INFO     ddseq_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:29:57,602 cisTopic     INFO     Creating pseudobulk for ddseq_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=5160)[0m 2023-01-02 18:31:30,985 cisTopic     INFO     Creating pseudobulk for ddseq_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:33:07,442 cisTopic     INFO     Creating pseudobulk for ddseq_Dendritic_cell
[2m[36m(export_pseudobulk_ray pid=5160)[0m 2023-01-02 18:33:11,117 cisTopic     INFO     ddseq_Cytotoxic_T_cell done!


[2m[36m(raylet)[0m Spilled 268178 MiB, 44 objects, write throughput 643 MiB/s.


[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:33:36,716 cisTopic     INFO     ddseq_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:34:48,878 cisTopic     INFO     Creating pseudobulk for ddseq_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:35:07,958 cisTopic     INFO     ddseq_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:36:27,150 cisTopic     INFO     Creating pseudobulk for hydrop_B_cell
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:36:32,199 cisTopic     INFO     ddseq_CD4_T_cell done!




[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:37:15,553 cisTopic     INFO     hydrop_B_cell done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:38:08,353 cisTopic     INFO     Creating pseudobulk for hydrop_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:39:46,787 cisTopic     INFO     Creating pseudobulk for hydrop_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:40:01,086 cisTopic     INFO     hydrop_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:40:14,912 cisTopic     INFO     hydrop_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:41:25,646 cisTopic     INFO     Creating pseudobulk for hydrop_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:42:14,145 cisTopic     INFO     hydrop_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:43:05,940 cisTopic     INFO     Creating pseudobulk for hydrop_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:44:42,157 cisTopic     INFO     hydrop_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:44:47,663 cisTopic     INFO     Creating pseudobulk for hydrop_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:45:03,865 cisTopic     INFO     hydrop_Dendritic_cell done!




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:49:51,053 cisTopic     INFO     Creating pseudobulk for mtscatac_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:50:22,131 cisTopic     INFO     mtscatac_B_cell done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:51:31,925 cisTopic     INFO     Creating pseudobulk for mtscatac_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:52:42,555 cisTopic     INFO     mtscatac_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:53:12,985 cisTopic     INFO     Creating pseudobulk for mtscatac_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:54:24,997 cisTopic     INFO     mtscatac_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:54:52,384 cisTopic     INFO     Creating pseudobulk for mtscatac_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=5160)[0m 2023-01-02 18:56:31,018 cisTopic     INFO     Creating pseudobulk for mtscatac_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=5160)[0m 2023-01-02 18:57:13,719 cisTopic     INFO     mtscatac_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 18:57:55,670 cisTopic     INFO     mtscatac_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:57:58,188 cisTopic     INFO     mtscatac_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:58:08,015 cisTopic     INFO     Creating pseudobulk for mtscatac_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:59:06,923 cisTopic     INFO     mtscatac_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=5159)[0m 2023-01-02 18:59:43,785 cisTopic     INFO     Creating pseudobulk for s3atac_B_cell




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 19:01:21,569 cisTopic     INFO     Creating pseudobulk for s3atac_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 19:01:37,749 cisTopic     INFO     s3atac_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 19:02:56,635 cisTopic     INFO     Creating pseudobulk for s3atac_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 19:03:10,784 cisTopic     INFO     s3atac_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 19:04:36,985 cisTopic     INFO     Creating pseudobulk for s3atac_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 19:04:56,350 cisTopic     INFO     s3atac_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 19:06:16,592 cisTopic     INFO     Creating pseudobulk for s3atac_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 19:07:02,616 cisTopic     INFO     s3atac_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 19:08:06,367 cisTopic     INFO     Creating pseudobulk for s3atac_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 19:08:19,670 cisTopic     INFO     s3atac_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 19:09:46,052 cisTopic     INFO     Creating pseudobulk for s3atac_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=5157)[0m 2023-01-02 19:09:59,452 cisTopic     INFO     s3atac_Natural_killer_cell done!


[2m[36m(raylet)[0m Spilled 536357 MiB, 88 objects, write throughput 660 MiB/s.


In [9]:
# Display the mapping from sample name to consensus cisTopic object pickle path.
cto_consensus_path_dict

{'master_sub_1.FIXEDCELLS': 'cistopic_objects/master_sub_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_20topics.dimreduc.consensus.pkl'}

In [10]:
# Export non-normalised pseudobulk BED and bigWig files for every
# "<tech>__<harmony consensus cell type>" group of each harmony-annotated
# consensus cisTopic object.
import ray

# Per-sample return values of export_pseudobulk, keyed by sample name.
bw_paths_dict = {}
bed_paths_dict = {}

# Make sure no Ray instance from an earlier (possibly interrupted) run is
# still alive before starting a new export.
if ray.is_initialized():
    print("Shutting down Ray")
    ray.shutdown()

# File-name endings of the plain and the harmony-annotated consensus objects.
consensus_suffix = "consensus.pkl"
harmony_suffix = "consensus_harmony.pkl"

for sample, consensus_cto_path in cto_consensus_path_dict.items():
    # Key used to look up the fragments file: the first two dot-separated
    # fields of the sample name.
    supersample = ".".join(sample.split(".")[:2])

    bed_path = os.path.join(
        "per_tech_harmony_cell_type_bigwigs_nonnorm",
        f"{sample}__CONSENSUS_pseudobulk_bed_files_nonnorm",
    )
    bw_path = os.path.join(
        "per_tech_harmony_cell_type_bigwigs_nonnorm",
        f"{sample}__CONSENSUS_pseudobulk_bw_files_nonnorm",
    )

    # An existing bed_path is treated as "already exported" for this sample.
    if os.path.exists(bed_path):
        print(f"{bed_path} exists, skipping...")
        continue

    # Derive the harmony object path from this sample's consensus object path
    # instead of hard-coding one file for every sample. For the
    # master_sub_1.FIXEDCELLS entry this yields exactly the previously
    # hard-coded path.
    # NOTE(review): assumes every harmony object sits next to its consensus
    # object and differs only in the "_harmony" suffix - confirm for new samples.
    if not consensus_cto_path.endswith(consensus_suffix):
        raise ValueError(
            f"Unexpected consensus object path for {sample}: {consensus_cto_path}"
        )
    cto_path = consensus_cto_path[: -len(consensus_suffix)] + harmony_suffix

    # NOTE: pickle.load can execute arbitrary code; only load cisTopic objects
    # that were written by this project. The file handle is released right
    # after loading instead of being held open during the long export.
    with open(cto_path, "rb") as f:
        cto = pickle.load(f)

    # Grouping variable: technology and harmony consensus cell type joined by
    # "__", with spaces in the cell type replaced by underscores.
    cto.cell_data["tech_harmony_consensus_cell_type"] = (
        cto.cell_data["tech"]
        + "__"
        + [
            x.replace(" ", "_")
            for x in cto.cell_data["harmony_consensus_cell_type"]
        ]
    )

    try:
        bw_paths, bed_paths = export_pseudobulk(
            input_data=cto,
            variable="tech_harmony_consensus_cell_type",
            sample_id_col="sample_id",
            chromsizes=chromsizes,
            bed_path=bed_path,
            bigwig_path=bw_path,
            path_to_fragments=fragments_path_dict[supersample],
            n_cpu=8,
            normalize_bigwig=False,
            remove_duplicates=True,
        )
        # Keep the results; previously they were overwritten on every iteration
        # and the two dictionaries above stayed empty.
        bw_paths_dict[sample] = bw_paths
        bed_paths_dict[sample] = bed_paths
    finally:
        # Always release the Ray instance, including when the export fails.
        if ray.is_initialized():
            print("Shutting down Ray")
            ray.shutdown()

2023-01-02 21:28:02,283 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/OHS_s3atac_2.FIXEDCELLS.fragments.tsv.gz
2023-01-02 21:28:45,744 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/SAN_10xmultiome_2.FIXEDCELLS.fragments.tsv.gz
2023-01-02 21:30:24,319 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/OHS_s3atac_1.FIXEDCELLS.fragments.tsv.gz
2023-01-02 21:31:27,237 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/TXG_10xv11_1.FIXEDCELLS.fragments.tsv.gz
2023-01-02 21:34:31,961 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/CNA_10xv11_5.FIXEDCELLS.fragments.tsv.gz
2023-01-02 21:34:50,484 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/STA_10xv11_1.FIXEDCELLS.fragments.tsv.gz
2023-01-02 21:35:11,690 cisTopic     INFO     Reading fragments from ../1_data_repo

2023-01-02 22:17:55,749	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=94468)[0m 2023-01-02 22:18:26,830 cisTopic     INFO     Creating pseudobulk for 10xmultiome_B_cell




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:18:46,637 cisTopic     INFO     Creating pseudobulk for 10xmultiome_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=94471)[0m 2023-01-02 22:19:08,362 cisTopic     INFO     Creating pseudobulk for 10xmultiome_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=94469)[0m 2023-01-02 22:19:27,692 cisTopic     INFO     Creating pseudobulk for 10xmultiome_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:19:47,891 cisTopic     INFO     Creating pseudobulk for 10xmultiome_Cytotoxic_T_cell
[2m[36m(export_pseudobulk_ray pid=94471)[0m 2023-01-02 22:19:59,538 cisTopic     INFO     10xmultiome_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=94468)[0m 2023-01-02 22:20:00,917 cisTopic     INFO     10xmultiome_B_cell done!




[2m[36m(export_pseudobulk_ray pid=94471)[0m 2023-01-02 22:20:05,548 cisTopic     INFO     Creating pseudobulk for 10xmultiome_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=94468)[0m 2023-01-02 22:20:24,458 cisTopic     INFO     Creating pseudobulk for 10xmultiome_Natural_killer_cell
[2m[36m(export_pseudobulk_ray pid=94471)[0m 2023-01-02 22:20:36,557 cisTopic     INFO     10xmultiome_Dendritic_cell done!




[2m[36m(export_pseudobulk_ray pid=94471)[0m 2023-01-02 22:20:43,258 cisTopic     INFO     Creating pseudobulk for 10xv11_B_cell




[2m[36m(export_pseudobulk_ray pid=94467)[0m 2023-01-02 22:21:05,279 cisTopic     INFO     Creating pseudobulk for 10xv11_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=94470)[0m 2023-01-02 22:21:23,558 cisTopic     INFO     Creating pseudobulk for 10xv11_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=94468)[0m 2023-01-02 22:21:39,673 cisTopic     INFO     10xmultiome_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:21:40,226 cisTopic     INFO     10xmultiome_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:21:41,006 cisTopic     INFO     Creating pseudobulk for 10xv11_CD4_T_cell
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:21:49,610 cisTopic     INFO     10xmultiome_CD14_monocyte done!




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:21:59,152 cisTopic     INFO     Creating pseudobulk for 10xv11_Cytotoxic_T_cell
[2m[36m(export_pseudobulk_ray pid=94471)[0m 2023-01-02 22:22:07,979 cisTopic     INFO     10xv11_B_cell done!




[2m[36m(export_pseudobulk_ray pid=94470)[0m 2023-01-02 22:22:13,447 cisTopic     INFO     10xv11_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=94470)[0m 2023-01-02 22:22:21,091 cisTopic     INFO     Creating pseudobulk for 10xv11_Dendritic_cell


[2m[36m(raylet)[0m Spilled 12189 MiB, 2 objects, write throughput 830 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.
[2m[36m(raylet)[0m Spilled 12193 MiB, 3 objects, write throughput 829 MiB/s.


[2m[36m(export_pseudobulk_ray pid=94471)[0m 2023-01-02 22:22:40,809 cisTopic     INFO     Creating pseudobulk for 10xv11_Natural_killer_cell


[2m[36m(raylet)[0m Spilled 24379 MiB, 4 objects, write throughput 882 MiB/s.
[2m[36m(raylet)[0m Spilled 24383 MiB, 5 objects, write throughput 881 MiB/s.


[2m[36m(export_pseudobulk_ray pid=94470)[0m 2023-01-02 22:22:57,786 cisTopic     INFO     10xv11_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=94469)[0m 2023-01-02 22:22:59,023 cisTopic     INFO     10xmultiome_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:23:00,180 cisTopic     INFO     Creating pseudobulk for 10xv1_B_cell


[2m[36m(raylet)[0m Spilled 36569 MiB, 6 objects, write throughput 880 MiB/s.


[2m[36m(export_pseudobulk_ray pid=94471)[0m 2023-01-02 22:23:57,252 cisTopic     INFO     10xv11_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:24:16,399 cisTopic     INFO     10xv11_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:24:35,426 cisTopic     INFO     Creating pseudobulk for 10xv1_CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=94467)[0m 2023-01-02 22:24:37,904 cisTopic     INFO     10xv11_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:24:42,835 cisTopic     INFO     10xv1_B_cell done!




[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:26:04,569 cisTopic     INFO     10xv11_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:26:19,016 cisTopic     INFO     Creating pseudobulk for 10xv1_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:27:10,123 cisTopic     INFO     10xv1_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:27:57,073 cisTopic     INFO     Creating pseudobulk for 10xv1_CD4_T_cell


[2m[36m(raylet)[0m Spilled 73139 MiB, 12 objects, write throughput 809 MiB/s.


[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:28:38,115 cisTopic     INFO     10xv1_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:29:33,779 cisTopic     INFO     Creating pseudobulk for 10xv1_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:31:09,200 cisTopic     INFO     Creating pseudobulk for 10xv1_Dendritic_cell
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:31:19,544 cisTopic     INFO     10xv1_Cytotoxic_T_cell done!




[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:31:43,603 cisTopic     INFO     10xv1_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:32:00,385 cisTopic     INFO     10xv1_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:32:45,038 cisTopic     INFO     Creating pseudobulk for 10xv1_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:34:04,021 cisTopic     INFO     10xv1_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:34:25,179 cisTopic     INFO     Creating pseudobulk for 10xv2_B_cell




[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:36:05,957 cisTopic     INFO     Creating pseudobulk for 10xv2_CD14_monocyte


[2m[36m(raylet)[0m Spilled 134089 MiB, 22 objects, write throughput 775 MiB/s.


[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:36:24,907 cisTopic     INFO     10xv2_B_cell done!
[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:37:46,099 cisTopic     INFO     Creating pseudobulk for 10xv2_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:38:48,818 cisTopic     INFO     10xv2_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:39:24,728 cisTopic     INFO     Creating pseudobulk for 10xv2_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:41:03,640 cisTopic     INFO     Creating pseudobulk for 10xv2_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:41:25,018 cisTopic     INFO     10xv2_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:42:40,295 cisTopic     INFO     Creating pseudobulk for 10xv2_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:43:17,550 cisTopic     INFO     10xv2_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:44:15,075 cisTopic     INFO     10xv2_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:44:22,115 cisTopic     INFO     Creating pseudobulk for 10xv2_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:46:00,327 cisTopic     INFO     10xv2_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:46:02,006 cisTopic     INFO     Creating pseudobulk for ddseq_B_cell




[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:46:55,220 cisTopic     INFO     10xv2_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:47:31,500 cisTopic     INFO     ddseq_B_cell done!
[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:47:41,535 cisTopic     INFO     Creating pseudobulk for ddseq_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:49:18,232 cisTopic     INFO     Creating pseudobulk for ddseq_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:49:41,941 cisTopic     INFO     ddseq_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:49:52,783 cisTopic     INFO     ddseq_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:50:57,307 cisTopic     INFO     Creating pseudobulk for ddseq_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:52:34,768 cisTopic     INFO     Creating pseudobulk for ddseq_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:54:11,623 cisTopic     INFO     Creating pseudobulk for ddseq_Dendritic_cell
[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 22:54:16,007 cisTopic     INFO     ddseq_Cytotoxic_T_cell done!


[2m[36m(raylet)[0m Spilled 268179 MiB, 44 objects, write throughput 752 MiB/s.


[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:54:40,564 cisTopic     INFO     ddseq_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:55:51,448 cisTopic     INFO     Creating pseudobulk for ddseq_Natural_killer_cell
[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 22:55:54,596 cisTopic     INFO     ddseq_CD4_T_cell done!




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:57:00,798 cisTopic     INFO     ddseq_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:57:32,161 cisTopic     INFO     Creating pseudobulk for hydrop_B_cell




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 22:58:31,932 cisTopic     INFO     hydrop_B_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:00:05,385 cisTopic     INFO     Creating pseudobulk for hydrop_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:01:42,187 cisTopic     INFO     hydrop_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:03:09,196 cisTopic     INFO     Creating pseudobulk for hydrop_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:03:36,127 cisTopic     INFO     hydrop_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:05:15,537 cisTopic     INFO     Creating pseudobulk for hydrop_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:06:30,650 cisTopic     INFO     hydrop_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:06:55,937 cisTopic     INFO     Creating pseudobulk for hydrop_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:07:49,303 cisTopic     INFO     hydrop_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:08:33,437 cisTopic     INFO     Creating pseudobulk for hydrop_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:08:51,027 cisTopic     INFO     hydrop_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:10:09,339 cisTopic     INFO     Creating pseudobulk for hydrop_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:10:35,802 cisTopic     INFO     hydrop_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:11:49,245 cisTopic     INFO     Creating pseudobulk for mtscatac_B_cell




[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 23:13:27,442 cisTopic     INFO     Creating pseudobulk for mtscatac_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:14:14,419 cisTopic     INFO     mtscatac_B_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:15:08,069 cisTopic     INFO     Creating pseudobulk for mtscatac_CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:16:09,720 cisTopic     INFO     mtscatac_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:16:45,688 cisTopic     INFO     Creating pseudobulk for mtscatac_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 23:17:13,441 cisTopic     INFO     mtscatac_CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 23:18:22,504 cisTopic     INFO     Creating pseudobulk for mtscatac_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 23:19:59,425 cisTopic     INFO     Creating pseudobulk for mtscatac_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 23:20:44,300 cisTopic     INFO     mtscatac_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:21:29,849 cisTopic     INFO     mtscatac_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 23:21:32,128 cisTopic     INFO     mtscatac_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:21:37,464 cisTopic     INFO     Creating pseudobulk for mtscatac_Natural_killer_cell




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:23:07,799 cisTopic     INFO     mtscatac_Natural_killer_cell done!
[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:23:20,650 cisTopic     INFO     Creating pseudobulk for s3atac_B_cell




[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 23:24:58,305 cisTopic     INFO     Creating pseudobulk for s3atac_CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 23:26:40,963 cisTopic     INFO     Creating pseudobulk for s3atac_CD16_monocyte
[2m[36m(export_pseudobulk_ray pid=94466)[0m 2023-01-02 23:26:41,199 cisTopic     INFO     s3atac_CD14_monocyte done!




[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 23:27:13,493 cisTopic     INFO     s3atac_CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 23:28:22,313 cisTopic     INFO     Creating pseudobulk for s3atac_CD4_T_cell




[2m[36m(export_pseudobulk_ray pid=94473)[0m 2023-01-02 23:29:22,734 cisTopic     INFO     s3atac_B_cell done!
[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 23:29:53,991 cisTopic     INFO     s3atac_CD4_T_cell done!
[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 23:30:07,365 cisTopic     INFO     Creating pseudobulk for s3atac_Cytotoxic_T_cell




[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 23:31:32,629 cisTopic     INFO     s3atac_Cytotoxic_T_cell done!
[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 23:31:47,088 cisTopic     INFO     Creating pseudobulk for s3atac_Dendritic_cell




[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 23:32:13,459 cisTopic     INFO     s3atac_Dendritic_cell done!
[2m[36m(export_pseudobulk_ray pid=94472)[0m 2023-01-02 23:33:27,161 cisTopic     INFO     Creating pseudobulk for s3atac_Natural_killer_cell


[2m[36m(raylet)[0m Spilled 536358 MiB, 88 objects, write throughput 720 MiB/s.


# consensus

In [23]:
# Map "<file-name prefix before '__'>.<6th-from-last dot-separated token>" to each
# pickled cisTopic object found in cistopic_objects/.
# NOTE(review): the [-6] token lands on a different part of the file name depending
# on how many suffixes the file has - confirm the keys are only used for iteration.
cto_path_dict = {
    fname.partition("__")[0] + "." + fname.split(".")[-6]: pkl_path
    for pkl_path, fname in (
        (p, p.rpartition("/")[2]) for p in sorted(glob.glob("cistopic_objects/*.pkl"))
    )
}
cto_path_dict

{'master_sub_1.FIXEDCELLS.fmx': 'cistopic_objects/master_sub_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_20topics.dimreduc.consensus.pkl',
 'master_sub_1.FIXEDCELLS.scrublet0-4': 'cistopic_objects/master_sub_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_20topics.dimreduc.pkl',
 'master_sub_1.FIXEDCELLS.FIXEDCELLS__cto': 'cistopic_objects/master_sub_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_20topics.pkl',
 'master_sub_1.FIXEDCELLS.master_sub_1': 'cistopic_objects/master_sub_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl'}

In [24]:
# Dump the cell metadata table of every pickled cisTopic object to a CSV that sits
# next to the pickle ("<name>.pkl" -> "<name>.cell_data.csv").
for sample, cto_path in cto_path_dict.items():
    cell_data_csv_path = cto_path.replace(".pkl", ".cell_data.csv")

    # pickle.load can execute arbitrary code - only load trusted files.
    with open(cto_path, "rb") as f:
        cto = pickle.load(f)

    cto.cell_data.to_csv(cell_data_csv_path)

In [32]:
# Map each sample (file-name prefix before "__") to its consensus cell-data CSV.
cell_data_path_dict = {
    csv_path.rpartition("/")[2].partition("__")[0]: csv_path
    for csv_path in sorted(glob.glob("cistopic_objects/*consensus.cell_data.csv"))
}
cell_data_path_dict

{'master_sub_1.FIXEDCELLS': 'cistopic_objects/master_sub_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_20topics.dimreduc.consensus.cell_data.csv'}

In [33]:
# Map each sample (directory-name prefix before "__") to its pseudobulk bigWig directory.
bw_path_dict = dict(
    (bw_dir.rpartition("/")[2].partition("__")[0], bw_dir)
    for bw_dir in sorted(glob.glob("final_consensus_peaks/*_pseudobulk_bw_files"))
)
bw_path_dict

{'master_sub_1.FIXEDCELLS': 'final_consensus_peaks/master_sub_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files'}

In [34]:
# Map each sample (directory-name prefix before "__") to its pseudobulk BED directory.
bed_path_dict = {
    bed_dir.rpartition("/")[2].partition("__")[0]: bed_dir
    for bed_dir in sorted(glob.glob("final_consensus_peaks/*_pseudobulk_bed_files"))
}
bed_path_dict

{'master_sub_1.FIXEDCELLS': 'final_consensus_peaks/master_sub_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files'}

In [35]:
# For each sample, index the files in its pseudobulk BED directory by name
# (file name up to the first "__", with a ".bed.gz" suffix stripped).
# `bed_paths` is rebuilt on every iteration, so only the last sample's mapping remains.
for sample in bed_path_dict.keys():
    bed_paths = dict(
        (bed_file.rpartition("/")[2].partition("__")[0].split(".bed.gz")[0], bed_file)
        for bed_file in glob.glob(bed_path_dict[sample] + "/*")
    )

In [36]:
from pycisTopic.pseudobulk_peak_calling import *

In [37]:
import ray

In [38]:
# Run MACS2 peak calling on the pseudobulk BED files of every sample and pickle the
# resulting narrow-peak dictionary. A sample is skipped when its pickle already
# exists, or when the cell types in the cell-data CSV do not match the BED files.
narrow_peaks_dict = {}
ray.shutdown()
for sample in bed_path_dict.keys():
    bed_dir = bed_path_dict[sample]
    narrow_peaks_dict_path = bed_dir.replace(
        "_pseudobulk_bed_files", "_narrow_peaks_dict.pkl"
    )
    peak_path = os.path.join(
        "final_consensus_peaks", f"{sample}__SCREEN_consensus_peaks"
    )
    if not os.path.exists(peak_path):
        os.mkdir(peak_path)

    if os.path.exists(narrow_peaks_dict_path):
        print(f"{narrow_peaks_dict_path} already exists")
        continue

    # Normalised cell-type labels from the metadata (spaces, "+" and "_" removed).
    cell_data = pd.read_csv(cell_data_path_dict[sample])
    cto_celltypes = {
        label.replace(" ", "").replace("+", "").replace("_", "")
        for label in cell_data["consensus_cell_type"].unique()
    }

    # Normalised labels from the BED file names (text before the first ".").
    bed_celltypes = {
        fname.split(".")[0].replace("+", "").replace("_", "")
        for fname in os.listdir(bed_dir)
    }

    if cto_celltypes != bed_celltypes:
        print(f"{sample} cell types not matching!! Rerun bed file writing.")
        print(f"\t{bed_celltypes}")
        print(f"\t{cto_celltypes}")
        continue

    print(f"Starting {narrow_peaks_dict_path}")
    bed_paths = {
        bed_file.rpartition("/")[2].partition("__")[0].split(".bed.gz")[0]: bed_file
        for bed_file in glob.glob(bed_dir + "/*")
    }
    narrow_peaks_dict = peak_calling(
        macs_path="macs2",
        bed_paths=bed_paths,
        outdir=peak_path,
        genome_size="hs",
        n_cpu=16,
        input_format="BEDPE",
        shift=73,
        ext_size=146,
        keep_dup="all",
        q_value=0.05,
    )
    with open(narrow_peaks_dict_path, "wb") as f:
        pickle.dump(narrow_peaks_dict, f)

Starting final_consensus_peaks/master_sub_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl


2022-12-27 16:04:48,306	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(macs_call_peak_ray pid=2560294)[0m 2022-12-27 16:04:51,253 cisTopic     INFO     Calling peaks for CytotoxicTcell with macs2 callpeak --treatment final_consensus_peaks/master_sub_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/CytotoxicTcell.bed.gz --name CytotoxicTcell  --outdir final_consensus_peaks/master_sub_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=2560300)[0m 2022-12-27 16:04:51,344 cisTopic     INFO     Calling peaks for Naturalkillercell with macs2 callpeak --treatment final_consensus_peaks/master_sub_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/Naturalkillercell.bed.gz --name Naturalkillercell  --outdir final_consensus_peaks/master_sub_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=2560305)[0m 2

# call consensus peaks

In [39]:
from pycisTopic.iterative_peak_calling import get_consensus_peaks

In [40]:
# Map each sample (file-name prefix before "__") to its pickled narrow-peaks dictionary.
narrow_peaks_path_dict = dict(
    (pkl_path.rpartition("/")[2].partition("__")[0], pkl_path)
    for pkl_path in sorted(glob.glob("final_consensus_peaks/*_narrow_peaks_dict.pkl"))
)
narrow_peaks_path_dict

{'master_sub_1.FIXEDCELLS': 'final_consensus_peaks/master_sub_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl'}

In [41]:
# Blacklist BED file (going by its file name, hg38 blacklist v2), given as a path
# relative to the notebook's working directory.
path_to_blacklist = "../0_resources/regions/hg38-blacklist.v2.bed"

In [42]:
# Value forwarded to get_consensus_peaks as its second positional argument.
peak_half_width = 250

# Derive consensus peaks per sample from the pickled narrow peaks and write them as
# "<prefix>_consensus_peaks.bed". Samples whose BED file already exists are skipped,
# as are samples whose narrow-peak cell types do not match the cell-data CSV.
consensus_peaks_dict = {}
for sample in narrow_peaks_path_dict.keys():
    print(sample)
    narrow_peaks_pkl = narrow_peaks_path_dict[sample]
    consensus_out_path = narrow_peaks_pkl.replace(
        "_narrow_peaks_dict.pkl", "_consensus_peaks.bed"
    )
    if os.path.exists(consensus_out_path):
        print(f"{consensus_out_path} already exists, skipping...")
        continue

    # Normalised cell-type labels from the metadata (spaces, "+" and "_" removed).
    cell_data = pd.read_csv(cell_data_path_dict[sample])
    cto_celltypes = {
        label.replace(" ", "").replace("+", "").replace("_", "")
        for label in cell_data["consensus_cell_type"].unique()
    }

    # pickle.load can execute arbitrary code - only load trusted files.
    with open(narrow_peaks_pkl, "rb") as f:
        narrow_peaks_dict = pickle.load(f)
    peaks_celltypes = {
        label.replace(" ", "").replace("+", "").replace("_", "")
        for label in narrow_peaks_dict.keys()
    }

    if cto_celltypes != peaks_celltypes:
        print("CELL TYPE SETS NOT MATCHING! Rerun peak calling.")
        print(peaks_celltypes - cto_celltypes)
        print(cto_celltypes - peaks_celltypes)
        continue

    consensus_peaks = get_consensus_peaks(
        narrow_peaks_dict,
        peak_half_width,
        chromsizes=chromsizes,
        path_to_blacklist=path_to_blacklist,
    )
    consensus_peaks.to_bed(
        path=consensus_out_path, keep=True, compression="infer", chain=False
    )

master_sub_1.FIXEDCELLS
2022-12-27 16:13:29,439 cisTopic     INFO     Extending and merging peaks per class


  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends


2022-12-27 16:15:29,832 cisTopic     INFO     Normalizing peak scores
2022-12-27 16:15:30,288 cisTopic     INFO     Merging peaks
2022-12-27 16:16:58,091 cisTopic     INFO     Done!


  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.co

# Write the top 20% and bottom 20% of peaks (by score)

In [57]:
# Map each sample (file-name prefix before "__") to its "*peaks.bed" file.
peak_path_dict = dict(
    (bed_file.rpartition("/")[2].partition("__")[0], bed_file)
    for bed_file in sorted(glob.glob("final_consensus_peaks/*peaks.bed"))
)

In [59]:
# For every peak BED file, write two subsets next to it: peaks scoring strictly above
# the 80th percentile ("__top20pct.bed") and peaks scoring strictly below the 20th
# percentile ("__bot20pct.bed"). `df_merged` is initialised here but not filled.
df_merged = pd.DataFrame()
for sample, peak_path in peak_path_dict.items():
    print(sample)
    df = pd.read_csv(peak_path, sep="\t", header=None)
    df["sample"] = sample
    df.columns = ["chrom", "start", "end", "cell_type", "score", "strand", "sample"]

    # (percentile cut-off, keep scores above it?, suffix replacing ".bed")
    for pct_cutoff, keep_above, out_suffix in (
        (80, True, "__top20pct.bed"),
        (20, False, "__bot20pct.bed"),
    ):
        percentile_val = np.percentile(df["score"], pct_cutoff)
        if keep_above:
            df_sub = df[df["score"] > percentile_val]
        else:
            df_sub = df[df["score"] < percentile_val]
        peak_path_new = peak_path.replace(".bed", out_suffix)

        # The helper "sample" column is dropped so the output keeps the input's columns.
        df_sub.drop(columns="sample").to_csv(
            peak_path_new, sep="\t", header=False, index=False
        )

master_sub_1.FIXEDCELLS


# Check % chrM in consensus peaks

In [43]:
# Map each sample (file-name prefix before "__") to its consensus peaks BED file.
consensus_peaks_path_dict = {
    bed_file.rpartition("/")[2].partition("__")[0]: bed_file
    for bed_file in sorted(glob.glob("final_consensus_peaks/*consensus_peaks.bed"))
}
consensus_peaks_path_dict

{'master_sub_1.FIXEDCELLS': 'final_consensus_peaks/master_sub_1.FIXEDCELLS__SCREEN_consensus_peaks.bed'}

In [44]:
# Report, per consensus peak file, how many peaks fall on chr1-22/chrX, on chrM,
# on any other sequence, and on chrY.
for sample, path in consensus_peaks_path_dict.items():
    print(sample)
    peaks_df = pd.read_csv(path, sep="\t", header=None)

    # Count peaks per chromosome (first BED column) once and reuse the result.
    chrom_counts = peaks_df[0].value_counts()

    chroms_in_df = list(sorted(peaks_df[0].unique()))
    chroms_standard = ["chr" + str(x + 1) for x in range(22)] + ["chrX"]
    # NOTE(review): chrY is not in `chroms_standard`, so its peaks are also counted
    # under "contigs" and in the non-standard percentage - confirm this is intended.
    chroms_nonstandard = list(set(chroms_in_df) - set(chroms_standard) - set(["chrM"]))

    # reindex(..., fill_value=0) / get(..., 0) count a chromosome without peaks as 0
    # instead of raising KeyError.
    n_standard = chrom_counts.reindex(chroms_standard, fill_value=0).sum()
    n_contigs = chrom_counts.reindex(chroms_nonstandard, fill_value=0).sum()
    n_chrm = chrom_counts.get("chrM", 0)
    pct_nonstandard = (n_contigs + n_chrm) / len(peaks_df) * 100

    print(f"\tpeaks on standard chromosomes: {n_standard}")
    print(f"\tpeaks on contigs: {n_contigs}")
    print(f"\tpeaks on chrM: {n_chrm}")
    print(f"\t% peaks non standard chromosomes: {pct_nonstandard}%")

    n_chrY = chrom_counts.get("chrY", 0)
    print(f"\tpeaks on chrY: {n_chrY}")

master_sub_1.FIXEDCELLS
	peaks on standard chromosomes: 326759
	peaks on contigs: 659
	peaks on chrM: 21
	% peaks non standard chromosomes: 0.20767226872791572%
	peaks on chrY: 220
