In [1]:
from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling
import pyranges as pr
import requests
import os
import pandas as pd
import glob
import pickle

In [2]:
%load_ext lab_black

In [3]:
# Get chromosome sizes (hg38).
# The table is downloaded once from UCSC and cached in chromsizes.txt; later runs
# read the cache. Both branches leave `chromsizes` as a PyRanges so downstream
# cells see the same type whether or not the cache already existed.
if not os.path.exists("chromsizes.txt"):
    target_url = (
        "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes"
    )
    chromsizes = pd.read_csv(target_url, sep="\t", header=None)
    chromsizes.columns = ["Chromosome", "End"]
    # Every chromosome interval starts at 0 and ends at the chromosome length.
    chromsizes["Start"] = 0
    chromsizes = chromsizes.loc[:, ["Chromosome", "Start", "End"]]
    chromsizes = pr.PyRanges(chromsizes)
    chromsizes.to_csv("chromsizes.txt")
else:
    chromsizes = pr.PyRanges(pd.read_csv("chromsizes.txt"))
chromsizes

In [4]:
# Collect the pickled cisTopic objects that contain a topic model, sorted by path.
cto_consensus_paths = glob.glob("cistopic_objects/*topics.pkl")
cto_consensus_paths.sort()

# Key every path by the part of its file name that precedes the first "__".
cto_consensus_path_dict = {
    path.rsplit("/", 1)[-1].partition("__")[0]: path for path in cto_consensus_paths
}
cto_consensus_path_dict

{'10xmultiome_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xmultiome_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_19topics.pkl',
 '10xv11_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xv11_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_26topics.pkl',
 '10xv1_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xv1_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_21topics.pkl',
 '10xv2_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xv2_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_25topics.pkl',
 'ddseq_celltypefair_1.FIXEDCELLS': 'cistopic_objects/ddseq_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_17topics.pkl',
 'hydrop_celltypefair_1.FIXEDCELLS': 'cistopic_objects/hydrop_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_22topics.pkl',
 'mtscatac_celltypefair_1.FIXEDCELLS': 'cistopic_objects/mtscatac_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_25topics.pkl',
 's3atac_celltypefa

In [5]:
# Map every consensus sample key to the same merged fragments file.
# NOTE(review): the next cell in this notebook rebinds `fragments_path_dict` to the
# per-sample fragments files, so this mapping is replaced when cells run in order.
fragments_path_dict = dict.fromkeys(
    cto_consensus_path_dict,
    "/lustre1/project/stg_00090/scatac_benchmark/fixedcells_4_merged/merged_all_1.fragments.ID.sorted.tsv.gz",
)
fragments_path_dict

{'10xmultiome_celltypefair_1.FIXEDCELLS': '/lustre1/project/stg_00090/scatac_benchmark/fixedcells_4_merged/merged_all_1.fragments.ID.sorted.tsv.gz',
 '10xv11_celltypefair_1.FIXEDCELLS': '/lustre1/project/stg_00090/scatac_benchmark/fixedcells_4_merged/merged_all_1.fragments.ID.sorted.tsv.gz',
 '10xv1_celltypefair_1.FIXEDCELLS': '/lustre1/project/stg_00090/scatac_benchmark/fixedcells_4_merged/merged_all_1.fragments.ID.sorted.tsv.gz',
 '10xv2_celltypefair_1.FIXEDCELLS': '/lustre1/project/stg_00090/scatac_benchmark/fixedcells_4_merged/merged_all_1.fragments.ID.sorted.tsv.gz',
 'ddseq_celltypefair_1.FIXEDCELLS': '/lustre1/project/stg_00090/scatac_benchmark/fixedcells_4_merged/merged_all_1.fragments.ID.sorted.tsv.gz',
 'hydrop_celltypefair_1.FIXEDCELLS': '/lustre1/project/stg_00090/scatac_benchmark/fixedcells_4_merged/merged_all_1.fragments.ID.sorted.tsv.gz',
 'mtscatac_celltypefair_1.FIXEDCELLS': '/lustre1/project/stg_00090/scatac_benchmark/fixedcells_4_merged/merged_all_1.fragments.ID.sort

In [6]:
# Index the per-sample fragments files by sample name, i.e. the file name with
# everything from ".fragments.tsv.gz" onwards removed.
fragments_path_dict = {
    path.rsplit("/", 1)[-1].partition(".fragments.tsv.gz")[0]: path
    for path in sorted(
        glob.glob("../1_data_repository/fixedcells_fragments/*.tsv.gz")
    )
}
fragments_path_dict

{'BIO_ddseq_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_1.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_2.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_3.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_3.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_4.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_4.FIXEDCELLS.fragments.tsv.gz',
 'BRO_mtscatac_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BRO_mtscatac_1.FIXEDCELLS.fragments.tsv.gz',
 'BRO_mtscatac_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BRO_mtscatac_2.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xmultiome_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xmultiome_1.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xmultiome_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xmultiome_2.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xv11_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragmen

In [7]:
import gc
import logging
import os
import re
import subprocess
import sys
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd
import pyBigWig
import pyranges as pr
import ray

from pycisTopic.cistopic_class import *
from pycisTopic.utils import *


def export_pseudobulk(
    input_data: Union["CistopicObject", pd.DataFrame, Dict[str, pd.DataFrame]],
    variable: str,
    chromsizes: Union[pd.DataFrame, pr.PyRanges],
    bed_path: str,
    bigwig_path: str,
    path_to_fragments: Optional[Dict[str, str]] = None,
    sample_id_col: Optional[str] = "sample_id",
    n_cpu: Optional[int] = 1,
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
    use_polars: Optional[bool] = True,
    **kwargs
):
    """
    Create pseudobulks as bed and bigwig from single cell fragments file given a barcode annotation.
    Parameters
    ---------
    input_data: CistopicObject or pd.DataFrame
            A :class:`CistopicObject` containing the specified `variable` as a column in :class:`CistopicObject.cell_data` or a cell metadata
            :class:`pd.DataFrame` containing barcode as rows, containing the specified `variable` as a column (additional columns are
            possible) and a `sample_id` column. Index names must contain the BARCODE (e.g. ATGTCGTC-1), additional tags are possible separating with -
            (e.g. ATGCTGTGCG-1-Sample_1). The levels in the sample_id column must agree with the keys in the path_to_fragments dictionary.
            Alternatively, if the cell metadata contains a column named barcode it will be used instead of the index names.
    variable: str
            A character string indicating the column that will be used to create the different group pseudobulk. It must be included in
            the cell metadata provided as input_data.
    chromsizes: pd.DataFrame or pr.PyRanges
            A data frame or :class:`pr.PyRanges` containing size of each chromosome, containing 'Chromosome', 'Start' and 'End' columns.
    bed_path: str
            Path to folder where the fragments bed files per group will be saved. If None, files will not be generated.
    bigwig_path: str
            Path to folder where the bigwig files per group will be saved. If None, files will not be generated.
    path_to_fragments: dict, optional
            A dictionary of character strings, with sample name as names indicating the path to the fragments file/s from which pseudobulk profiles have to
            be created. If a :class:`CistopicObject` is provided as input it will be ignored (the object's own `path_to_fragments` is used), but if a
            cell metadata :class:`pd.DataFrame` is provided it is necessary to provide it. The keys of the dictionary need to match the values in
            the `sample_id_col` column of the cell metadata.
    sample_id_col: str, optional
            Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'.
    n_cpu: int, optional
            Number of cores to use. Values above 1 run one Ray task per group. Default: 1.
    normalize_bigwig: bool, optional
            Whether bigwig files should be CPM normalized. Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig.
    split_pattern: str, optional
            Pattern to split cell barcode from sample id. Default: ___ .
    use_polars: bool, optional
            Whether to use polars to read fragments files. Default: True.
    **kwargs
            Additional parameters for ray.init()
    Return
    ------
    tuple of (dict, dict)
            `(bw_paths, bed_paths)`: a dictionary with the paths to the bigwig files per group, followed by a dictionary with the paths to
            the bed fragments files per group. A dictionary is empty when the corresponding output folder is not a string.
    Raises
    ------
    TypeError
            If `input_data` is neither a :class:`CistopicObject` nor a :class:`pd.DataFrame`.
    ValueError
            If no fragments paths are available, or none of the `path_to_fragments` keys occurs in the `sample_id_col` column.
    KeyError
            If `sample_id_col` is not a column of the cell metadata.
    """
    # Create logger
    level = logging.INFO
    log_format = "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger("cisTopic")

    # Get fragments file paths and cell metadata
    if isinstance(input_data, CistopicObject):
        path_to_fragments = input_data.path_to_fragments
        if path_to_fragments is None:
            log.error("No path_to_fragments in this cisTopic object.")
            raise ValueError("No path_to_fragments in this cisTopic object.")
        cell_data = input_data.cell_data
    elif isinstance(input_data, pd.DataFrame):
        if path_to_fragments is None:
            log.error("Please, provide path_to_fragments.")
            raise ValueError("Please, provide path_to_fragments.")
        cell_data = input_data
    else:
        raise TypeError(
            "input_data must be a CistopicObject or a pd.DataFrame, got "
            + type(input_data).__name__
        )

    # Check for sample_id column. A missing column raises KeyError, so test for it
    # explicitly instead of relying on an exception handler.
    if sample_id_col not in cell_data:
        raise KeyError(
            'Please, include a sample identification column (e.g. "sample_id") in your cell metadata!'
        )
    sample_ids = set(cell_data[sample_id_col])

    # Barcodes to keep are the same for every sample, so compute them once.
    if "barcode" in cell_data:
        selected_barcodes = cell_data["barcode"].tolist()
    else:
        selected_barcodes = prepare_tag_cells(cell_data.index.tolist(), split_pattern)

    # Get fragments
    fragments_df_dict = {}
    for sample_id in path_to_fragments.keys():
        if sample_id not in sample_ids:
            log.info(
                "The following path_to_fragments entry is not found in the cell metadata sample_id_col: %s. It will be ignored.",
                sample_id,
            )
            continue
        log.info("Reading fragments from " + path_to_fragments[sample_id])
        fragments_df = read_fragments_from_file(
            path_to_fragments[sample_id], use_polars=use_polars
        ).df
        # Convert to int32 for memory efficiency
        fragments_df["Start"] = fragments_df["Start"].astype(np.int32)
        fragments_df["End"] = fragments_df["End"].astype(np.int32)
        if "Score" in fragments_df:
            fragments_df["Score"] = fragments_df["Score"].astype(np.int32)
        fragments_df_dict[sample_id] = fragments_df.loc[
            fragments_df["Name"].isin(selected_barcodes)
        ]

    if not fragments_df_dict:
        raise ValueError(
            "None of the path_to_fragments keys is present in the '"
            + str(sample_id_col)
            + "' column of the cell metadata."
        )

    # Set groups. The grouping variable is kept as the first column because
    # export_pseudobulk_one_sample reads it by position. The explicit copy keeps the
    # caller's metadata untouched when the group labels are cleaned below.
    if "barcode" in cell_data:
        cell_data = cell_data.loc[:, [variable, sample_id_col, "barcode"]].copy()
    else:
        cell_data = cell_data.loc[:, [variable, sample_id_col]].copy()
    # Group labels end up in file names: drop spaces, then replace any other run of
    # non-alphanumeric characters with an underscore.
    cell_data[variable] = cell_data[variable].replace(" ", "", regex=True)
    cell_data[variable] = cell_data[variable].replace("[^A-Za-z0-9]+", "_", regex=True)
    groups = sorted(set(cell_data[variable]))
    # Check chromosome sizes
    if isinstance(chromsizes, pd.DataFrame):
        chromsizes = chromsizes.loc[:, ["Chromosome", "Start", "End"]]
        chromsizes = pr.PyRanges(chromsizes)
    # Check that output dir exist and generate output paths
    if isinstance(bed_path, str):
        os.makedirs(bed_path, exist_ok=True)
        bed_paths = {
            group: os.path.join(bed_path, str(group) + ".bed.gz") for group in groups
        }
    else:
        bed_paths = {}
    if isinstance(bigwig_path, str):
        os.makedirs(bigwig_path, exist_ok=True)
        bw_paths = {
            group: os.path.join(bigwig_path, str(group) + ".bw") for group in groups
        }
    else:
        bw_paths = {}
    # Create pseudobulks
    if n_cpu is not None and n_cpu > 1:
        ray.init(num_cpus=n_cpu, **kwargs)
        try:
            # NOTE(review): ray.wait only waits for the tasks to finish; an exception
            # raised inside a task is not re-raised here. Confirm whether failures
            # should be surfaced to the caller.
            ray.wait(
                [
                    export_pseudobulk_ray.remote(
                        cell_data,
                        group,
                        fragments_df_dict,
                        chromsizes,
                        bigwig_path,
                        bed_path,
                        sample_id_col,
                        normalize_bigwig,
                        remove_duplicates,
                        split_pattern,
                    )
                    for group in groups
                ],
                num_returns=len(groups),
            )
        finally:
            # Always release the local Ray instance, even if scheduling/waiting failed.
            ray.shutdown()
    else:
        for group in groups:
            export_pseudobulk_one_sample(
                cell_data,
                group,
                fragments_df_dict,
                chromsizes,
                bigwig_path,
                bed_path,
                sample_id_col,
                normalize_bigwig,
                remove_duplicates,
                split_pattern,
            )

    return bw_paths, bed_paths


def export_pseudobulk_one_sample(
    cell_data: pd.DataFrame,
    group: str,
    fragments_df_dict: Dict[str, pd.DataFrame],
    chromsizes: pr.PyRanges,
    bigwig_path: str,
    bed_path: str,
    sample_id_col: Optional[str] = "sample_id",
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
):
    """
    Create pseudobulk as bed and bigwig from single cell fragments file given a barcode annotation and a group.
    Parameters
    ---------
    cell_data: pd.DataFrame
            A cell metadata :class:`pd.Dataframe` containing barcodes, their annotation and their sample of origin.
            The grouping variable must be the FIRST column, since it is read by position.
    group: str
            A character string indicating the group for which pseudobulks will be created.
    fragments_df_dict: dict
            A dictionary containing data frames as values with 'Chromosome', 'Start', 'End', 'Name', and 'Score' as columns; and sample label
            as keys. 'Score' indicates the number of times that a fragments is found assigned to that barcode.
    chromsizes: pr.PyRanges
            A :class:`pr.PyRanges` containing size of each chromosome, containing 'Chromosome', 'Start' and 'End' columns.
    bigwig_path: str
            Path to folder where the bigwig file will be saved. If it is not a string, no bigwig file is written.
    bed_path: str
            Path to folder where the fragments bed file will be saved. If it is not a string, no bed file is written.
    sample_id_col: str, optional
            Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'.
    normalize_bigwig: bool, optional
            Whether bigwig files should be CPM normalized. Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig. When False, the 'Score' column is used as
            the value column of the bigwig.
    split_pattern: str
            Pattern to split cell barcode from sample id. Default: ___ .
    Raises
    ------
    ValueError
            If `fragments_df_dict` is empty.
    """
    # Create logger
    level = logging.INFO
    log_format = "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger("cisTopic")

    if not fragments_df_dict:
        raise ValueError(
            "fragments_df_dict is empty: no fragments available to create pseudobulk for "
            + str(group)
        )

    log.info("Creating pseudobulk for " + str(group))
    per_sample_fragments = []
    for sample_id, fragments_df in fragments_df_dict.items():
        # Metadata rows that belong to this sample, re-indexed by plain barcode so
        # they can be matched against the 'Name' column of the fragments.
        sample_data = cell_data[cell_data.loc[:, sample_id_col].isin([sample_id])]
        if "barcode" in sample_data:
            sample_data.index = sample_data["barcode"].tolist()
        else:
            sample_data.index = prepare_tag_cells(
                sample_data.index.tolist(), split_pattern
            )
        # The grouping variable is the first column of cell_data.
        group_var = sample_data.iloc[:, 0]
        barcodes = group_var[group_var.isin([group])].index.tolist()
        per_sample_fragments.append(
            fragments_df.loc[fragments_df["Name"].isin(barcodes)]
        )

    # pd.concat replaces DataFrame.append, which is no longer available in pandas >= 2.0.
    if len(per_sample_fragments) > 1:
        group_fragments = pd.concat(per_sample_fragments)
    else:
        group_fragments = per_sample_fragments[0]

    # Keep only fragments on chromosomes present in chromsizes
    group_fragments = group_fragments[
        group_fragments["Chromosome"].isin(chromsizes.Chromosome)
    ]

    del per_sample_fragments
    gc.collect()

    group_pr = pr.PyRanges(group_fragments)
    if isinstance(bigwig_path, str):
        bigwig_path_group = os.path.join(bigwig_path, str(group) + ".bw")
        if remove_duplicates:
            group_pr.to_bigwig(
                path=bigwig_path_group,
                chromosome_sizes=chromsizes,
                rpm=normalize_bigwig,
            )
        else:
            group_pr.to_bigwig(
                path=bigwig_path_group,
                chromosome_sizes=chromsizes,
                rpm=normalize_bigwig,
                value_col="Score",
            )
    if isinstance(bed_path, str):
        bed_path_group = os.path.join(bed_path, str(group) + ".bed.gz")
        group_pr.to_bed(
            path=bed_path_group, keep=False, compression="infer", chain=False
        )

    log.info(str(group) + " done!")


@ray.remote
def export_pseudobulk_ray(
    cell_data: pd.DataFrame,
    group: str,
    fragments_df_dict: Dict[str, pd.DataFrame],
    chromsizes: pr.PyRanges,
    bigwig_path: str,
    bed_path: str,
    sample_id_col: Optional[str] = "sample_id",
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
):
    """
    Ray remote wrapper around :func:`export_pseudobulk_one_sample`.

    All arguments are forwarded unchanged, so one group's pseudobulk can be
    exported as a Ray task.
    Parameters
    ---------
    cell_data: pd.DataFrame
            Cell metadata with barcodes, their annotation and their sample of origin.
    group: str
            Group for which the pseudobulk is created.
    fragments_df_dict: dict
            Fragments data frames ('Chromosome', 'Start', 'End', 'Name' and 'Score' columns) keyed by sample label.
    chromsizes: pr.PyRanges
            Chromosome sizes with 'Chromosome', 'Start' and 'End' columns.
    bigwig_path: str
            Folder where the bigwig file will be saved.
    bed_path: str
            Folder where the fragments bed file will be saved.
    sample_id_col: str, optional
            Column holding the sample name per barcode. Default: 'sample_id'.
    normalize_bigwig: bool, optional
            Whether bigwig files should be CPM normalized. Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig.
    split_pattern: str
            Pattern to split cell barcode from sample id. Default: ___ .
    """
    export_pseudobulk_one_sample(
        cell_data=cell_data,
        group=group,
        fragments_df_dict=fragments_df_dict,
        chromsizes=chromsizes,
        bigwig_path=bigwig_path,
        bed_path=bed_path,
        sample_id_col=sample_id_col,
        normalize_bigwig=normalize_bigwig,
        remove_duplicates=remove_duplicates,
        split_pattern=split_pattern,
    )

In [10]:
# NOTE(review): no earlier visible cell assigns `fragments_path_dict_sub`; it is set
# inside the export loop further down in this notebook. Run in order on a fresh
# kernel this cell would raise NameError, so the output shown here presumably comes
# from a previous execution of that loop.
fragments_path_dict_sub

{'OHS_s3atac_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/OHS_s3atac_1.FIXEDCELLS.fragments.tsv.gz',
 'OHS_s3atac_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/OHS_s3atac_2.FIXEDCELLS.fragments.tsv.gz'}

In [13]:
# Export per-cell-type pseudobulk bed/bigwig files for every consensus cisTopic
# object. A sample is skipped when its bed output folder already exists.
bw_paths_dict = {}
bed_paths_dict = {}

import ray

# Make sure no Ray instance is left over from a previous (interrupted) run.
if ray.is_initialized():
    print("Shutting down Ray")
    ray.shutdown()

for sample in cto_consensus_path_dict.keys():
    # Not used in this cell; kept in case later cells read it.
    supersample = ".".join(sample.split(".")[:2])

    bed_path = os.path.join(
        "final_consensus_peaks", f"{sample}__SCREEN_pseudobulk_bed_files"
    )
    bw_path = os.path.join(
        "final_consensus_peaks", f"{sample}__SCREEN_pseudobulk_bw_files"
    )

    if not os.path.exists(bed_path):
        cto_path = cto_consensus_path_dict[sample]
        # pickle.load can execute arbitrary code: only load objects from trusted paths.
        with open(cto_path, "rb") as f:
            cto = pickle.load(f)

        # Fragments files of the individual samples that make up this object.
        # The inner variable is named `sample_id` so it does not overwrite the outer
        # loop variable `sample`.
        cto_sample_ids = cto.cell_data["sample_id"].unique()
        missing_sample_ids = [
            sample_id
            for sample_id in cto_sample_ids
            if sample_id not in fragments_path_dict
        ]
        if missing_sample_ids:
            raise KeyError(
                f"No fragments file found for sample_id(s) {missing_sample_ids} of {sample}"
            )
        fragments_path_dict_sub = {
            sample_id: fragments_path_dict[sample_id] for sample_id in cto_sample_ids
        }

        print(fragments_path_dict_sub)

        # NOTE(review): the export_pseudobulk defined in this notebook takes the
        # fragments paths from the CistopicObject itself when one is passed as
        # input_data, so `path_to_fragments` below is presumably not what gets
        # read — confirm that cto.path_to_fragments points to the intended files.
        try:
            bw_paths, bed_paths = export_pseudobulk(
                input_data=cto,
                variable="harmony_consensus_cell_type__mega",
                sample_id_col="sample_id",
                chromsizes=chromsizes,
                bed_path=bed_path,
                bigwig_path=bw_path,
                path_to_fragments=fragments_path_dict_sub,
                n_cpu=16,
                normalize_bigwig=True,
                remove_duplicates=True,
            )
        finally:
            # Release Ray even when the export fails, so the next sample can start
            # a fresh instance.
            if ray.is_initialized():
                print("Shutting down Ray")
                ray.shutdown()
    else:
        print(f"{bed_path} exists, skipping...")

final_consensus_peaks/10xmultiome_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files exists, skipping...
{'TXG_10xv11_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/TXG_10xv11_1.FIXEDCELLS.fragments.tsv.gz', 'CNA_10xv11_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xv11_1.FIXEDCELLS.fragments.tsv.gz', 'CNA_10xv11_5.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xv11_5.FIXEDCELLS.fragments.tsv.gz', 'CNA_10xv11_4.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xv11_4.FIXEDCELLS.fragments.tsv.gz', 'CNA_10xv11_3.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xv11_3.FIXEDCELLS.fragments.tsv.gz', 'CNA_10xv11_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xv11_2.FIXEDCELLS.fragments.tsv.gz', 'STA_10xv11_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/STA_10xv11_1.FIXEDCELLS.fragments.tsv.gz', 'STA_10xv11_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/STA_10xv11_2.FIXEDCELLS.fragm

2023-02-20 17:48:55,673	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=964407)[0m 2023-02-20 17:49:00,140 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=964397)[0m 2023-02-20 17:49:01,335 cisTopic     INFO     Creating pseudobulk for CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=964408)[0m 2023-02-20 17:49:02,537 cisTopic     INFO     Creating pseudobulk for CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=964405)[0m 2023-02-20 17:49:03,863 cisTopic     INFO     Creating pseudobulk for CD4_Tcell
[2m[36m(export_pseudobulk_ray pid=964395)[0m 2023-02-20 17:49:05,339 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell




[2m[36m(export_pseudobulk_ray pid=964403)[0m 2023-02-20 17:49:06,800 cisTopic     INFO     Creating pseudobulk for Dendriticcell




[2m[36m(export_pseudobulk_ray pid=964406)[0m 2023-02-20 17:49:08,106 cisTopic     INFO     Creating pseudobulk for Naturalkillercell




[2m[36m(export_pseudobulk_ray pid=964403)[0m 2023-02-20 17:49:24,437 cisTopic     INFO     Dendriticcell done!
[2m[36m(export_pseudobulk_ray pid=964408)[0m 2023-02-20 17:49:25,218 cisTopic     INFO     CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=964406)[0m 2023-02-20 17:49:28,692 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=964407)[0m 2023-02-20 17:50:19,114 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=964397)[0m 2023-02-20 17:50:57,962 cisTopic     INFO     CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=964405)[0m 2023-02-20 17:51:26,220 cisTopic     INFO     CD4_Tcell done!
{'VIB_10xv1_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/VIB_10xv1_1.FIXEDCELLS.fragments.tsv.gz', 'VIB_10xv1_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/VIB_10xv1_2.FIXEDCELLS.fragments.tsv.gz'}


2023-02-20 17:58:22,698	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=968255)[0m 2023-02-20 17:58:26,029 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=968266)[0m 2023-02-20 17:58:26,703 cisTopic     INFO     Creating pseudobulk for CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=968252)[0m 2023-02-20 17:58:27,338 cisTopic     INFO     Creating pseudobulk for CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=968260)[0m 2023-02-20 17:58:27,981 cisTopic     INFO     Creating pseudobulk for CD4_Tcell




[2m[36m(export_pseudobulk_ray pid=968254)[0m 2023-02-20 17:58:28,647 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=968262)[0m 2023-02-20 17:58:29,295 cisTopic     INFO     Creating pseudobulk for Dendriticcell




[2m[36m(export_pseudobulk_ray pid=968256)[0m 2023-02-20 17:58:30,062 cisTopic     INFO     Creating pseudobulk for Naturalkillercell




[2m[36m(export_pseudobulk_ray pid=968262)[0m 2023-02-20 17:58:44,979 cisTopic     INFO     Dendriticcell done!
[2m[36m(export_pseudobulk_ray pid=968256)[0m 2023-02-20 17:58:49,602 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=968252)[0m 2023-02-20 17:58:49,923 cisTopic     INFO     CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=968255)[0m 2023-02-20 17:59:52,929 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=968266)[0m 2023-02-20 18:00:23,818 cisTopic     INFO     CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=968260)[0m 2023-02-20 18:01:04,036 cisTopic     INFO     CD4_Tcell done!
{'TXG_10xv2_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/TXG_10xv2_1.FIXEDCELLS.fragments.tsv.gz', 'CNA_10xv2_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xv2_1.FIXEDCELLS.fragments.tsv.gz', 'TXG_10xv2_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/TXG_10xv2_2.FIXEDCELLS.fragments

2023-02-20 18:12:43,378	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=1002228)[0m 2023-02-20 18:12:47,894 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=1002232)[0m 2023-02-20 18:12:49,328 cisTopic     INFO     Creating pseudobulk for CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=1002229)[0m 2023-02-20 18:12:50,712 cisTopic     INFO     Creating pseudobulk for CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=1002218)[0m 2023-02-20 18:12:52,145 cisTopic     INFO     Creating pseudobulk for CD4_Tcell
[2m[36m(export_pseudobulk_ray pid=1002217)[0m 2023-02-20 18:12:53,590 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=1002226)[0m 2023-02-20 18:12:54,975 cisTopic     INFO     Creating pseudobulk for Dendriticcell




[2m[36m(export_pseudobulk_ray pid=1002227)[0m 2023-02-20 18:12:56,432 cisTopic     INFO     Creating pseudobulk for Naturalkillercell




[2m[36m(export_pseudobulk_ray pid=1002226)[0m 2023-02-20 18:13:17,957 cisTopic     INFO     Dendriticcell done!
[2m[36m(export_pseudobulk_ray pid=1002229)[0m 2023-02-20 18:13:23,462 cisTopic     INFO     CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=1002227)[0m 2023-02-20 18:13:26,634 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=1002228)[0m 2023-02-20 18:14:43,845 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=1002232)[0m 2023-02-20 18:16:09,794 cisTopic     INFO     CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=1002218)[0m 2023-02-20 18:16:35,608 cisTopic     INFO     CD4_Tcell done!
{'HAR_ddseq_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/HAR_ddseq_2.FIXEDCELLS.fragments.tsv.gz', 'BIO_ddseq_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_2.FIXEDCELLS.fragments.tsv.gz', 'BIO_ddseq_3.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_3.FIXEDCELLS.fra

2023-02-20 18:30:31,491	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=1027876)[0m 2023-02-20 18:30:49,016 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=1027879)[0m 2023-02-20 18:31:01,686 cisTopic     INFO     Creating pseudobulk for CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=1027872)[0m 2023-02-20 18:31:13,162 cisTopic     INFO     Creating pseudobulk for CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=1027877)[0m 2023-02-20 18:31:24,557 cisTopic     INFO     Creating pseudobulk for CD4_Tcell




[2m[36m(export_pseudobulk_ray pid=1027873)[0m 2023-02-20 18:31:36,814 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell




[2m[36m(export_pseudobulk_ray pid=1027865)[0m 2023-02-20 18:31:46,566 cisTopic     INFO     Creating pseudobulk for Dendriticcell
[2m[36m(export_pseudobulk_ray pid=1027872)[0m 2023-02-20 18:31:50,226 cisTopic     INFO     CD16_monocyte done!




[2m[36m(export_pseudobulk_ray pid=1027872)[0m 2023-02-20 18:31:56,454 cisTopic     INFO     Creating pseudobulk for Naturalkillercell




[2m[36m(export_pseudobulk_ray pid=1027865)[0m 2023-02-20 18:32:17,139 cisTopic     INFO     Dendriticcell done!
[2m[36m(export_pseudobulk_ray pid=1027876)[0m 2023-02-20 18:32:19,918 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=1027872)[0m 2023-02-20 18:32:28,910 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=1027879)[0m 2023-02-20 18:32:47,999 cisTopic     INFO     CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=1027877)[0m 2023-02-20 18:33:35,649 cisTopic     INFO     CD4_Tcell done!
{'CNA_hydrop_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_hydrop_2.FIXEDCELLS.fragments.tsv.gz', 'EPF_hydrop_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/EPF_hydrop_2.FIXEDCELLS.fragments.tsv.gz', 'CNA_hydrop_3.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_hydrop_3.FIXEDCELLS.fragments.tsv.gz', 'VIB_hydrop_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/VIB_hydrop_1.FIXEDCELLS

2023-02-20 18:39:51,934	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=1030739)[0m 2023-02-20 18:39:55,477 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=1030737)[0m 2023-02-20 18:39:56,217 cisTopic     INFO     Creating pseudobulk for CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=1030742)[0m 2023-02-20 18:39:56,932 cisTopic     INFO     Creating pseudobulk for CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=1030740)[0m 2023-02-20 18:39:57,697 cisTopic     INFO     Creating pseudobulk for CD4_Tcell




[2m[36m(export_pseudobulk_ray pid=1030735)[0m 2023-02-20 18:39:58,625 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=1030729)[0m 2023-02-20 18:39:59,426 cisTopic     INFO     Creating pseudobulk for Dendriticcell




[2m[36m(export_pseudobulk_ray pid=1030738)[0m 2023-02-20 18:40:00,158 cisTopic     INFO     Creating pseudobulk for Naturalkillercell




[2m[36m(export_pseudobulk_ray pid=1030742)[0m 2023-02-20 18:40:04,470 cisTopic     INFO     CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=1030729)[0m 2023-02-20 18:40:05,452 cisTopic     INFO     Dendriticcell done!
[2m[36m(export_pseudobulk_ray pid=1030738)[0m 2023-02-20 18:40:06,255 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=1030739)[0m 2023-02-20 18:40:23,428 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=1030737)[0m 2023-02-20 18:40:46,491 cisTopic     INFO     CD14_monocyte done!
[2m[36m(export_pseudobulk_ray pid=1030740)[0m 2023-02-20 18:40:46,548 cisTopic     INFO     CD4_Tcell done!
{'MDC_mtscatac_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/MDC_mtscatac_2.FIXEDCELLS.fragments.tsv.gz', 'BRO_mtscatac_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BRO_mtscatac_1.FIXEDCELLS.fragments.tsv.gz', 'BRO_mtscatac_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BRO_mtscata

2023-02-20 18:47:52,510	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=1033522)[0m 2023-02-20 18:47:56,838 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=1033516)[0m 2023-02-20 18:47:58,078 cisTopic     INFO     Creating pseudobulk for CD14_monocyte




[2m[36m(export_pseudobulk_ray pid=1033510)[0m 2023-02-20 18:47:59,282 cisTopic     INFO     Creating pseudobulk for CD16_monocyte




[2m[36m(export_pseudobulk_ray pid=1033521)[0m 2023-02-20 18:48:00,652 cisTopic     INFO     Creating pseudobulk for CD4_Tcell




[2m[36m(export_pseudobulk_ray pid=1033511)[0m 2023-02-20 18:48:01,983 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell




[2m[36m(export_pseudobulk_ray pid=1033515)[0m 2023-02-20 18:48:03,324 cisTopic     INFO     Creating pseudobulk for Dendriticcell




[2m[36m(export_pseudobulk_ray pid=1033520)[0m 2023-02-20 18:48:04,630 cisTopic     INFO     Creating pseudobulk for Naturalkillercell




[2m[36m(export_pseudobulk_ray pid=1033520)[0m 2023-02-20 18:48:23,478 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=1033515)[0m 2023-02-20 18:48:26,025 cisTopic     INFO     Dendriticcell done!
[2m[36m(export_pseudobulk_ray pid=1033510)[0m 2023-02-20 18:48:27,266 cisTopic     INFO     CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=1033522)[0m 2023-02-20 18:49:58,391 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=1033521)[0m 2023-02-20 18:50:36,548 cisTopic     INFO     CD4_Tcell done!
[2m[36m(export_pseudobulk_ray pid=1033516)[0m 2023-02-20 18:51:32,404 cisTopic     INFO     CD14_monocyte done!
{'OHS_s3atac_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/OHS_s3atac_1.FIXEDCELLS.fragments.tsv.gz', 'OHS_s3atac_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/OHS_s3atac_2.FIXEDCELLS.fragments.tsv.gz'}


2023-02-20 18:54:56,535	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=1035917)[0m 2023-02-20 18:54:59,368 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=1035925)[0m 2023-02-20 18:54:59,643 cisTopic     INFO     Creating pseudobulk for CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=1035912)[0m 2023-02-20 18:55:00,000 cisTopic     INFO     Creating pseudobulk for CD16_monocyte
[2m[36m(export_pseudobulk_ray pid=1035923)[0m 2023-02-20 18:55:00,275 cisTopic     INFO     Creating pseudobulk for CD4_Tcell




[2m[36m(export_pseudobulk_ray pid=1035918)[0m 2023-02-20 18:55:00,591 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell




[2m[36m(export_pseudobulk_ray pid=1035914)[0m 2023-02-20 18:55:01,021 cisTopic     INFO     Creating pseudobulk for Dendriticcell
[2m[36m(export_pseudobulk_ray pid=1035909)[0m 2023-02-20 18:55:01,200 cisTopic     INFO     Creating pseudobulk for Naturalkillercell




[2m[36m(export_pseudobulk_ray pid=1035914)[0m 2023-02-20 18:55:14,169 cisTopic     INFO     Dendriticcell done!
[2m[36m(export_pseudobulk_ray pid=1035909)[0m 2023-02-20 18:55:18,067 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=1035912)[0m 2023-02-20 18:55:22,400 cisTopic     INFO     CD16_monocyte done!
[2m[36m(export_pseudobulk_ray pid=1035917)[0m 2023-02-20 18:56:08,421 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=1035923)[0m 2023-02-20 18:56:23,962 cisTopic     INFO     CD4_Tcell done!
[2m[36m(export_pseudobulk_ray pid=1035925)[0m 2023-02-20 18:57:52,969 cisTopic     INFO     CD14_monocyte done!


# consensus: per-cell-type peak calling (MACS2)

In [14]:
# Index every topic-model pickle under cistopic_objects/.
# Key = "<file name up to the first '__'>.<6th-from-last dot-separated token>".
# NOTE(review): this key differs from the plain "<prefix before '__'>" key used
# by the other *_path_dict mappings in this notebook -- confirm it is intended.
cto_path_dict = {
    pkl_name.partition("__")[0] + "." + pkl_name.split(".")[-6]: pkl_path
    for pkl_path in sorted(glob.glob("cistopic_objects/*topics.pkl"))
    for pkl_name in (pkl_path.rsplit("/", 1)[-1],)
}
cto_path_dict

{'10xmultiome_celltypefair_1.FIXEDCELLS.FIXEDCELLS__cto': 'cistopic_objects/10xmultiome_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_19topics.pkl',
 '10xv11_celltypefair_1.FIXEDCELLS.FIXEDCELLS__cto': 'cistopic_objects/10xv11_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_26topics.pkl',
 '10xv1_celltypefair_1.FIXEDCELLS.FIXEDCELLS__cto': 'cistopic_objects/10xv1_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_21topics.pkl',
 '10xv2_celltypefair_1.FIXEDCELLS.FIXEDCELLS__cto': 'cistopic_objects/10xv2_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_25topics.pkl',
 'ddseq_celltypefair_1.FIXEDCELLS.FIXEDCELLS__cto': 'cistopic_objects/ddseq_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_17topics.pkl',
 'hydrop_celltypefair_1.FIXEDCELLS.FIXEDCELLS__cto': 'cistopic_objects/hydrop_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_22topics.pkl',
 'mtscatac_celltypefair_1.FIXEDCELLS.FIXEDCELLS__cto': 'cistopic_o

In [15]:
# Write the cell_data table of each pickled object to a CSV stored next to
# the pickle ("<name>.pkl" -> "<name>.cell_data.csv").
# NOTE: pickle.load executes arbitrary code -- only use on trusted files.
for sample, cto_path in cto_path_dict.items():
    cell_data_csv_path = cto_path.replace(".pkl", ".cell_data.csv")

    with open(cto_path, "rb") as f:
        cto = pickle.load(f)

    cto.cell_data.to_csv(cell_data_csv_path)

In [16]:
# Map each file-name prefix (text before the first "__") to its exported
# cell_data CSV, in sorted path order.
cell_data_path_dict = dict(
    (csv_path.rsplit("/", 1)[-1].partition("__")[0], csv_path)
    for csv_path in sorted(glob.glob("cistopic_objects/*topics.cell_data.csv"))
)
cell_data_path_dict

{'10xmultiome_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xmultiome_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_19topics.cell_data.csv',
 '10xv11_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xv11_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_26topics.cell_data.csv',
 '10xv1_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xv1_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_21topics.cell_data.csv',
 '10xv2_celltypefair_1.FIXEDCELLS': 'cistopic_objects/10xv2_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_25topics.cell_data.csv',
 'ddseq_celltypefair_1.FIXEDCELLS': 'cistopic_objects/ddseq_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_17topics.cell_data.csv',
 'hydrop_celltypefair_1.FIXEDCELLS': 'cistopic_objects/hydrop_celltypefair_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_22topics.cell_data.csv',
 'mtscatac_celltypefair_1.FIXEDCELLS': 'cistopic_objects/mtscatac_celltypefair_1.FIXEDCELLS__cto.scrub

In [17]:
# Map each file-name prefix (text before the first "__") to its pseudobulk
# bigwig directory.
bw_path_dict = {
    bw_dir.rsplit("/", 1)[-1].partition("__")[0]: bw_dir
    for bw_dir in sorted(glob.glob("final_consensus_peaks/*_pseudobulk_bw_files"))
}
bw_path_dict

{'10xmultiome_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xmultiome_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 '10xv11_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xv11_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 '10xv1_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xv1_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 '10xv2_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xv2_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'ddseq_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/ddseq_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'hydrop_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/hydrop_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'mtscatac_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/mtscatac_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 's3atac_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/s3atac_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files'}

In [18]:
# Map each file-name prefix (text before the first "__") to its pseudobulk
# BED directory.
bed_path_dict = dict(
    (bed_dir.rsplit("/", 1)[-1].partition("__")[0], bed_dir)
    for bed_dir in sorted(glob.glob("final_consensus_peaks/*_pseudobulk_bed_files"))
)
bed_path_dict

{'10xmultiome_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xmultiome_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 '10xv11_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xv11_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 '10xv1_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xv1_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 '10xv2_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xv2_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'ddseq_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/ddseq_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'hydrop_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/hydrop_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'mtscatac_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/mtscatac_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 's3atac_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/s3atac_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files'}

In [19]:
# Build, per sample, a mapping from the BED file's base name (cut at "__" and
# at ".bed.gz") to its path. The mapping is rebound on every iteration, so
# only the last sample's mapping remains in `bed_paths` after the loop.
for sample in bed_path_dict:
    bed_paths = {
        bed_file.rsplit("/", 1)[-1].partition("__")[0].partition(".bed.gz")[0]: bed_file
        for bed_file in glob.glob(f"{bed_path_dict[sample]}/*")
    }

In [20]:
from pycisTopic.pseudobulk_peak_calling import *

In [21]:
import ray

In [22]:
# Call MACS2 peaks on the per-cell-type pseudobulk BED files of every sample
# and cache the returned dict as "<sample>__SCREEN_narrow_peaks_dict.pkl".
# Samples whose pickle already exists are skipped, so the cell can be re-run.
narrow_peaks_dict = {}
# Presumably clears any Ray instance left from an earlier cell before
# peak_calling runs -- confirm against pycisTopic.
ray.shutdown()
for sample in bed_path_dict.keys():
    narrow_peaks_dict_path = bed_path_dict[sample].replace(
        "_pseudobulk_bed_files", "_narrow_peaks_dict.pkl"
    )
    peak_path = os.path.join(
        "final_consensus_peaks", f"{sample}__SCREEN_consensus_peaks"
    )
    # exist_ok avoids the check-then-create race and also creates a missing
    # parent directory.
    os.makedirs(peak_path, exist_ok=True)

    if os.path.exists(narrow_peaks_dict_path):
        print(f"{narrow_peaks_dict_path} already exists")
        continue

    # Cell types annotated in the cisTopic object, with " ", "+" and "_"
    # stripped so they can be compared with the BED file names.
    cell_data = pd.read_csv(cell_data_path_dict[sample])
    cto_celltypes = {
        x.replace(" ", "").replace("+", "").replace("_", "")
        for x in cell_data["harmony_consensus_cell_type__mega"].unique()
    }

    # Cell types for which a pseudobulk BED file exists, normalised with the
    # same three replacements as above (spaces were previously not stripped
    # on this side).
    bed_celltypes = {
        x.split(".")[0].replace(" ", "").replace("+", "").replace("_", "")
        for x in os.listdir(bed_path_dict[sample])
    }

    if cto_celltypes != bed_celltypes:
        # A cell type is missing (or extra) among the BED files: do not call
        # peaks on an incomplete set.
        print(f"{sample} cell types not matching!! Rerun bed file writing.")
        print(f"\t{bed_celltypes}")
        print(f"\t{cto_celltypes}")
        continue

    print(f"Starting {narrow_peaks_dict_path}")
    bed_paths = {
        x.split("/")[-1].split("__")[0].split(".bed.gz")[0]: x
        for x in glob.glob(bed_path_dict[sample] + "/*")
    }
    narrow_peaks_dict = peak_calling(
        macs_path="macs2",
        bed_paths=bed_paths,
        outdir=peak_path,
        genome_size="hs",
        n_cpu=16,
        input_format="BEDPE",
        shift=73,
        ext_size=146,
        keep_dup="all",
        q_value=0.05,
    )
    with open(narrow_peaks_dict_path, "wb") as f:
        pickle.dump(narrow_peaks_dict, f)

Starting final_consensus_peaks/10xmultiome_celltypefair_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl


2023-02-20 18:58:52,829	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(macs_call_peak_ray pid=1037241)[0m 2023-02-20 18:58:55,755 cisTopic     INFO     Calling peaks for Dendriticcell with macs2 callpeak --treatment final_consensus_peaks/10xmultiome_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/Dendriticcell.bed.gz --name Dendriticcell  --outdir final_consensus_peaks/10xmultiome_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=1037251)[0m 2023-02-20 18:58:55,746 cisTopic     INFO     Calling peaks for Naturalkillercell with macs2 callpeak --treatment final_consensus_peaks/10xmultiome_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/Naturalkillercell.bed.gz --name Naturalkillercell  --outdir final_consensus_peaks/10xmultiome_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nol

2023-02-20 19:01:01,948	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(macs_call_peak_ray pid=1038186)[0m 2023-02-20 19:01:04,902 cisTopic     INFO     Calling peaks for CD4_Tcell with macs2 callpeak --treatment final_consensus_peaks/10xv11_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/CD4_Tcell.bed.gz --name CD4_Tcell  --outdir final_consensus_peaks/10xv11_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=1038189)[0m 2023-02-20 19:01:04,888 cisTopic     INFO     Calling peaks for Naturalkillercell with macs2 callpeak --treatment final_consensus_peaks/10xv11_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/Naturalkillercell.bed.gz --name Naturalkillercell  --outdir final_consensus_peaks/10xv11_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_r

2023-02-20 19:03:04,383	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(macs_call_peak_ray pid=1039147)[0m 2023-02-20 19:03:07,290 cisTopic     INFO     Calling peaks for Bcell with macs2 callpeak --treatment final_consensus_peaks/10xv1_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/Bcell.bed.gz --name Bcell  --outdir final_consensus_peaks/10xv1_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=1039145)[0m 2023-02-20 19:03:07,310 cisTopic     INFO     Calling peaks for Naturalkillercell with macs2 callpeak --treatment final_consensus_peaks/10xv1_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/Naturalkillercell.bed.gz --name Naturalkillercell  --outdir final_consensus_peaks/10xv1_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=1039152)

2023-02-20 19:04:44,071	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(macs_call_peak_ray pid=1040097)[0m 2023-02-20 19:04:47,029 cisTopic     INFO     Calling peaks for Naturalkillercell with macs2 callpeak --treatment final_consensus_peaks/10xv2_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/Naturalkillercell.bed.gz --name Naturalkillercell  --outdir final_consensus_peaks/10xv2_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=1040109)[0m 2023-02-20 19:04:47,023 cisTopic     INFO     Calling peaks for CD16_monocyte with macs2 callpeak --treatment final_consensus_peaks/10xv2_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/CD16_monocyte.bed.gz --name CD16_monocyte  --outdir final_consensus_peaks/10xv2_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_cal

2023-02-20 19:07:14,026	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(macs_call_peak_ray pid=1041127)[0m 2023-02-20 19:07:16,941 cisTopic     INFO     Calling peaks for Naturalkillercell with macs2 callpeak --treatment final_consensus_peaks/ddseq_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/Naturalkillercell.bed.gz --name Naturalkillercell  --outdir final_consensus_peaks/ddseq_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=1041130)[0m 2023-02-20 19:07:16,998 cisTopic     INFO     Calling peaks for CD16_monocyte with macs2 callpeak --treatment final_consensus_peaks/ddseq_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/CD16_monocyte.bed.gz --name CD16_monocyte  --outdir final_consensus_peaks/ddseq_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_cal

2023-02-20 19:08:33,024	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(macs_call_peak_ray pid=1042038)[0m 2023-02-20 19:08:36,053 cisTopic     INFO     Calling peaks for CD16_monocyte with macs2 callpeak --treatment final_consensus_peaks/hydrop_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/CD16_monocyte.bed.gz --name CD16_monocyte  --outdir final_consensus_peaks/hydrop_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=1042044)[0m 2023-02-20 19:08:36,038 cisTopic     INFO     Calling peaks for CytotoxicTcell with macs2 callpeak --treatment final_consensus_peaks/hydrop_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/CytotoxicTcell.bed.gz --name CytotoxicTcell  --outdir final_consensus_peaks/hydrop_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_pea

2023-02-20 19:09:18,439	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(macs_call_peak_ray pid=1042894)[0m 2023-02-20 19:09:21,333 cisTopic     INFO     Calling peaks for CD16_monocyte with macs2 callpeak --treatment final_consensus_peaks/mtscatac_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/CD16_monocyte.bed.gz --name CD16_monocyte  --outdir final_consensus_peaks/mtscatac_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=1042891)[0m 2023-02-20 19:09:21,389 cisTopic     INFO     Calling peaks for CD14_monocyte with macs2 callpeak --treatment final_consensus_peaks/mtscatac_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/CD14_monocyte.bed.gz --name CD14_monocyte  --outdir final_consensus_peaks/mtscatac_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_cal

2023-02-20 19:11:14,971	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(macs_call_peak_ray pid=1043813)[0m 2023-02-20 19:11:17,922 cisTopic     INFO     Calling peaks for Bcell with macs2 callpeak --treatment final_consensus_peaks/s3atac_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/Bcell.bed.gz --name Bcell  --outdir final_consensus_peaks/s3atac_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=1043806)[0m 2023-02-20 19:11:17,924 cisTopic     INFO     Calling peaks for CD14_monocyte with macs2 callpeak --treatment final_consensus_peaks/s3atac_celltypefair_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files/CD14_monocyte.bed.gz --name CD14_monocyte  --outdir final_consensus_peaks/s3atac_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=1043803)[0m 2023

# call consensus peaks

In [23]:
from pycisTopic.iterative_peak_calling import get_consensus_peaks

In [24]:
# Map each file-name prefix (text before the first "__") to its cached
# narrow-peaks pickle.
narrow_peaks_path_dict = {
    pkl_path.rsplit("/", 1)[-1].partition("__")[0]: pkl_path
    for pkl_path in sorted(glob.glob("final_consensus_peaks/*_narrow_peaks_dict.pkl"))
}
narrow_peaks_path_dict

{'10xmultiome_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xmultiome_celltypefair_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 '10xv11_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xv11_celltypefair_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 '10xv1_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xv1_celltypefair_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 '10xv2_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xv2_celltypefair_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 'ddseq_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/ddseq_celltypefair_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 'hydrop_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/hydrop_celltypefair_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 'mtscatac_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/mtscatac_celltypefair_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 's3atac_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/s3atac_celltypefair_1.FIXEDCELLS__SCREEN_narrow_peaks_dic

In [25]:
# Blacklist BED file (hg38, v2 per the file name) passed to
# get_consensus_peaks as `path_to_blacklist`; the path is relative to the
# notebook's working directory.
path_to_blacklist = "../0_resources/regions/hg38-blacklist.v2.bed"

In [26]:
# Half-width handed to get_consensus_peaks for every sample.
peak_half_width = 250

# Get consensus peaks: one "<sample>__SCREEN_consensus_peaks.bed" per sample,
# skipping samples whose BED file is already on disk.
consensus_peaks_dict = {}
for sample in narrow_peaks_path_dict:
    print(sample)
    consensus_out_path = narrow_peaks_path_dict[sample].replace(
        "_narrow_peaks_dict.pkl", "_consensus_peaks.bed"
    )
    if os.path.exists(consensus_out_path):
        print(f"{consensus_out_path} already exists, skipping...")
        continue

    # Cell types annotated in the cisTopic object (" ", "+", "_" stripped).
    cell_data = pd.read_csv(cell_data_path_dict[sample])
    cto_celltypes = {
        x.replace(" ", "").replace("+", "").replace("_", "")
        for x in cell_data["harmony_consensus_cell_type__mega"].unique()
    }

    # Cell types for which narrow peaks were called, normalised the same way.
    with open(narrow_peaks_path_dict[sample], "rb") as f:
        narrow_peaks_dict = pickle.load(f)
    peaks_celltypes = {
        x.replace(" ", "").replace("+", "").replace("_", "")
        for x in narrow_peaks_dict.keys()
    }

    if cto_celltypes != peaks_celltypes:
        # Report the difference in both directions and move on.
        print("CELL TYPE SETS NOT MATCHING! Rerun peak calling.")
        print(peaks_celltypes - cto_celltypes)
        print(cto_celltypes - peaks_celltypes)
        continue

    consensus_peaks = get_consensus_peaks(
        narrow_peaks_dict,
        peak_half_width,
        chromsizes=chromsizes,
        path_to_blacklist=path_to_blacklist,
    )
    consensus_peaks.to_bed(
        path=consensus_out_path, keep=True, compression="infer", chain=False
    )

10xmultiome_celltypefair_1.FIXEDCELLS


  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends




  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.co

10xv11_celltypefair_1.FIXEDCELLS


  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends




  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.co

10xv1_celltypefair_1.FIXEDCELLS


  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends




  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.co

10xv2_celltypefair_1.FIXEDCELLS


  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends




  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.co

ddseq_celltypefair_1.FIXEDCELLS


  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.co

hydrop_celltypefair_1.FIXEDCELLS


  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends




  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.co

mtscatac_celltypefair_1.FIXEDCELLS


  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends




  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.co

s3atac_celltypefair_1.FIXEDCELLS


  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends
  scdf.loc[:, "Start"] = new_starts
  scdf.loc[:, "End"] = new_ends




  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.co

# Check % of consensus peaks on chrM and other non-standard chromosomes

In [27]:
# Map each sample name (the part of the file name before the first "__") to the
# path of its consensus peak file, visiting the files in sorted path order.
consensus_peaks_path_dict = dict(
    (bed_path.rsplit("/", 1)[-1].partition("__")[0], bed_path)
    for bed_path in sorted(glob.glob("final_consensus_peaks/*consensus_peaks.bed"))
)
consensus_peaks_path_dict

{'10xmultiome_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xmultiome_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 '10xv11_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xv11_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 '10xv1_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xv1_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 '10xv2_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/10xv2_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'ddseq_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/ddseq_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'hydrop_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/hydrop_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'mtscatac_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/mtscatac_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 's3atac_celltypefair_1.FIXEDCELLS': 'final_consensus_peaks/s3atac_celltypefair_1.FIXEDCELLS__SCREEN_consensus_peaks.bed'}

In [28]:
# For every sample, report how its consensus peaks are distributed over the
# standard chromosomes, chrM, chrY and all remaining sequence names.

# chr1..chr22 plus chrX; identical for every sample, so built once outside the loop.
chroms_standard = ["chr" + str(x + 1) for x in range(22)] + ["chrX"]

for sample, path in consensus_peaks_path_dict.items():
    print(sample)
    peaks_df = pd.read_csv(path, sep="\t", header=None)

    # Number of peaks per sequence name (column 0), counted once and reused below.
    peaks_per_chrom = peaks_df[0].value_counts()

    chroms_in_df = sorted(peaks_df[0].unique())
    # NOTE: chrY is not part of chroms_standard, so it lands in chroms_nonstandard
    # and is therefore included in n_contigs and pct_nonstandard.
    chroms_nonstandard = list(set(chroms_in_df) - set(chroms_standard) - {"chrM"})

    # reindex(..., fill_value=0) and .get(..., 0) treat a chromosome without any
    # peaks as a zero count instead of raising KeyError.
    n_standard = peaks_per_chrom.reindex(chroms_standard, fill_value=0).sum()
    n_contigs = peaks_per_chrom.reindex(chroms_nonstandard, fill_value=0).sum()
    n_chrm = peaks_per_chrom.get("chrM", 0)
    n_chrY = peaks_per_chrom.get("chrY", 0)
    pct_nonstandard = (n_contigs + n_chrm) / len(peaks_df) * 100

    print(f"\tpeaks on standard chromosomes: {n_standard}")
    print(f"\tpeaks on contigs: {n_contigs}")
    print(f"\tpeaks on chrM: {n_chrm}")
    print(f"\t% peaks non standard chromosomes: {pct_nonstandard}%")
    print(f"\tpeaks on chrY: {n_chrY}")

10xmultiome_celltypefair_1.FIXEDCELLS
	peaks on standard chromosomes: 197322
	peaks on contigs: 334
	peaks on chrM: 23
	% peaks non standard chromosomes: 0.1805958144264186%
	peaks on chrY: 86
10xv11_celltypefair_1.FIXEDCELLS
	peaks on standard chromosomes: 226766
	peaks on contigs: 420
	peaks on chrM: 24
	% peaks non standard chromosomes: 0.19541393424585188%
	peaks on chrY: 189
10xv1_celltypefair_1.FIXEDCELLS
	peaks on standard chromosomes: 179537
	peaks on contigs: 284
	peaks on chrM: 21
	% peaks non standard chromosomes: 0.16959330968294392%
	peaks on chrY: 94
10xv2_celltypefair_1.FIXEDCELLS
	peaks on standard chromosomes: 275687
	peaks on contigs: 381
	peaks on chrM: 23
	% peaks non standard chromosomes: 0.14632856558163795%
	peaks on chrY: 262
ddseq_celltypefair_1.FIXEDCELLS
	peaks on standard chromosomes: 169890
	peaks on contigs: 218
	peaks on chrM: 22
	% peaks non standard chromosomes: 0.14106859460412624%
	peaks on chrY: 91
hydrop_celltypefair_1.FIXEDCELLS
	peaks on standard 