In [1]:
from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling
import pyranges as pr
import requests
import os
import pandas as pd
import glob
import pickle

In [2]:
%load_ext lab_black

In [3]:
# Chromosome sizes (hg38): reuse the cached copy when it exists, otherwise
# download the UCSC table, convert it to Chromosome/Start/End and cache it.
# NOTE: the cached branch yields a pandas DataFrame while the download branch
# yields a PyRanges object.
if os.path.exists("chromsizes.txt"):
    chromsizes = pd.read_csv("chromsizes.txt")
else:
    target_url = (
        "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes"
    )
    chromsizes = pd.read_csv(
        target_url, sep="\t", header=None, names=["Chromosome", "End"]
    )
    # Every chromosome interval starts at 0; place Start between the two columns.
    chromsizes.insert(1, "Start", 0)
    chromsizes = pr.PyRanges(chromsizes)
    chromsizes.to_csv("chromsizes.txt")

In [4]:
# Map each sample name (file-name prefix before "__") to the path of its
# consensus cisTopic object pickle.
cto_consensus_path_dict = dict(
    (pkl_path.split("/")[-1].split("__")[0], pkl_path)
    for pkl_path in sorted(
        glob.glob("cistopic_objects/*singlets.model*consensus.pkl")
    )
)
cto_consensus_path_dict

{'BIO_ddseq_1.FULL': 'cistopic_objects/BIO_ddseq_1.FULL__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.pkl',
 'BIO_ddseq_2.FULL': 'cistopic_objects/BIO_ddseq_2.FULL__cto.scrublet0-4.fmx.singlets.model_9topics.dimreduc.consensus.pkl',
 'BIO_ddseq_3.FULL': 'cistopic_objects/BIO_ddseq_3.FULL__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.pkl',
 'BIO_ddseq_4.FULL': 'cistopic_objects/BIO_ddseq_4.FULL__cto.scrublet0-4.fmx.singlets.model_9topics.dimreduc.consensus.pkl',
 'BRO_mtscatac_1.FULL': 'cistopic_objects/BRO_mtscatac_1.FULL__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.pkl',
 'BRO_mtscatac_2.FULL': 'cistopic_objects/BRO_mtscatac_2.FULL__cto.scrublet0-4.fmx.singlets.model_10topics.dimreduc.consensus.pkl',
 'CNA_10xmultiome_1.FULL': 'cistopic_objects/CNA_10xmultiome_1.FULL__cto.scrublet0-4.fmx.singlets.model_12topics.dimreduc.consensus.pkl',
 'CNA_10xmultiome_2.FULL': 'cistopic_objects/CNA_10xmultiome_2.FULL__cto.scrublet0-4.fmx.singlet

In [5]:
# Map each sample name (file name without the ".fragments.tsv.gz" suffix) to
# the path of its fragments file.
fragments_path_dict = dict(
    (frag_path.split("/")[-1].split(".fragments.tsv.gz")[0], frag_path)
    for frag_path in sorted(
        glob.glob("../1_data_repository/full_fragments/*.tsv.gz")
    )
)
fragments_path_dict

{'BIO_ddseq_1.FULL': '../1_data_repository/full_fragments/BIO_ddseq_1.FULL.fragments.tsv.gz',
 'BIO_ddseq_2.FULL': '../1_data_repository/full_fragments/BIO_ddseq_2.FULL.fragments.tsv.gz',
 'BIO_ddseq_3.FULL': '../1_data_repository/full_fragments/BIO_ddseq_3.FULL.fragments.tsv.gz',
 'BIO_ddseq_4.FULL': '../1_data_repository/full_fragments/BIO_ddseq_4.FULL.fragments.tsv.gz',
 'BRO_mtscatac_1.FULL': '../1_data_repository/full_fragments/BRO_mtscatac_1.FULL.fragments.tsv.gz',
 'BRO_mtscatac_2.FULL': '../1_data_repository/full_fragments/BRO_mtscatac_2.FULL.fragments.tsv.gz',
 'CNA_10xmultiome_1.FULL': '../1_data_repository/full_fragments/CNA_10xmultiome_1.FULL.fragments.tsv.gz',
 'CNA_10xmultiome_2.FULL': '../1_data_repository/full_fragments/CNA_10xmultiome_2.FULL.fragments.tsv.gz',
 'CNA_10xv11_1.FULL': '../1_data_repository/full_fragments/CNA_10xv11_1.FULL.fragments.tsv.gz',
 'CNA_10xv11_2.FULL': '../1_data_repository/full_fragments/CNA_10xv11_2.FULL.fragments.tsv.gz',
 'CNA_10xv11_3.FULL'

In [6]:
import gc
import logging
import os
import re
import subprocess
import sys
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd
import pyBigWig
import pyranges as pr
import ray

from pycisTopic.cistopic_class import *
from pycisTopic.utils import *


def export_pseudobulk(
    input_data: Union["CistopicObject", pd.DataFrame, Dict[str, pd.DataFrame]],
    variable: str,
    chromsizes: Union[pd.DataFrame, pr.PyRanges],
    bed_path: str,
    bigwig_path: str,
    path_to_fragments: Optional[Dict[str, str]] = None,
    sample_id_col: Optional[str] = "sample_id",
    n_cpu: Optional[int] = 1,
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
    use_polars: Optional[bool] = True,
    **kwargs
):
    """
    Create pseudobulks as bed and bigwig from single cell fragments file given a barcode annotation.
    Parameters
    ---------
    input_data: CistopicObject or pd.DataFrame
            A :class:`CistopicObject` containing the specified `variable` as a column in :class:`CistopicObject.cell_data` or a cell metadata
            :class:`pd.DataFrame` containing barcode as rows, containing the specified `variable` as a column (additional columns are
            possible) and a `sample_id` column. Index names must contain the BARCODE (e.g. ATGTCGTC-1), additional tags are possible separating with -
            (e.g. ATGCTGTGCG-1-Sample_1). The levels in the sample_id column must agree with the keys in the path_to_fragments dictionary.
            Alternatively, if the cell metadata contains a column named barcode it will be used instead of the index names.
    variable: str
            A character string indicating the column that will be used to create the different group pseudobulk. It must be included in
            the cell metadata provided as input_data. Spaces are removed from its values and any other run of non-alphanumeric
            characters is replaced by '_' before the values are used as group (and file) names.
    chromsizes: pd.DataFrame or pr.PyRanges
            A data frame or :class:`pr.PyRanges` containing size of each chromosome, containing 'Chromosome', 'Start' and 'End' columns.
    bed_path: str
            Path to folder where the fragments bed files per group will be saved. If None, files will not be generated.
    bigwig_path: str
            Path to folder where the bigwig files per group will be saved. If None, files will not be generated.
    path_to_fragments: dict, optional
            A dictionary of character strings, with sample name as names indicating the path to the fragments file/s from which pseudobulk profiles have to
            be created. If a :class:`CistopicObject` is provided as input it will be ignored (the paths stored in the object are used), but if a
            cell metadata :class:`pd.DataFrame` is provided it is necessary to provide it. The keys of the dictionary need to match the values
            found in `sample_id_col`; other entries are skipped.
    sample_id_col: str, optional
            Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'.
    n_cpu: int, optional
            Number of cores to use. Values above 1 dispatch one ray task per group. Default: 1.
    normalize_bigwig: bool, optional
            Whether bigwig files should be CPM normalized. Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig.
    split_pattern: str, optional
            Pattern to split cell barcode from sample id. Default: ___ .
    use_polars: bool, optional
            Whether to use polars to read fragments files. Default: True.
    **kwargs
            Additional parameters for ray.init()
    Return
    ------
    tuple
            `(bw_paths, bed_paths)`: a dictionary with the path of the bigwig file per group, followed by a dictionary with the path of
            the bed fragments file per group. A dictionary is empty when the corresponding output folder is not a string.
    Raises
    ------
    TypeError
            If `input_data` is neither a :class:`CistopicObject` nor a :class:`pd.DataFrame`.
    ValueError
            If no fragments paths are available, if `sample_id_col` is missing from the cell metadata, or if none of the
            `path_to_fragments` keys is present in `sample_id_col`.
    """
    # Create logger
    level = logging.INFO
    log_format = "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger("cisTopic")

    # Get cell metadata and fragments paths
    if isinstance(input_data, CistopicObject):
        path_to_fragments = input_data.path_to_fragments
        cell_data = input_data.cell_data
        missing_paths_msg = "No path_to_fragments in this cisTopic object."
    elif isinstance(input_data, pd.DataFrame):
        cell_data = input_data
        missing_paths_msg = "Please, provide path_to_fragments."
    else:
        raise TypeError(
            "input_data must be a CistopicObject or a pd.DataFrame, got "
            + type(input_data).__name__
        )
    if path_to_fragments is None:
        # Fail here with a clear message instead of crashing later on `.keys()`.
        log.error(missing_paths_msg)
        raise ValueError(missing_paths_msg)

    # Check for sample_id column (a missing column raises KeyError, not ValueError)
    if sample_id_col not in cell_data:
        raise ValueError(
            'Please, include a sample identification column (e.g. "sample_id") in your cell metadata!'
        )
    sample_ids = set(cell_data[sample_id_col])

    # Barcodes to keep are the same for every sample, so compute them once.
    if "barcode" in cell_data:
        selected_barcodes = cell_data["barcode"].tolist()
    else:
        selected_barcodes = prepare_tag_cells(cell_data.index.tolist(), split_pattern)

    # Get fragments
    fragments_df_dict = {}
    for sample_id, fragments_file in path_to_fragments.items():
        if sample_id not in sample_ids:
            # Lazy %-formatting: extra positional args without placeholders
            # make the logging module fail to render the record.
            log.info(
                "The following path_to_fragments entry is not found in the cell metadata sample_id_col: %s. It will be ignored.",
                sample_id,
            )
            continue
        log.info("Reading fragments from %s", fragments_file)
        fragments_df = read_fragments_from_file(
            fragments_file, use_polars=use_polars
        ).df
        # Convert to int32 for memory efficiency
        fragments_df["Start"] = fragments_df["Start"].astype(np.int32)
        fragments_df["End"] = fragments_df["End"].astype(np.int32)
        if "Score" in fragments_df:
            fragments_df["Score"] = fragments_df["Score"].astype(np.int32)
        fragments_df_dict[sample_id] = fragments_df.loc[
            fragments_df["Name"].isin(selected_barcodes)
        ]
    if not fragments_df_dict:
        raise ValueError(
            "None of the path_to_fragments keys is present in the cell metadata column "
            + repr(sample_id_col)
        )

    # Set groups. `variable` must stay the first column: the per-group export
    # reads the group labels positionally (first column of the metadata).
    kept_columns = [variable, sample_id_col]
    if "barcode" in cell_data:
        kept_columns.append("barcode")
    cell_data = cell_data.loc[:, kept_columns].copy()
    cell_data[variable] = cell_data[variable].replace(" ", "", regex=True)
    cell_data[variable] = cell_data[variable].replace("[^A-Za-z0-9]+", "_", regex=True)
    groups = sorted(set(cell_data[variable]))

    # Check chromosome sizes
    if isinstance(chromsizes, pd.DataFrame):
        chromsizes = chromsizes.loc[:, ["Chromosome", "Start", "End"]]
        chromsizes = pr.PyRanges(chromsizes)

    # Check that output dir exist and generate output paths
    if isinstance(bed_path, str):
        os.makedirs(bed_path, exist_ok=True)
        bed_paths = {
            group: os.path.join(bed_path, str(group) + ".bed.gz") for group in groups
        }
    else:
        bed_paths = {}
    if isinstance(bigwig_path, str):
        os.makedirs(bigwig_path, exist_ok=True)
        bw_paths = {
            group: os.path.join(bigwig_path, str(group) + ".bw") for group in groups
        }
    else:
        bw_paths = {}

    # Create pseudobulks
    if n_cpu > 1:
        ray.init(num_cpus=n_cpu, **kwargs)
        try:
            # NOTE(review): ray.wait blocks until the tasks finish but does not
            # re-raise worker exceptions - confirm whether ray.get is wanted.
            ray.wait(
                [
                    export_pseudobulk_ray.remote(
                        cell_data,
                        group,
                        fragments_df_dict,
                        chromsizes,
                        bigwig_path,
                        bed_path,
                        sample_id_col,
                        normalize_bigwig,
                        remove_duplicates,
                        split_pattern,
                    )
                    for group in groups
                ],
                num_returns=len(groups),
            )
        finally:
            # Always release the ray cluster, even when dispatching fails.
            ray.shutdown()
    else:
        for group in groups:
            export_pseudobulk_one_sample(
                cell_data,
                group,
                fragments_df_dict,
                chromsizes,
                bigwig_path,
                bed_path,
                sample_id_col,
                normalize_bigwig,
                remove_duplicates,
                split_pattern,
            )

    return bw_paths, bed_paths


def export_pseudobulk_one_sample(
    cell_data: pd.DataFrame,
    group: str,
    fragments_df_dict: Dict[str, pd.DataFrame],
    chromsizes: pr.PyRanges,
    bigwig_path: str,
    bed_path: str,
    sample_id_col: Optional[str] = "sample_id",
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
):
    """
    Create pseudobulk as bed and bigwig from single cell fragments file given a barcode annotation and a group.
    Parameters
    ---------
    cell_data: pd.DataFrame
            A cell metadata :class:`pd.Dataframe` containing barcodes, their annotation and their sample of origin. The group
            annotation is read from the FIRST column; a 'barcode' column, when present, is used instead of the index names.
    group: str
            A character string indicating the group for which pseudobulks will be created.
    fragments_df_dict: dict
            A dictionary containing data frames as values with 'Chromosome', 'Start', 'End', 'Name', and 'Score' as columns; and sample label
            as keys. 'Score' indicates the number of times that a fragments is found assigned to that barcode.
    chromsizes: pr.PyRanges
            A :class:`pr.PyRanges` containing size of each chromosome, containing 'Chromosome', 'Start' and 'End' columns.
            Fragments on chromosomes not listed here are discarded.
    bigwig_path: str
            Path to folder where the bigwig file will be saved. If not a string, no bigwig is written.
    bed_path: str
            Path to folder where the fragments bed file will be saved. If not a string, no bed file is written.
    sample_id_col: str, optional
            Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'.
    normalize_bigwig: bool, optional
            Whether bigwig files should be CPM normalized (passed as `rpm` to `to_bigwig`). Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig. When False, the 'Score' column is
            passed as `value_col` to `to_bigwig`; when True no value column is passed.
    split_pattern: str
            Pattern to split cell barcode from sample id. Default: ___ .
    Raises
    ------
    ValueError
            If `fragments_df_dict` is empty.
    """
    # Create logger
    level = logging.INFO
    log_format = "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger("cisTopic")

    log.info("Creating pseudobulk for " + str(group))
    if not fragments_df_dict:
        raise ValueError("fragments_df_dict is empty: no fragments to export.")

    # Collect, per sample, the fragments whose barcode belongs to `group`.
    group_fragments_list = []
    for sample_id, fragments_df in fragments_df_dict.items():
        sample_data = cell_data[cell_data.loc[:, sample_id_col].isin([sample_id])]
        if "barcode" in sample_data:
            sample_data.index = sample_data["barcode"].tolist()
        else:
            sample_data.index = prepare_tag_cells(
                sample_data.index.tolist(), split_pattern
            )
        # Group labels are taken positionally from the first metadata column.
        group_var = sample_data.iloc[:, 0]
        barcodes = group_var[group_var.isin([group])].index.tolist()
        group_fragments_list.append(
            fragments_df.loc[fragments_df["Name"].isin(barcodes)]
        )

    if len(group_fragments_list) == 1:
        group_fragments = group_fragments_list[0]
    else:
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        group_fragments = pd.concat(group_fragments_list)

    group_fragments = group_fragments[
        group_fragments["Chromosome"].isin(chromsizes.Chromosome)
    ]

    # Drop local references to the per-sample frames before the export.
    del group_fragments_list
    del fragments_df
    gc.collect()

    group_pr = pr.PyRanges(group_fragments)
    if isinstance(bigwig_path, str):
        bigwig_path_group = os.path.join(bigwig_path, str(group) + ".bw")
        bigwig_kwargs = {
            "path": bigwig_path_group,
            "chromosome_sizes": chromsizes,
            "rpm": normalize_bigwig,
        }
        if not remove_duplicates:
            # Weight each fragment by its duplicate count.
            bigwig_kwargs["value_col"] = "Score"
        group_pr.to_bigwig(**bigwig_kwargs)
    if isinstance(bed_path, str):
        bed_path_group = os.path.join(bed_path, str(group) + ".bed.gz")
        group_pr.to_bed(
            path=bed_path_group, keep=False, compression="infer", chain=False
        )

    log.info(str(group) + " done!")


@ray.remote
def export_pseudobulk_ray(
    cell_data: pd.DataFrame,
    group: str,
    fragments_df_dict: Dict[str, pd.DataFrame],
    chromsizes: pr.PyRanges,
    bigwig_path: str,
    bed_path: str,
    sample_id_col: Optional[str] = "sample_id",
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
):
    """
    Ray remote wrapper around :func:`export_pseudobulk_one_sample`: every argument is forwarded unchanged.
    Parameters
    ---------
    cell_data: pd.DataFrame
            A cell metadata :class:`pd.Dataframe` containing barcodes, their annotation and their sample of origin.
    group: str
            A character string indicating the group for which pseudobulks will be created.
    fragments_df_dict: dict
            A dictionary containing data frames as values with 'Chromosome', 'Start', 'End', 'Name', and 'Score' as columns; and sample label
            as keys. 'Score' indicates the number of times that a fragments is found assigned to that barcode.
    chromsizes: pr.PyRanges
            A :class:`pr.PyRanges` containing 'Chromosome', 'Start' and 'End' columns.
    bigwig_path: str
            Path to folder where the bigwig file will be saved.
    bed_path: str
            Path to folder where the fragments bed file will be saved.
    sample_id_col: str, optional
            Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'.
    normalize_bigwig: bool, optional
            Whether bigwig files should be CPM normalized. Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig.
    split_pattern: str
            Pattern to split cell barcode from sample id. Default: ___ .
    """
    export_pseudobulk_one_sample(
        cell_data=cell_data,
        group=group,
        fragments_df_dict=fragments_df_dict,
        chromsizes=chromsizes,
        bigwig_path=bigwig_path,
        bed_path=bed_path,
        sample_id_col=sample_id_col,
        normalize_bigwig=normalize_bigwig,
        remove_duplicates=remove_duplicates,
        split_pattern=split_pattern,
    )

In [7]:
# Export pseudobulk bed/bigwig files per consensus cell type for every sample.
# Results are collected per sample in the two dictionaries below.
bw_paths_dict = {}
bed_paths_dict = {}

import ray

if ray.is_initialized():
    print("Shutting down Ray")
    ray.shutdown()

for sample in cto_consensus_path_dict.keys():
    bed_path = os.path.join(
        "SCREEN_peaks", f"{sample}__SCREEN_pseudobulk_bed_files"
    )
    bw_path = os.path.join(
        "SCREEN_peaks", f"{sample}__SCREEN_pseudobulk_bw_files"
    )
    # Only the bed output folder is used as the "already done" marker.
    if os.path.exists(bed_path):
        print(f"{bed_path} exists, skipping...")
        continue

    cto_path = cto_consensus_path_dict[sample]
    # Close the pickle as soon as it is loaded rather than holding the handle
    # open for the whole export.
    with open(cto_path, "rb") as f:
        cto = pickle.load(f)

    try:
        # path_to_fragments is ignored for a CistopicObject input: the paths
        # stored in the object are used instead.
        bw_paths, bed_paths = export_pseudobulk(
            input_data=cto,
            variable="consensus_cell_type",
            sample_id_col="sample_id",
            chromsizes=chromsizes,
            bed_path=bed_path,
            bigwig_path=bw_path,
            path_to_fragments=fragments_path_dict[sample],
            n_cpu=16,
            normalize_bigwig=True,
            remove_duplicates=True,
        )
    finally:
        # Make sure a failed export does not leave ray running for the next sample.
        if ray.is_initialized():
            print("Shutting down Ray")
            ray.shutdown()

    bw_paths_dict[sample] = bw_paths
    bed_paths_dict[sample] = bed_paths

SCREEN_peaks/BIO_ddseq_1.FULL__SCREEN_pseudobulk_bed_files exists, skipping...
SCREEN_peaks/BIO_ddseq_2.FULL__SCREEN_pseudobulk_bed_files exists, skipping...
SCREEN_peaks/BIO_ddseq_3.FULL__SCREEN_pseudobulk_bed_files exists, skipping...
SCREEN_peaks/BIO_ddseq_4.FULL__SCREEN_pseudobulk_bed_files exists, skipping...
SCREEN_peaks/BRO_mtscatac_1.FULL__SCREEN_pseudobulk_bed_files exists, skipping...
SCREEN_peaks/BRO_mtscatac_2.FULL__SCREEN_pseudobulk_bed_files exists, skipping...
SCREEN_peaks/CNA_10xmultiome_1.FULL__SCREEN_pseudobulk_bed_files exists, skipping...
SCREEN_peaks/CNA_10xmultiome_2.FULL__SCREEN_pseudobulk_bed_files exists, skipping...
SCREEN_peaks/CNA_10xv11_1.FULL__SCREEN_pseudobulk_bed_files exists, skipping...
SCREEN_peaks/CNA_10xv11_2.FULL__SCREEN_pseudobulk_bed_files exists, skipping...
SCREEN_peaks/CNA_10xv11_3.FULL__SCREEN_pseudobulk_bed_files exists, skipping...
SCREEN_peaks/CNA_10xv11_4.FULL__SCREEN_pseudobulk_bed_files exists, skipping...
SCREEN_peaks/CNA_10xv11_5.FULL

# Peak calling on consensus cell type pseudobulks

In [8]:
# Map each sample name (file-name prefix before "__") to the path of its
# exported consensus cell metadata table.
cell_data_path_dict = dict(
    (table_path.split("/")[-1].split("__")[0], table_path)
    for table_path in sorted(
        glob.glob("cistopic_objects/*consensus.cell_data.tsv")
    )
)
cell_data_path_dict

{'BIO_ddseq_1.FULL': 'cistopic_objects/BIO_ddseq_1.FULL__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.cell_data.tsv',
 'BIO_ddseq_2.FULL': 'cistopic_objects/BIO_ddseq_2.FULL__cto.scrublet0-4.fmx.singlets.model_9topics.dimreduc.consensus.cell_data.tsv',
 'BIO_ddseq_3.FULL': 'cistopic_objects/BIO_ddseq_3.FULL__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.cell_data.tsv',
 'BIO_ddseq_4.FULL': 'cistopic_objects/BIO_ddseq_4.FULL__cto.scrublet0-4.fmx.singlets.model_9topics.dimreduc.consensus.cell_data.tsv',
 'BRO_mtscatac_1.FULL': 'cistopic_objects/BRO_mtscatac_1.FULL__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.cell_data.tsv',
 'BRO_mtscatac_2.FULL': 'cistopic_objects/BRO_mtscatac_2.FULL__cto.scrublet0-4.fmx.singlets.model_10topics.dimreduc.consensus.cell_data.tsv',
 'CNA_10xmultiome_1.FULL': 'cistopic_objects/CNA_10xmultiome_1.FULL__cto.scrublet0-4.fmx.singlets.model_12topics.dimreduc.consensus.cell_data.tsv',
 'CNA_10xmultiome_2.FULL':

In [9]:
# Map each sample name (folder-name prefix before "__") to its pseudobulk
# bigwig output folder.
bw_path_dict = dict(
    (bw_dir.split("/")[-1].split("__")[0], bw_dir)
    for bw_dir in sorted(glob.glob("SCREEN_peaks/*_pseudobulk_bw_files"))
)
bw_path_dict

{'BIO_ddseq_1.FULL': 'SCREEN_peaks/BIO_ddseq_1.FULL__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_2.FULL': 'SCREEN_peaks/BIO_ddseq_2.FULL__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_3.FULL': 'SCREEN_peaks/BIO_ddseq_3.FULL__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_4.FULL': 'SCREEN_peaks/BIO_ddseq_4.FULL__SCREEN_pseudobulk_bw_files',
 'BRO_mtscatac_1.FULL': 'SCREEN_peaks/BRO_mtscatac_1.FULL__SCREEN_pseudobulk_bw_files',
 'BRO_mtscatac_2.FULL': 'SCREEN_peaks/BRO_mtscatac_2.FULL__SCREEN_pseudobulk_bw_files',
 'CNA_10xmultiome_1.FULL': 'SCREEN_peaks/CNA_10xmultiome_1.FULL__SCREEN_pseudobulk_bw_files',
 'CNA_10xmultiome_2.FULL': 'SCREEN_peaks/CNA_10xmultiome_2.FULL__SCREEN_pseudobulk_bw_files',
 'CNA_10xv11_1.FULL': 'SCREEN_peaks/CNA_10xv11_1.FULL__SCREEN_pseudobulk_bw_files',
 'CNA_10xv11_2.FULL': 'SCREEN_peaks/CNA_10xv11_2.FULL__SCREEN_pseudobulk_bw_files',
 'CNA_10xv11_3.FULL': 'SCREEN_peaks/CNA_10xv11_3.FULL__SCREEN_pseudobulk_bw_files',
 'CNA_10xv11_4.FULL': 'SCREEN_peaks/CNA_10xv11_4.FULL__S

In [10]:
# Map each sample name (folder-name prefix before "__") to its pseudobulk
# bed output folder.
bed_path_dict = dict(
    (bed_dir.split("/")[-1].split("__")[0], bed_dir)
    for bed_dir in sorted(glob.glob("SCREEN_peaks/*_pseudobulk_bed_files"))
)
bed_path_dict

{'BIO_ddseq_1.FULL': 'SCREEN_peaks/BIO_ddseq_1.FULL__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_2.FULL': 'SCREEN_peaks/BIO_ddseq_2.FULL__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_3.FULL': 'SCREEN_peaks/BIO_ddseq_3.FULL__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_4.FULL': 'SCREEN_peaks/BIO_ddseq_4.FULL__SCREEN_pseudobulk_bed_files',
 'BRO_mtscatac_1.FULL': 'SCREEN_peaks/BRO_mtscatac_1.FULL__SCREEN_pseudobulk_bed_files',
 'BRO_mtscatac_2.FULL': 'SCREEN_peaks/BRO_mtscatac_2.FULL__SCREEN_pseudobulk_bed_files',
 'CNA_10xmultiome_1.FULL': 'SCREEN_peaks/CNA_10xmultiome_1.FULL__SCREEN_pseudobulk_bed_files',
 'CNA_10xmultiome_2.FULL': 'SCREEN_peaks/CNA_10xmultiome_2.FULL__SCREEN_pseudobulk_bed_files',
 'CNA_10xv11_1.FULL': 'SCREEN_peaks/CNA_10xv11_1.FULL__SCREEN_pseudobulk_bed_files',
 'CNA_10xv11_2.FULL': 'SCREEN_peaks/CNA_10xv11_2.FULL__SCREEN_pseudobulk_bed_files',
 'CNA_10xv11_3.FULL': 'SCREEN_peaks/CNA_10xv11_3.FULL__SCREEN_pseudobulk_bed_files',
 'CNA_10xv11_4.FULL': 'SCREEN_peaks/CNA_10xv1

In [11]:
# Build, per sample, a mapping from bed file stem to bed file path.
# NOTE: `bed_paths` is overwritten on every iteration, so only the mapping of
# the last sample is left after the loop.
for sample in bed_path_dict.keys():
    bed_paths = dict(
        (bed_file.split("/")[-1].split("__")[0].split(".bed.gz")[0], bed_file)
        for bed_file in glob.glob(bed_path_dict[sample] + "/*")
    )

In [12]:
from pycisTopic.pseudobulk_peak_calling import *

In [13]:
import ray

In [14]:
# Call peaks with MACS2 on the pseudobulk bed files of every sample and pickle
# the resulting dictionary next to the bed folder. Samples whose pickle already
# exists are skipped; samples whose bed files do not cover exactly the cell
# types found in the cell metadata are reported and skipped.
narrow_peaks_dict = {}
ray.shutdown()
for sample in bed_path_dict.keys():
    narrow_peaks_dict_path = bed_path_dict[sample].replace(
        "_pseudobulk_bed_files", "_narrow_peaks_dict.pkl"
    )
    peak_path = os.path.join("SCREEN_peaks", f"{sample}__SCREEN_consensus_peaks")
    if not os.path.exists(peak_path):
        os.mkdir(peak_path)

    if os.path.exists(narrow_peaks_dict_path):
        print(f"{narrow_peaks_dict_path} already exists")
        continue

    # Cell type labels from the metadata, stripped of spaces, "+" and "_" so
    # they can be compared with the (sanitised) bed file names.
    cell_data = pd.read_csv(cell_data_path_dict[sample])
    cto_celltypes = {
        x.replace(" ", "").replace("+", "").replace("_", "")
        for x in cell_data["consensus_cell_type"].unique()
    }

    # Cell type labels recovered from the bed file names (text before the first ".").
    bed_celltypes = {
        x.split(".")[0].replace("+", "").replace("_", "")
        for x in os.listdir(bed_path_dict[sample])
    }

    if cto_celltypes != bed_celltypes:
        print(f"{sample} cell types not matching!! Rerun bed file writing.")
        print(f"\t{bed_celltypes}")
        print(f"\t{cto_celltypes}")
        continue

    print(f"Starting {narrow_peaks_dict_path}")
    bed_paths = dict(
        (bed_file.split("/")[-1].split("__")[0].split(".bed.gz")[0], bed_file)
        for bed_file in glob.glob(bed_path_dict[sample] + "/*")
    )
    narrow_peaks_dict = peak_calling(
        macs_path="macs2",
        bed_paths=bed_paths,
        outdir=peak_path,
        genome_size="hs",
        n_cpu=20,
        input_format="BEDPE",
        shift=73,
        ext_size=146,
        keep_dup="all",
        q_value=0.05,
    )
    with open(narrow_peaks_dict_path, "wb") as f:
        pickle.dump(narrow_peaks_dict, f)

SCREEN_peaks/BIO_ddseq_1.FULL__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/BIO_ddseq_2.FULL__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/BIO_ddseq_3.FULL__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/BIO_ddseq_4.FULL__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/BRO_mtscatac_1.FULL__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/BRO_mtscatac_2.FULL__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/CNA_10xmultiome_1.FULL__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/CNA_10xmultiome_2.FULL__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/CNA_10xv11_1.FULL__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/CNA_10xv11_2.FULL__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/CNA_10xv11_3.FULL__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/CNA_10xv11_4.FULL__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/CNA_10xv11_5.FULL__SCREEN_narrow_peaks_dict.pkl already exists
SC

2022-09-26 14:12:00,575	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(macs_call_peak_ray pid=2444689)[0m 2022-09-26 14:12:28,651 cisTopic     INFO     Calling peaks for CytotoxicTcell with macs2 callpeak --treatment SCREEN_peaks/OHS_s3atac_1.FULL__SCREEN_pseudobulk_bed_files/CytotoxicTcell.bed.gz --name CytotoxicTcell  --outdir SCREEN_peaks/OHS_s3atac_1.FULL__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=2444688)[0m 2022-09-26 14:12:28,654 cisTopic     INFO     Calling peaks for CD14_monocyte with macs2 callpeak --treatment SCREEN_peaks/OHS_s3atac_1.FULL__SCREEN_pseudobulk_bed_files/CD14_monocyte.bed.gz --name CD14_monocyte  --outdir SCREEN_peaks/OHS_s3atac_1.FULL__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=2444690)[0m 2022-09-26 14:12:28,673 cisTopic     INFO     Calling peaks for Naturalki

2022-09-26 14:20:28,433	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(macs_call_peak_ray pid=2445776)[0m 2022-09-26 14:20:54,580 cisTopic     INFO     Calling peaks for CD4_Tcell with macs2 callpeak --treatment SCREEN_peaks/OHS_s3atac_2.FULL__SCREEN_pseudobulk_bed_files/CD4_Tcell.bed.gz --name CD4_Tcell  --outdir SCREEN_peaks/OHS_s3atac_2.FULL__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=2445774)[0m 2022-09-26 14:20:54,594 cisTopic     INFO     Calling peaks for CD14_monocyte with macs2 callpeak --treatment SCREEN_peaks/OHS_s3atac_2.FULL__SCREEN_pseudobulk_bed_files/CD14_monocyte.bed.gz --name CD14_monocyte  --outdir SCREEN_peaks/OHS_s3atac_2.FULL__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=2445772)[0m 2022-09-26 14:20:54,590 cisTopic     INFO     Calling peaks for Naturalkillercell with m

2022-09-26 14:32:28,012	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(macs_call_peak_ray pid=2446976)[0m 2022-09-26 14:32:55,933 cisTopic     INFO     Calling peaks for CytotoxicTcell with macs2 callpeak --treatment SCREEN_peaks/UCS_ddseq_1.FULL__SCREEN_pseudobulk_bed_files/CytotoxicTcell.bed.gz --name CytotoxicTcell  --outdir SCREEN_peaks/UCS_ddseq_1.FULL__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=2446979)[0m 2022-09-26 14:32:55,919 cisTopic     INFO     Calling peaks for CD4_Tcell with macs2 callpeak --treatment SCREEN_peaks/UCS_ddseq_1.FULL__SCREEN_pseudobulk_bed_files/CD4_Tcell.bed.gz --name CD4_Tcell  --outdir SCREEN_peaks/UCS_ddseq_1.FULL__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=2446980)[0m 2022-09-26 14:32:55,923 cisTopic     INFO     Calling peaks for Bcell with macs2 callpeak

2022-09-26 14:38:11,564	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(macs_call_peak_ray pid=2448007)[0m 2022-09-26 14:38:39,414 cisTopic     INFO     Calling peaks for CD14_monocyte with macs2 callpeak --treatment SCREEN_peaks/UCS_ddseq_2.FULL__SCREEN_pseudobulk_bed_files/CD14_monocyte.bed.gz --name CD14_monocyte  --outdir SCREEN_peaks/UCS_ddseq_2.FULL__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=2448010)[0m 2022-09-26 14:38:39,428 cisTopic     INFO     Calling peaks for CD4_Tcell with macs2 callpeak --treatment SCREEN_peaks/UCS_ddseq_2.FULL__SCREEN_pseudobulk_bed_files/CD4_Tcell.bed.gz --name CD4_Tcell  --outdir SCREEN_peaks/UCS_ddseq_2.FULL__SCREEN_consensus_peaks --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=2448009)[0m 2022-09-26 14:38:39,427 cisTopic     INFO     Calling peaks for Naturalkillercell with macs2

# Call consensus peaks

In [15]:
from pycisTopic.iterative_peak_calling import get_consensus_peaks

In [16]:
# Map each sample id (the file-name part before the first "__") to the path of
# its pickled narrow-peaks dictionary, in sorted path order.
narrow_peaks_pkl_paths = sorted(glob.glob("SCREEN_peaks/*_narrow_peaks_dict.pkl"))
narrow_peaks_path_dict = {
    pkl_path.rpartition("/")[2].partition("__")[0]: pkl_path
    for pkl_path in narrow_peaks_pkl_paths
}
narrow_peaks_path_dict

{'BIO_ddseq_1.FULL': 'SCREEN_peaks/BIO_ddseq_1.FULL__SCREEN_narrow_peaks_dict.pkl',
 'BIO_ddseq_2.FULL': 'SCREEN_peaks/BIO_ddseq_2.FULL__SCREEN_narrow_peaks_dict.pkl',
 'BIO_ddseq_3.FULL': 'SCREEN_peaks/BIO_ddseq_3.FULL__SCREEN_narrow_peaks_dict.pkl',
 'BIO_ddseq_4.FULL': 'SCREEN_peaks/BIO_ddseq_4.FULL__SCREEN_narrow_peaks_dict.pkl',
 'BRO_mtscatac_1.FULL': 'SCREEN_peaks/BRO_mtscatac_1.FULL__SCREEN_narrow_peaks_dict.pkl',
 'BRO_mtscatac_2.FULL': 'SCREEN_peaks/BRO_mtscatac_2.FULL__SCREEN_narrow_peaks_dict.pkl',
 'CNA_10xmultiome_1.FULL': 'SCREEN_peaks/CNA_10xmultiome_1.FULL__SCREEN_narrow_peaks_dict.pkl',
 'CNA_10xmultiome_2.FULL': 'SCREEN_peaks/CNA_10xmultiome_2.FULL__SCREEN_narrow_peaks_dict.pkl',
 'CNA_10xv11_1.FULL': 'SCREEN_peaks/CNA_10xv11_1.FULL__SCREEN_narrow_peaks_dict.pkl',
 'CNA_10xv11_2.FULL': 'SCREEN_peaks/CNA_10xv11_2.FULL__SCREEN_narrow_peaks_dict.pkl',
 'CNA_10xv11_3.FULL': 'SCREEN_peaks/CNA_10xv11_3.FULL__SCREEN_narrow_peaks_dict.pkl',
 'CNA_10xv11_4.FULL': 'SCREEN_peak

In [17]:
# Relative path to the hg38 blacklist (v2) BED file; handed to
# get_consensus_peaks via its `path_to_blacklist` argument.
path_to_blacklist = "../0_resources/regions/hg38-blacklist.v2.bed"

In [18]:
peak_half_width = 250


def _normalize_celltype(label):
    """Return `label` with every space, "+" and "_" removed.

    Used so that cell type names from the cell metadata and from the
    narrow-peaks dictionary can be compared regardless of these characters.
    """
    return label.replace(" ", "").replace("+", "").replace("_", "")


# Get consensus peaks
consensus_peaks_dict = {}
for sample, narrow_peaks_path in narrow_peaks_path_dict.items():
    print(sample)
    consensus_out_path = narrow_peaks_path.replace(
        "_narrow_peaks_dict.pkl", "_consensus_peaks.bed"
    )

    # The BED file on disk acts as a cache: never recompute an existing result.
    if os.path.exists(consensus_out_path):
        print(f"{consensus_out_path} already exists, skipping...")
        continue

    # Cell types annotated in the sample's cell metadata table.
    cell_data = pd.read_csv(cell_data_path_dict[sample])
    cto_celltypes = {
        _normalize_celltype(celltype)
        for celltype in cell_data["consensus_cell_type"].unique()
    }

    # Cell types for which narrow peaks were actually called.
    # NOTE(review): pickle.load can execute arbitrary code; only load pickles
    # produced by this pipeline.
    with open(narrow_peaks_path, "rb") as f:
        narrow_peaks_dict = pickle.load(f)
    peaks_celltypes = {_normalize_celltype(celltype) for celltype in narrow_peaks_dict}

    # Only build consensus peaks when both sources agree on the cell types;
    # otherwise report the differences in both directions and move on.
    if cto_celltypes != peaks_celltypes:
        print("CELL TYPE SETS NOT MATCHING! Rerun peak calling.")
        print(peaks_celltypes - cto_celltypes)
        print(cto_celltypes - peaks_celltypes)
        continue

    consensus_peaks = get_consensus_peaks(
        narrow_peaks_dict,
        peak_half_width,
        chromsizes=chromsizes,
        path_to_blacklist=path_to_blacklist,
    )
    consensus_peaks.to_bed(
        path=consensus_out_path, keep=True, compression="infer", chain=False
    )

BIO_ddseq_1.FULL
SCREEN_peaks/BIO_ddseq_1.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
BIO_ddseq_2.FULL
SCREEN_peaks/BIO_ddseq_2.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
BIO_ddseq_3.FULL
SCREEN_peaks/BIO_ddseq_3.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
BIO_ddseq_4.FULL
SCREEN_peaks/BIO_ddseq_4.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
BRO_mtscatac_1.FULL
SCREEN_peaks/BRO_mtscatac_1.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
BRO_mtscatac_2.FULL
SCREEN_peaks/BRO_mtscatac_2.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
CNA_10xmultiome_1.FULL
SCREEN_peaks/CNA_10xmultiome_1.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
CNA_10xmultiome_2.FULL
SCREEN_peaks/CNA_10xmultiome_2.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
CNA_10xv11_1.FULL
SCREEN_peaks/CNA_10xv11_1.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
CNA_10xv11_2.FULL
SCREEN_peaks/CNA_

  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.co

OHS_s3atac_2.FULL
2022-09-26 14:45:24,541 cisTopic     INFO     Extending and merging peaks per class
2022-09-26 14:46:57,886 cisTopic     INFO     Normalizing peak scores
2022-09-26 14:46:58,230 cisTopic     INFO     Merging peaks
2022-09-26 14:48:46,695 cisTopic     INFO     Done!


  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.co

SAN_10xmultiome_1.FULL
SCREEN_peaks/SAN_10xmultiome_1.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
SAN_10xmultiome_2.FULL
SCREEN_peaks/SAN_10xmultiome_2.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
STA_10xv11_1.FULL
SCREEN_peaks/STA_10xv11_1.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
STA_10xv11_2.FULL
SCREEN_peaks/STA_10xv11_2.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
TXG_10xv11_1.FULL
SCREEN_peaks/TXG_10xv11_1.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
TXG_10xv2_1.FULL
SCREEN_peaks/TXG_10xv2_1.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
TXG_10xv2_2.FULL
SCREEN_peaks/TXG_10xv2_2.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
UCS_ddseq_1.FULL
2022-09-26 14:48:48,926 cisTopic     INFO     Extending and merging peaks per class
2022-09-26 14:49:27,981 cisTopic     INFO     Normalizing peak scores
2022-09-26 14:49:28,125 cisTopic     INFO     Merging peaks
2022-09-26 14:50:0

  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.co

UCS_ddseq_2.FULL
2022-09-26 14:50:01,923 cisTopic     INFO     Extending and merging peaks per class
2022-09-26 14:50:41,748 cisTopic     INFO     Normalizing peak scores
2022-09-26 14:50:41,909 cisTopic     INFO     Merging peaks
2022-09-26 14:51:17,313 cisTopic     INFO     Done!


  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.concat([outdf, df.get(noncanonical)], axis=1)
  return pd.co

VIB_10xmultiome_1.FULL
SCREEN_peaks/VIB_10xmultiome_1.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
VIB_10xmultiome_2.FULL
SCREEN_peaks/VIB_10xmultiome_2.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
VIB_10xv1_1.FULL
SCREEN_peaks/VIB_10xv1_1.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
VIB_10xv1_2.FULL
SCREEN_peaks/VIB_10xv1_2.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
VIB_10xv2_1.FULL
SCREEN_peaks/VIB_10xv2_1.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
VIB_10xv2_2.FULL
SCREEN_peaks/VIB_10xv2_2.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
VIB_hydrop_1.FULL
SCREEN_peaks/VIB_hydrop_1.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
VIB_hydrop_11.FULL
SCREEN_peaks/VIB_hydrop_11.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
VIB_hydrop_12.FULL
SCREEN_peaks/VIB_hydrop_12.FULL__SCREEN_consensus_peaks.bed already exists, skipping...
VIB_hydrop_2.FULL
SCREEN_peaks/VIB_hydr

# Check % of consensus peaks on chrM and non-standard contigs

In [19]:
# Map each sample id (the file-name part before the first "__") to the path of
# its consensus peaks BED file, in sorted path order.
consensus_bed_paths = sorted(glob.glob("SCREEN_peaks/*consensus_peaks.bed"))
consensus_peaks_path_dict = {
    bed_path.rpartition("/")[2].partition("__")[0]: bed_path
    for bed_path in consensus_bed_paths
}
consensus_peaks_path_dict

{'BIO_ddseq_1.FULL': 'SCREEN_peaks/BIO_ddseq_1.FULL__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_2.FULL': 'SCREEN_peaks/BIO_ddseq_2.FULL__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_3.FULL': 'SCREEN_peaks/BIO_ddseq_3.FULL__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_4.FULL': 'SCREEN_peaks/BIO_ddseq_4.FULL__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_1.FULL': 'SCREEN_peaks/BRO_mtscatac_1.FULL__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_2.FULL': 'SCREEN_peaks/BRO_mtscatac_2.FULL__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_1.FULL': 'SCREEN_peaks/CNA_10xmultiome_1.FULL__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_2.FULL': 'SCREEN_peaks/CNA_10xmultiome_2.FULL__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_1.FULL': 'SCREEN_peaks/CNA_10xv11_1.FULL__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_2.FULL': 'SCREEN_peaks/CNA_10xv11_2.FULL__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_3.FULL': 'SCREEN_peaks/CNA_10xv11_3.FULL__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_4.FULL': 'SCREEN_peaks/CNA_10xv11_4.FULL__S

In [20]:
# Chromosomes reported as "standard": chr1..chr22 plus chrX. This list does
# not depend on the sample, so it is built once outside the loop.
chroms_standard = ["chr" + str(x + 1) for x in range(22)] + ["chrX"]

# For every sample, print how many consensus peaks lie on standard
# chromosomes, on any other contig, on chrM and on chrY.
for sample, path in consensus_peaks_path_dict.items():
    print(sample)
    peaks_df = pd.read_csv(path, sep="\t", header=None)

    # Column 0 of the header-less BED file is the chromosome name. Count the
    # peaks per chromosome once and reuse the result below.
    chrom_counts = peaks_df[0].value_counts()
    chroms_in_df = sorted(chrom_counts.index)
    # Everything that is neither standard nor chrM. chrY is not in
    # chroms_standard, so chrY peaks are included in this "contigs" count.
    chroms_nonstandard = list(set(chroms_in_df) - set(chroms_standard) - {"chrM"})

    # Plain label indexing raises KeyError when a chromosome has no peaks in
    # this sample; reindex(..., fill_value=0) and .get(..., 0) count it as 0.
    n_standard = chrom_counts.reindex(chroms_standard, fill_value=0).sum()
    n_contigs = chrom_counts[chroms_nonstandard].sum()
    n_chrm = chrom_counts.get("chrM", 0)
    n_chrY = chrom_counts.get("chrY", 0)
    pct_nonstandard = (n_contigs + n_chrm) / len(peaks_df) * 100

    print(f"\tpeaks on standard chromosomes: {n_standard}")
    print(f"\tpeaks on contigs: {n_contigs}")
    print(f"\tpeaks on chrM: {n_chrm}")
    print(f"\t% peaks non standard chromosomes: {pct_nonstandard}%")
    print(f"\tpeaks on chrY: {n_chrY}")

BIO_ddseq_1.FULL
	peaks on standard chromosomes: 165715
	peaks on contigs: 242
	peaks on chrM: 24
	% peaks non standard chromosomes: 0.16025930678812636%
	peaks on chrY: 0
BIO_ddseq_2.FULL
	peaks on standard chromosomes: 153883
	peaks on contigs: 81
	peaks on chrM: 22
	% peaks non standard chromosomes: 0.06688919771927318%
	peaks on chrY: 0
BIO_ddseq_3.FULL
	peaks on standard chromosomes: 191169
	peaks on contigs: 290
	peaks on chrM: 23
	% peaks non standard chromosomes: 0.1634618397551728%
	peaks on chrY: 207
BIO_ddseq_4.FULL
	peaks on standard chromosomes: 174118
	peaks on contigs: 264
	peaks on chrM: 23
	% peaks non standard chromosomes: 0.1645595023078467%
	peaks on chrY: 191
BRO_mtscatac_1.FULL
	peaks on standard chromosomes: 203455
	peaks on contigs: 257
	peaks on chrM: 21
	% peaks non standard chromosomes: 0.136453102835574%
	peaks on chrY: 148
BRO_mtscatac_2.FULL
	peaks on standard chromosomes: 202202
	peaks on contigs: 251
	peaks on chrM: 20
	% peaks non standard chromosomes: 