In [1]:
from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling
import pyranges as pr
import requests
import os
import pandas as pd
import glob
import pickle

In [2]:
%load_ext lab_black

In [3]:
# Chromosome sizes for hg38: fetched from UCSC on first use and cached in "chromsizes.txt".
# NOTE: the download branch leaves `chromsizes` as a pr.PyRanges, whereas the cached branch
# leaves it as a plain pd.DataFrame read back from the CSV.
if os.path.exists("chromsizes.txt"):
    chromsizes = pd.read_csv("chromsizes.txt")
else:
    target_url = (
        "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes"
    )
    chromsizes = pd.read_csv(
        target_url, sep="\t", header=None, names=["Chromosome", "End"]
    )
    # Every chromosome interval starts at 0; "Start" goes between the two downloaded columns.
    chromsizes.insert(1, "Start", [0] * chromsizes.shape[0])
    chromsizes = pr.PyRanges(chromsizes)
    chromsizes.to_csv("chromsizes.txt")

In [4]:
# Pickled consensus cisTopic objects, keyed by "<text before '__'>.<5th-from-last dot token>"
# of the file name (e.g. "BIO_ddseq_1.FIXEDCELLS.05k").
cto_consensus_paths = sorted(glob.glob("cistopic_objects_subsampled/*consensus.pkl"))
cto_consensus_path_dict = {
    f"{fname.split('__')[0]}.{fname.split('.')[-5]}": path
    for path in cto_consensus_paths
    for fname in [path.rsplit("/", 1)[-1]]  # bind the base name once per path
}
cto_consensus_path_dict

{'BIO_ddseq_1.FIXEDCELLS.05k': 'cistopic_objects_subsampled/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.05k.model_10topics.dimreduc.consensus.pkl',
 'BIO_ddseq_1.FIXEDCELLS.15k': 'cistopic_objects_subsampled/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.15k.model_11topics.dimreduc.consensus.pkl',
 'BIO_ddseq_1.FIXEDCELLS.1k': 'cistopic_objects_subsampled/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.1k.model_10topics.dimreduc.consensus.pkl',
 'BIO_ddseq_1.FIXEDCELLS.25k': 'cistopic_objects_subsampled/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.25k.model_9topics.dimreduc.consensus.pkl',
 'BIO_ddseq_1.FIXEDCELLS.2k': 'cistopic_objects_subsampled/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.2k.model_12topics.dimreduc.consensus.pkl',
 'BIO_ddseq_1.FIXEDCELLS.3k': 'cistopic_objects_subsampled/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.3k.model_11topics.dimreduc.consensus.pkl',
 'BIO_ddseq_2.FIXEDCELLS.05k': 'cistopic_objects_subsampled/BIO_d

In [5]:
# Fragments files keyed by the file name with its ".fragments.tsv.gz" suffix stripped
# (e.g. "BIO_ddseq_1.FIXEDCELLS").
fragments_path_dict = {
    path.rsplit("/", 1)[-1].split(".fragments.tsv.gz")[0]: path
    for path in sorted(glob.glob("../1_data_repository/fixedcells_fragments/*.tsv.gz"))
}
fragments_path_dict

{'BIO_ddseq_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_1.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_2.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_3.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_3.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_4.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_4.FIXEDCELLS.fragments.tsv.gz',
 'BRO_mtscatac_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BRO_mtscatac_1.FIXEDCELLS.fragments.tsv.gz',
 'BRO_mtscatac_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BRO_mtscatac_2.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xmultiome_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xmultiome_1.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xmultiome_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xmultiome_2.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xv11_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragmen

In [6]:
import gc
import logging
import os
import re
import subprocess
import sys
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd
import pyBigWig
import pyranges as pr
import ray

from pycisTopic.cistopic_class import *
from pycisTopic.utils import *


def export_pseudobulk(
    input_data: Union["CistopicObject", pd.DataFrame, Dict[str, pd.DataFrame]],
    variable: str,
    chromsizes: Union[pd.DataFrame, pr.PyRanges],
    bed_path: str,
    bigwig_path: str,
    path_to_fragments: Optional[Dict[str, str]] = None,
    sample_id_col: Optional[str] = "sample_id",
    n_cpu: Optional[int] = 1,
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
    use_polars: Optional[bool] = True,
    **kwargs
):
    """
    Create pseudobulks as bed and bigwig from single cell fragments file given a barcode annotation.
    Parameters
    ---------
    input_data: CistopicObject or pd.DataFrame
            A :class:`CistopicObject` containing the specified `variable` as a column in :class:`CistopicObject.cell_data` or a cell metadata
            :class:`pd.DataFrame` containing barcode as rows, containing the specified `variable` as a column (additional columns are
            possible) and a `sample_id` column. Index names must contain the BARCODE (e.g. ATGTCGTC-1), additional tags are possible separating with -
            (e.g. ATGCTGTGCG-1-Sample_1). The levels in the sample_id column must agree with the keys in the path_to_fragments dictionary.
            Alternatively, if the cell metadata contains a column named barcode it will be used instead of the index names.
    variable: str
            A character string indicating the column that will be used to create the different group pseudobulk. It must be included in
            the cell metadata provided as input_data.
    chromsizes: pd.DataFrame or pr.PyRanges
            A data frame or :class:`pr.PyRanges` containing size of each chromosome, containing 'Chromosome', 'Start' and 'End' columns.
    bed_path: str
            Path to folder where the fragments bed files per group will be saved. If None, files will not be generated.
    bigwig_path: str
            Path to folder where the bigwig files per group will be saved. If None, files will not be generated.
    path_to_fragments: dict, optional
            A dictionary of character strings, with sample name as names indicating the path to the fragments file/s from which pseudobulk profiles have to
            be created. If a :class:`CistopicObject` is provided as input it will be ignored (the object's own `path_to_fragments` is used), but if a
            cell metadata :class:`pd.DataFrame` is provided it is necessary to provide it. The keys of the dictionary need to match the values in
            the `sample_id_col` column of the cell metadata.
    sample_id_col: str, optional
            Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'.
    n_cpu: int, optional
            Number of cores to use. Values above 1 run one ray task per group. Default: 1.
    normalize_bigwig: bool, optional
            Whether bigwig files should be CPM normalized. Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig.
    split_pattern: str, optional
            Pattern to split cell barcode from sample id. Default: ___ .
    use_polars: bool, optional
            Whether to use polars to read fragments files. Default: True.
    **kwargs
            Additional parameters for ray.init()
    Return
    ------
    tuple
            `(bw_paths, bed_paths)`: two dictionaries keyed by group name holding the paths of the bigwig files and of the bed fragments
            files, respectively. A dictionary is empty when the corresponding output folder was not given as a string.
    Raises
    ------
    TypeError
            If `input_data` is neither a :class:`CistopicObject` nor a :class:`pd.DataFrame`.
    ValueError
            If no fragments paths are available.
    KeyError
            If `sample_id_col` or `variable` is not a column of the cell metadata.
    """
    # Create logger
    level = logging.INFO
    log_format = "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger("cisTopic")

    # Get cell metadata and fragments paths
    if isinstance(input_data, CistopicObject):
        path_to_fragments = input_data.path_to_fragments
        cell_data = input_data.cell_data
        missing_paths_msg = "No path_to_fragments in this cisTopic object."
    elif isinstance(input_data, pd.DataFrame):
        cell_data = input_data
        missing_paths_msg = "Please, provide path_to_fragments."
    else:
        raise TypeError(
            "input_data must be a CistopicObject or a pd.DataFrame, got "
            + type(input_data).__name__
        )
    if path_to_fragments is None:
        # Fail here with a clear message instead of crashing later on None.keys().
        log.error(missing_paths_msg)
        raise ValueError(missing_paths_msg)

    # Check the required metadata columns before the (expensive) fragments loading
    if sample_id_col not in cell_data:
        missing_col_msg = 'Please, include a sample identification column (e.g. "sample_id") in your cell metadata!'
        log.error(missing_col_msg)
        raise KeyError(missing_col_msg)
    if variable not in cell_data:
        missing_var_msg = "Column '" + str(variable) + "' not found in the cell metadata."
        log.error(missing_var_msg)
        raise KeyError(missing_var_msg)
    sample_ids = set(cell_data[sample_id_col])

    # Barcodes to keep are the same for every sample, so compute them once.
    if "barcode" in cell_data:
        selected_barcodes = cell_data["barcode"].tolist()
    else:
        selected_barcodes = prepare_tag_cells(cell_data.index.tolist(), split_pattern)

    # Get fragments
    fragments_df_dict = {}
    for sample_id in path_to_fragments.keys():
        if sample_id not in sample_ids:
            # Lazy %-formatting: extra positional args need matching placeholders.
            log.info(
                "The following path_to_fragments entry is not found in the cell metadata sample_id_col: %s. It will be ignored.",
                sample_id,
            )
            continue
        log.info("Reading fragments from " + path_to_fragments[sample_id])
        fragments_df = read_fragments_from_file(
            path_to_fragments[sample_id], use_polars=use_polars
        ).df
        # Convert to int32 for memory efficiency
        fragments_df.Start = np.int32(fragments_df.Start)
        fragments_df.End = np.int32(fragments_df.End)
        if "Score" in fragments_df:
            fragments_df.Score = np.int32(fragments_df.Score)
        fragments_df_dict[sample_id] = fragments_df.loc[
            fragments_df["Name"].isin(selected_barcodes)
        ]

    # Set groups. `variable` must stay the FIRST column: export_pseudobulk_one_sample
    # reads the grouping values positionally. Copy so the caller's metadata is not modified.
    if "barcode" in cell_data:
        cell_data = cell_data.loc[:, [variable, sample_id_col, "barcode"]].copy()
    else:
        cell_data = cell_data.loc[:, [variable, sample_id_col]].copy()
    # Group names become file names: drop spaces, collapse other special characters to "_".
    cell_data[variable] = cell_data[variable].replace(" ", "", regex=True)
    cell_data[variable] = cell_data[variable].replace("[^A-Za-z0-9]+", "_", regex=True)
    groups = sorted(set(cell_data[variable]))
    # Check chromosome sizes
    if isinstance(chromsizes, pd.DataFrame):
        chromsizes = chromsizes.loc[:, ["Chromosome", "Start", "End"]]
        chromsizes = pr.PyRanges(chromsizes)
    # Check that output dir exist and generate output paths
    if isinstance(bed_path, str):
        os.makedirs(bed_path, exist_ok=True)
        bed_paths = {
            group: os.path.join(bed_path, str(group) + ".bed.gz") for group in groups
        }
    else:
        bed_paths = {}
    if isinstance(bigwig_path, str):
        os.makedirs(bigwig_path, exist_ok=True)
        bw_paths = {
            group: os.path.join(bigwig_path, str(group) + ".bw") for group in groups
        }
    else:
        bw_paths = {}
    # Create pseudobulks
    if n_cpu > 1:
        ray.init(num_cpus=n_cpu, **kwargs)
        try:
            futures = {
                group: export_pseudobulk_ray.remote(
                    cell_data,
                    group,
                    fragments_df_dict,
                    chromsizes,
                    bigwig_path,
                    bed_path,
                    sample_id_col,
                    normalize_bigwig,
                    remove_duplicates,
                    split_pattern,
                )
                for group in groups
            }
            # ray.get blocks until each task ends and re-raises worker errors, which
            # ray.wait alone would leave unreported. Failures are logged per group so
            # the remaining groups are still exported.
            for group, future in futures.items():
                try:
                    ray.get(future)
                except Exception as err:
                    log.error("Pseudobulk export failed for %s: %s", group, err)
        finally:
            # Always release the local ray instance, even on error.
            ray.shutdown()
    else:
        for group in groups:
            export_pseudobulk_one_sample(
                cell_data,
                group,
                fragments_df_dict,
                chromsizes,
                bigwig_path,
                bed_path,
                sample_id_col,
                normalize_bigwig,
                remove_duplicates,
                split_pattern,
            )

    return bw_paths, bed_paths


def export_pseudobulk_one_sample(
    cell_data: pd.DataFrame,
    group: str,
    fragments_df_dict: Dict[str, pd.DataFrame],
    chromsizes: pr.PyRanges,
    bigwig_path: str,
    bed_path: str,
    sample_id_col: Optional[str] = "sample_id",
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
):
    """
    Create pseudobulk as bed and bigwig from single cell fragments file given a barcode annotation and a group.
    Parameters
    ---------
    cell_data: pd.DataFrame
            A cell metadata :class:`pd.Dataframe` containing barcodes, their annotation and their sample of origin. The grouping
            variable must be the first column (it is read positionally).
    group: str
            A character string indicating the group for which pseudobulks will be created.
    fragments_df_dict: dict
            A dictionary containing data frames as values with 'Chromosome', 'Start', 'End', 'Name', and 'Score' as columns; and sample label
            as keys. 'Score' indicates the number of times that a fragments is found assigned to that barcode.
    chromsizes: pr.PyRanges
            A :class:`pr.PyRanges` containing size of each chromosome, containing 'Chromosome', 'Start' and 'End' columns.
    bigwig_path: str
            Path to folder where the bigwig file will be saved. If not a string, no bigwig is written.
    bed_path: str
            Path to folder where the fragments bed file will be saved. If not a string, no bed file is written.
    sample_id_col: str, optional
            Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'.
    normalize_bigwig: bool, optional
            Whether bigwig files should be CPM normalized. Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig. When False, the 'Score' column is used as
            the bigwig value column.
    split_pattern: str
            Pattern to split cell barcode from sample id. Default: ___ .
    Raises
    ------
    ValueError
            If `fragments_df_dict` is empty.
    """
    # Create logger
    level = logging.INFO
    log_format = "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger("cisTopic")

    log.info("Creating pseudobulk for " + str(group))
    if not fragments_df_dict:
        # Previously this surfaced as an obscure NameError further down.
        raise ValueError("fragments_df_dict is empty: no fragments to export.")

    group_fragments_list = []
    for sample_id, fragments_df in fragments_df_dict.items():
        sample_data = cell_data[cell_data.loc[:, sample_id_col].isin([sample_id])]
        # Re-index the sample's metadata by bare barcode so it matches fragments 'Name'.
        if "barcode" in sample_data:
            sample_data.index = sample_data["barcode"].tolist()
        else:
            sample_data.index = prepare_tag_cells(
                sample_data.index.tolist(), split_pattern
            )
        # First column holds the grouping variable.
        group_var = sample_data.iloc[:, 0]
        barcodes = group_var[group_var.isin([group])].index.tolist()
        group_fragments_list.append(
            fragments_df.loc[fragments_df["Name"].isin(barcodes)]
        )

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    if len(group_fragments_list) > 1:
        group_fragments = pd.concat(group_fragments_list)
    else:
        group_fragments = group_fragments_list[0]

    # Keep only fragments on chromosomes present in chromsizes.
    group_fragments = group_fragments[
        group_fragments["Chromosome"].isin(chromsizes.Chromosome)
    ]

    # Drop local references to the intermediate frames before building the PyRanges.
    del group_fragments_list
    del fragments_df
    gc.collect()

    group_pr = pr.PyRanges(group_fragments)
    if isinstance(bigwig_path, str):
        bigwig_path_group = os.path.join(bigwig_path, str(group) + ".bw")
        if remove_duplicates:
            group_pr.to_bigwig(
                path=bigwig_path_group,
                chromosome_sizes=chromsizes,
                rpm=normalize_bigwig,
            )
        else:
            group_pr.to_bigwig(
                path=bigwig_path_group,
                chromosome_sizes=chromsizes,
                rpm=normalize_bigwig,
                value_col="Score",
            )
    if isinstance(bed_path, str):
        bed_path_group = os.path.join(bed_path, str(group) + ".bed.gz")
        group_pr.to_bed(
            path=bed_path_group, keep=False, compression="infer", chain=False
        )

    log.info(str(group) + " done!")


@ray.remote
def export_pseudobulk_ray(
    cell_data: pd.DataFrame,
    group: str,
    fragments_df_dict: Dict[str, pd.DataFrame],
    chromsizes: pr.PyRanges,
    bigwig_path: str,
    bed_path: str,
    sample_id_col: Optional[str] = "sample_id",
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
):
    """
    Ray remote task that forwards all arguments unchanged to :func:`export_pseudobulk_one_sample`.
    Parameters
    ---------
    cell_data: pd.DataFrame
            Cell metadata :class:`pd.Dataframe` with barcodes, their annotation and their sample of origin.
    group: str
            Name of the group for which the pseudobulk is created.
    fragments_df_dict: dict
            Dictionary with sample labels as keys and fragments data frames ('Chromosome', 'Start', 'End', 'Name', 'Score') as values.
    chromsizes: pr.PyRanges
            A :class:`pr.PyRanges` with the chromosome sizes ('Chromosome', 'Start' and 'End' columns).
    bigwig_path: str
            Folder in which the bigwig file is written.
    bed_path: str
            Folder in which the fragments bed file is written.
    sample_id_col: str, optional
            Column holding the sample name per barcode. Default: 'sample_id'.
    normalize_bigwig: bool, optional
            Whether the bigwig file should be CPM normalized. Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig.
    split_pattern: str
            Pattern to split cell barcode from sample id. Default: ___ .
    """
    export_pseudobulk_one_sample(
        cell_data=cell_data,
        group=group,
        fragments_df_dict=fragments_df_dict,
        chromsizes=chromsizes,
        bigwig_path=bigwig_path,
        bed_path=bed_path,
        sample_id_col=sample_id_col,
        normalize_bigwig=normalize_bigwig,
        remove_duplicates=remove_duplicates,
        split_pattern=split_pattern,
    )

In [7]:
# Export per-cell-type pseudobulk bed/bigwig files for every subsampled consensus object.
# Results are collected as {sample: {group: path}}.
bw_paths_dict = {}
bed_paths_dict = {}

import ray

# Make sure no Ray instance from an earlier (possibly interrupted) run is still alive.
if ray.is_initialized():
    print("Shutting down Ray")
    ray.shutdown()

for sample in cto_consensus_path_dict.keys():
    # "<sample>.<tag>.<depth>" -> "<sample>.<tag>", the key used in fragments_path_dict
    supersample = ".".join(sample.split(".")[:2])

    bed_path = os.path.join(
        "final_consensus_peaks", f"{sample}__SCREEN_pseudobulk_bed_files"
    )
    bw_path = os.path.join(
        "final_consensus_peaks", f"{sample}__SCREEN_pseudobulk_bw_files"
    )
    # NOTE: the skip check only tests that the bed folder exists; a folder left behind
    # by an interrupted run is treated as complete.
    if not os.path.exists(bed_path):
        cto_path = cto_consensus_path_dict[sample]
        # pickle.load can execute arbitrary code: only load objects produced by this pipeline.
        with open(cto_path, "rb") as f:
            cto = pickle.load(f)

        # The file handle is closed before the long-running export starts.
        try:
            bw_paths, bed_paths = export_pseudobulk(
                input_data=cto,
                variable="consensus_cell_type",
                sample_id_col="sample_id",
                chromsizes=chromsizes,
                bed_path=bed_path,
                bigwig_path=bw_path,
                # Ignored by export_pseudobulk when input_data is a CistopicObject.
                path_to_fragments=fragments_path_dict[supersample],
                n_cpu=16,
                normalize_bigwig=True,
                remove_duplicates=True,
            )
        finally:
            # Release Ray even if the export raised, so the next sample starts clean.
            if ray.is_initialized():
                print("Shutting down Ray")
                ray.shutdown()
        bw_paths_dict[sample] = bw_paths
        bed_paths_dict[sample] = bed_paths
    else:
        print(f"{bed_path} exists, skipping...")
        # Rebuild the path dictionaries from the files already on disk
        # (named "<group>.bed.gz" / "<group>.bw" by export_pseudobulk).
        bed_paths_dict[sample] = {
            os.path.basename(path)[: -len(".bed.gz")]: path
            for path in sorted(glob.glob(os.path.join(bed_path, "*.bed.gz")))
        }
        bw_paths_dict[sample] = {
            os.path.basename(path)[: -len(".bw")]: path
            for path in sorted(glob.glob(os.path.join(bw_path, "*.bw")))
        }

2022-12-19 21:41:37,963 cisTopic     INFO     Reading fragments from ../1_data_repository/fixedcells_fragments/BIO_ddseq_1.FIXEDCELLS.fragments.tsv.gz


2022-12-19 21:43:18,351	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=3382882)[0m 2022-12-19 21:43:22,393 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=3382883)[0m 2022-12-19 21:43:23,633 cisTopic     INFO     Creating pseudobulk for CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=3382887)[0m 2022-12-19 21:43:24,866 cisTopic     INFO     Creating pseudobulk for CD4_Tcell
[2m[36m(export_pseudobulk_ray pid=3382879)[0m 2022-12-19 21:43:26,171 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=3382885)[0m 2022-12-19 21:43:27,446 cisTopic     INFO     Creating pseudobulk for Naturalkillercell
[2m[36m(export_pseudobulk_ray pid=3382882)[0m 2022-12-19 21:43:32,405 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=3382885)[0m 2022-12-19 21:43:35,969 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=3382883)[0m 2022-12-19 21:43:37,155 cisTopic     INFO     CD14_monocyte done!


2022-12-19 21:45:43,965	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8266 [39m[22m


[2m[36m(export_pseudobulk_ray pid=3386067)[0m 2022-12-19 21:45:48,341 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=3386078)[0m 2022-12-19 21:45:49,682 cisTopic     INFO     Creating pseudobulk for CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=3386079)[0m 2022-12-19 21:45:51,184 cisTopic     INFO     Creating pseudobulk for CD4_Tcell
[2m[36m(export_pseudobulk_ray pid=3386080)[0m 2022-12-19 21:45:52,683 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=3386076)[0m 2022-12-19 21:45:54,309 cisTopic     INFO     Creating pseudobulk for Naturalkillercell
[2m[36m(export_pseudobulk_ray pid=3386067)[0m 2022-12-19 21:46:08,900 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=3386076)[0m 2022-12-19 21:46:17,095 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=3386080)[0m 2022-12-19 21:46:48,840 cisTopic     INFO     CytotoxicTcell done!


2022-12-19 21:49:27,136	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8266 [39m[22m


[2m[36m(export_pseudobulk_ray pid=3390262)[0m 2022-12-19 21:49:31,343 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=3390259)[0m 2022-12-19 21:49:32,685 cisTopic     INFO     Creating pseudobulk for CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=3390261)[0m 2022-12-19 21:49:34,054 cisTopic     INFO     Creating pseudobulk for CD4_Tcell
[2m[36m(export_pseudobulk_ray pid=3390263)[0m 2022-12-19 21:49:35,482 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=3390258)[0m 2022-12-19 21:49:36,924 cisTopic     INFO     Creating pseudobulk for Naturalkillercell
[2m[36m(export_pseudobulk_ray pid=3390262)[0m 2022-12-19 21:49:45,068 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=3390258)[0m 2022-12-19 21:49:51,388 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=3390259)[0m 2022-12-19 21:50:04,662 cisTopic     INFO     CD14_monocyte done!


2022-12-19 21:52:42,746	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=3393709)[0m 2022-12-19 21:52:46,893 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=3393713)[0m 2022-12-19 21:52:48,253 cisTopic     INFO     Creating pseudobulk for CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=3393710)[0m 2022-12-19 21:52:49,674 cisTopic     INFO     Creating pseudobulk for CD4_Tcell
[2m[36m(export_pseudobulk_ray pid=3393703)[0m 2022-12-19 21:52:51,107 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=3393716)[0m 2022-12-19 21:52:52,591 cisTopic     INFO     Creating pseudobulk for Naturalkillercell
[2m[36m(export_pseudobulk_ray pid=3393709)[0m 2022-12-19 21:53:35,834 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=3393716)[0m 2022-12-19 21:53:39,375 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=3393713)[0m 2022-12-19 21:54:16,768 cisTopic     INFO     CD14_monocyte done!


2022-12-19 21:58:07,320	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=3395651)[0m 2022-12-19 21:58:11,531 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=3395657)[0m 2022-12-19 21:58:12,851 cisTopic     INFO     Creating pseudobulk for CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=3395658)[0m 2022-12-19 21:58:14,147 cisTopic     INFO     Creating pseudobulk for CD4_Tcell
[2m[36m(export_pseudobulk_ray pid=3395656)[0m 2022-12-19 21:58:15,497 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=3395649)[0m 2022-12-19 21:58:16,803 cisTopic     INFO     Creating pseudobulk for Naturalkillercell
[2m[36m(export_pseudobulk_ray pid=3395651)[0m 2022-12-19 21:58:39,175 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=3395649)[0m 2022-12-19 21:58:42,775 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=3395657)[0m 2022-12-19 21:59:21,644 cisTopic     INFO     CD14_monocyte done!


2022-12-19 22:03:00,752	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=3396959)[0m 2022-12-19 22:03:05,213 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=3396964)[0m 2022-12-19 22:03:06,603 cisTopic     INFO     Creating pseudobulk for CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=3396962)[0m 2022-12-19 22:03:08,112 cisTopic     INFO     Creating pseudobulk for CD4_Tcell
[2m[36m(export_pseudobulk_ray pid=3396957)[0m 2022-12-19 22:03:09,558 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=3396963)[0m 2022-12-19 22:03:10,963 cisTopic     INFO     Creating pseudobulk for Naturalkillercell
[2m[36m(export_pseudobulk_ray pid=3396959)[0m 2022-12-19 22:03:59,499 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=3396963)[0m 2022-12-19 22:04:04,749 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=3396957)[0m 2022-12-19 22:04:37,894 cisTopic     INFO     CytotoxicTcell done!


2022-12-19 22:09:14,372	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=3398300)[0m 2022-12-19 22:09:18,239 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=3398294)[0m 2022-12-19 22:09:19,218 cisTopic     INFO     Creating pseudobulk for CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=3398296)[0m 2022-12-19 22:09:20,428 cisTopic     INFO     Creating pseudobulk for CD4_Tcell
[2m[36m(export_pseudobulk_ray pid=3398291)[0m 2022-12-19 22:09:21,573 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=3398292)[0m 2022-12-19 22:09:22,596 cisTopic     INFO     Creating pseudobulk for Naturalkillercell
[2m[36m(export_pseudobulk_ray pid=3398300)[0m 2022-12-19 22:09:27,987 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=3398292)[0m 2022-12-19 22:09:31,745 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=3398294)[0m 2022-12-19 22:09:33,653 cisTopic     INFO     CD14_monocyte done!


2022-12-19 22:11:11,920	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=3399293)[0m 2022-12-19 22:11:15,894 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=3399280)[0m 2022-12-19 22:11:17,156 cisTopic     INFO     Creating pseudobulk for CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=3399283)[0m 2022-12-19 22:11:18,405 cisTopic     INFO     Creating pseudobulk for CD4_Tcell
[2m[36m(export_pseudobulk_ray pid=3399290)[0m 2022-12-19 22:11:19,639 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=3399278)[0m 2022-12-19 22:11:20,908 cisTopic     INFO     Creating pseudobulk for Naturalkillercell
[2m[36m(export_pseudobulk_ray pid=3399293)[0m 2022-12-19 22:11:37,111 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=3399278)[0m 2022-12-19 22:11:43,716 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=3399280)[0m 2022-12-19 22:12:01,985 cisTopic     INFO     CD14_monocyte done!


2022-12-19 22:14:59,707	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=3400434)[0m 2022-12-19 22:15:03,929 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=3400438)[0m 2022-12-19 22:15:05,201 cisTopic     INFO     Creating pseudobulk for CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=3400439)[0m 2022-12-19 22:15:06,504 cisTopic     INFO     Creating pseudobulk for CD4_Tcell
[2m[36m(export_pseudobulk_ray pid=3400433)[0m 2022-12-19 22:15:07,628 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=3400441)[0m 2022-12-19 22:15:08,834 cisTopic     INFO     Creating pseudobulk for Naturalkillercell
[2m[36m(export_pseudobulk_ray pid=3400434)[0m 2022-12-19 22:15:17,333 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=3400441)[0m 2022-12-19 22:15:18,738 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=3400438)[0m 2022-12-19 22:15:32,216 cisTopic     INFO     CD14_monocyte done!


2022-12-19 22:17:49,260	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=3401516)[0m 2022-12-19 22:17:53,412 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=3401521)[0m 2022-12-19 22:17:54,544 cisTopic     INFO     Creating pseudobulk for CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=3401530)[0m 2022-12-19 22:17:55,835 cisTopic     INFO     Creating pseudobulk for CD4_Tcell
[2m[36m(export_pseudobulk_ray pid=3401525)[0m 2022-12-19 22:17:57,082 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=3401531)[0m 2022-12-19 22:17:58,476 cisTopic     INFO     Creating pseudobulk for Naturalkillercell
[2m[36m(export_pseudobulk_ray pid=3401516)[0m 2022-12-19 22:18:31,830 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=3401531)[0m 2022-12-19 22:18:38,873 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=3401525)[0m 2022-12-19 22:19:19,274 cisTopic     INFO     CytotoxicTcell done!


2022-12-19 22:23:45,423	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=3402942)[0m 2022-12-19 22:23:49,411 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=3402948)[0m 2022-12-19 22:23:50,543 cisTopic     INFO     Creating pseudobulk for CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=3402954)[0m 2022-12-19 22:23:51,751 cisTopic     INFO     Creating pseudobulk for CD4_Tcell
[2m[36m(export_pseudobulk_ray pid=3402952)[0m 2022-12-19 22:23:52,833 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=3402944)[0m 2022-12-19 22:23:54,052 cisTopic     INFO     Creating pseudobulk for Naturalkillercell
[2m[36m(export_pseudobulk_ray pid=3402942)[0m 2022-12-19 22:24:20,159 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=3402944)[0m 2022-12-19 22:24:26,619 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=3402948)[0m 2022-12-19 22:24:53,252 cisTopic     INFO     CD14_monocyte done!


2022-12-19 22:28:26,841	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(export_pseudobulk_ray pid=3404120)[0m 2022-12-19 22:28:30,854 cisTopic     INFO     Creating pseudobulk for Bcell
[2m[36m(export_pseudobulk_ray pid=3404121)[0m 2022-12-19 22:28:32,042 cisTopic     INFO     Creating pseudobulk for CD14_monocyte
[2m[36m(export_pseudobulk_ray pid=3404129)[0m 2022-12-19 22:28:33,259 cisTopic     INFO     Creating pseudobulk for CD4_Tcell
[2m[36m(export_pseudobulk_ray pid=3404131)[0m 2022-12-19 22:28:34,611 cisTopic     INFO     Creating pseudobulk for CytotoxicTcell
[2m[36m(export_pseudobulk_ray pid=3404130)[0m 2022-12-19 22:28:36,021 cisTopic     INFO     Creating pseudobulk for Naturalkillercell
[2m[36m(export_pseudobulk_ray pid=3404120)[0m 2022-12-19 22:29:17,673 cisTopic     INFO     Bcell done!
[2m[36m(export_pseudobulk_ray pid=3404130)[0m 2022-12-19 22:29:24,240 cisTopic     INFO     Naturalkillercell done!
[2m[36m(export_pseudobulk_ray pid=3404121)[0m 2022-12-19 22:30:16,051 cisTopic     INFO     CD14_monocyte done!


# Peak calling on consensus cell-type pseudobulks

In [8]:
# Index every consensus cell-data TSV by "<sample>.<depth>": the part of the file
# name before the first "__", plus the sixth-from-last dot-separated field
# (e.g. "05k" in the file names listed in the output below).
cell_data_path_dict = {
    f"{fname.partition('__')[0]}.{fname.split('.')[-6]}": tsv_path
    for tsv_path in sorted(
        glob.glob("cistopic_objects_subsampled/*consensus.cell_data.tsv")
    )
    for fname in (tsv_path.rsplit("/", 1)[-1],)
}
cell_data_path_dict

{'BIO_ddseq_3.FIXEDCELLS.05k': 'cistopic_objects_subsampled/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.05k.model_10topics.dimreduc.consensus.cell_data.tsv',
 'BIO_ddseq_3.FIXEDCELLS.15k': 'cistopic_objects_subsampled/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.15k.model_7topics.dimreduc.consensus.cell_data.tsv',
 'BIO_ddseq_3.FIXEDCELLS.1k': 'cistopic_objects_subsampled/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.1k.model_9topics.dimreduc.consensus.cell_data.tsv',
 'BIO_ddseq_3.FIXEDCELLS.25k': 'cistopic_objects_subsampled/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.25k.model_10topics.dimreduc.consensus.cell_data.tsv',
 'BIO_ddseq_3.FIXEDCELLS.2k': 'cistopic_objects_subsampled/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.2k.model_10topics.dimreduc.consensus.cell_data.tsv',
 'BIO_ddseq_4.FIXEDCELLS.05k': 'cistopic_objects_subsampled/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.05k.model_10topics.dimreduc.consensus.cell_data.tsv',
 'BI

In [9]:
# Sample ID (final path component up to the first "__") -> pseudobulk bigwig path.
bw_path_dict = dict(
    (bw_path.rsplit("/", 1)[-1].partition("__")[0], bw_path)
    for bw_path in sorted(glob.glob("final_consensus_peaks/*_pseudobulk_bw_files"))
)
bw_path_dict

{'BIO_ddseq_1.FIXEDCELLS.05k': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS.05k__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_1.FIXEDCELLS.15k': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS.15k__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_1.FIXEDCELLS.1k': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS.1k__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_1.FIXEDCELLS.25k': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS.25k__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_1.FIXEDCELLS.2k': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS.2k__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_1.FIXEDCELLS.3k': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS.3k__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_2.FIXEDCELLS.05k': 'final_consensus_peaks/BIO_ddseq_2.FIXEDCELLS.05k__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_2.FIXEDCELLS.15k': 'final_consensus_peaks/BIO_ddseq_2.FIXEDCELLS.15k__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_2.FIXEDCELLS.1k': 'final_consensus_peaks/BIO_ddseq_2.FIXEDCELLS.1k__SCREEN_pseudobulk_bw_files',
 'BIO_dds

In [10]:
# Sample ID (final path component up to the first "__") -> directory holding that
# sample's pseudobulk BED files.
bed_path_dict = dict(
    (bed_dir.rsplit("/", 1)[-1].partition("__")[0], bed_dir)
    for bed_dir in sorted(glob.glob("final_consensus_peaks/*_pseudobulk_bed_files"))
)
bed_path_dict

{'BIO_ddseq_1.FIXEDCELLS.05k': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS.05k__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_1.FIXEDCELLS.15k': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS.15k__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_1.FIXEDCELLS.1k': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS.1k__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_1.FIXEDCELLS.25k': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS.25k__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_1.FIXEDCELLS.2k': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS.2k__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_1.FIXEDCELLS.3k': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS.3k__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_2.FIXEDCELLS.05k': 'final_consensus_peaks/BIO_ddseq_2.FIXEDCELLS.05k__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_2.FIXEDCELLS.15k': 'final_consensus_peaks/BIO_ddseq_2.FIXEDCELLS.15k__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_2.FIXEDCELLS.1k': 'final_consensus_peaks/BIO_ddseq_2.FIXEDCELLS.1k__SCREEN_pseudobulk_bed_files',


In [11]:
# Build the {file-name stem: BED path} mapping for each sample directory.
# `bed_paths` is reassigned on every pass, so after the loop it only holds the
# mapping of the last sample in `bed_path_dict`.
for sample in bed_path_dict:
    bed_paths = dict(
        (
            bed_file.rsplit("/", 1)[-1].partition("__")[0].partition(".bed.gz")[0],
            bed_file,
        )
        for bed_file in glob.glob(f"{bed_path_dict[sample]}/*")
    )

In [12]:
from pycisTopic.pseudobulk_peak_calling import *

In [13]:
import ray

In [None]:
# Call peaks on the pseudobulk BED files of every sample and pickle the resulting
# dictionary next to them. A sample is skipped when its pickle already exists,
# when it has no consensus cell-data file, or when the cell types in the cell
# data do not match the BED files found on disk.
narrow_peaks_dict = {}
# Shut down any Ray instance left over from earlier cells before peak calling.
ray.shutdown()
for sample in bed_path_dict.keys():
    narrow_peaks_dict_path = bed_path_dict[sample].replace(
        "_pseudobulk_bed_files", "_narrow_peaks_dict.pkl"
    )
    peak_path = os.path.join(
        "final_consensus_peaks", f"{sample}__SCREEN_consensus_peaks"
    )
    # exist_ok avoids the separate existence check and the failure if the
    # directory appears in between.
    os.makedirs(peak_path, exist_ok=True)

    if os.path.exists(narrow_peaks_dict_path):
        print(f"{narrow_peaks_dict_path} already exists")
        continue

    # bed_path_dict and cell_data_path_dict are built from different globs, so a
    # sample can have BED files but no cell data. Skip it instead of aborting the
    # whole loop with a KeyError.
    if sample not in cell_data_path_dict:
        print(f"{sample} has no consensus cell data file, skipping.")
        continue

    # NOTE(review): the path ends in .tsv but is read with the default comma
    # separator -- confirm the delimiter these files were written with.
    cell_data = pd.read_csv(cell_data_path_dict[sample])

    # Normalise both sets of names (strip "+" and "_", and spaces for the
    # annotation) so that annotation labels and file names can be compared.
    cto_celltypes = {
        x.replace(" ", "").replace("+", "").replace("_", "")
        for x in cell_data["consensus_cell_type"].unique()
    }
    bed_celltypes = {
        x.split(".")[0].replace("+", "").replace("_", "")
        for x in os.listdir(bed_path_dict[sample])
    }

    if cto_celltypes != bed_celltypes:
        print(f"{sample} cell types not matching!! Rerun bed file writing.")
        print(f"\t{bed_celltypes}")
        print(f"\t{cto_celltypes}")
        continue

    print(f"Starting {narrow_peaks_dict_path}")
    # Keys are the BED file names with everything from "__" / ".bed.gz" removed.
    bed_paths = {
        x.split("/")[-1].split("__")[0].split(".bed.gz")[0]: x
        for x in glob.glob(bed_path_dict[sample] + "/*")
    }
    narrow_peaks_dict = peak_calling(
        macs_path="macs2",
        bed_paths=bed_paths,
        outdir=peak_path,
        genome_size="hs",
        n_cpu=16,
        input_format="BEDPE",
        shift=73,
        ext_size=146,
        keep_dup="all",
        q_value=0.05,
    )
    # Cache the result so a re-run of this cell skips the finished sample.
    with open(narrow_peaks_dict_path, "wb") as f:
        pickle.dump(narrow_peaks_dict, f)

# call consensus peaks

In [None]:
from pycisTopic.iterative_peak_calling import get_consensus_peaks

In [None]:
# Sample ID (file name up to the first "__") -> pickled narrow-peaks dictionary.
narrow_peaks_path_dict = dict(
    (pkl_path.rsplit("/", 1)[-1].partition("__")[0], pkl_path)
    for pkl_path in sorted(glob.glob("final_consensus_peaks/*_narrow_peaks_dict.pkl"))
)
narrow_peaks_path_dict

In [None]:
# Blacklist BED file handed to get_consensus_peaks below (path is relative to the
# notebook's working directory).
path_to_blacklist = "../0_resources/regions/hg38-blacklist.v2.bed"

In [None]:
# Half-width handed to get_consensus_peaks for every sample.
peak_half_width = 250

# Get consensus peaks
# For each sample with a pickled narrow-peaks dictionary, derive consensus peaks
# and write them to a BED file next to the pickle. A sample is skipped when the
# BED file already exists, when it has no consensus cell-data file, or when the
# cell types in the pickle do not match the cell data.
consensus_peaks_dict = {}  # never populated in this cell
for sample in narrow_peaks_path_dict.keys():
    print(sample)
    consensus_out_path = narrow_peaks_path_dict[sample].replace(
        "_narrow_peaks_dict.pkl", "_consensus_peaks.bed"
    )
    if os.path.exists(consensus_out_path):
        print(f"{consensus_out_path} already exists, skipping...")
        continue

    # narrow_peaks_path_dict and cell_data_path_dict come from different globs;
    # skip samples without cell data instead of aborting the loop with a KeyError.
    if sample not in cell_data_path_dict:
        print(f"{sample} has no consensus cell data file, skipping...")
        continue

    # NOTE(review): the path ends in .tsv but is read with the default comma
    # separator -- confirm the delimiter these files were written with.
    cell_data = pd.read_csv(cell_data_path_dict[sample])
    cto_celltypes = {
        x.replace(" ", "").replace("+", "").replace("_", "")
        for x in cell_data["consensus_cell_type"].unique()
    }

    # NOTE(review): pickle.load executes arbitrary code from the file; only safe
    # because these pickles are expected to be written by this notebook.
    with open(narrow_peaks_path_dict[sample], "rb") as f:
        narrow_peaks_dict = pickle.load(f)
    peaks_celltypes = {
        x.replace(" ", "").replace("+", "").replace("_", "")
        for x in narrow_peaks_dict.keys()
    }

    if cto_celltypes != peaks_celltypes:
        print("CELL TYPE SETS NOT MATCHING! Rerun peak calling.")
        # Show which names are only in the peaks and which only in the cell data.
        print(peaks_celltypes - cto_celltypes)
        print(cto_celltypes - peaks_celltypes)
        continue

    consensus_peaks = get_consensus_peaks(
        narrow_peaks_dict,
        peak_half_width,
        chromsizes=chromsizes,
        path_to_blacklist=path_to_blacklist,
    )

    consensus_peaks.to_bed(
        path=consensus_out_path, keep=True, compression="infer", chain=False
    )

# Check % chrM in consensus peaks

In [None]:
# Sample ID (file name up to the first "__") -> consensus peaks BED file.
consensus_peaks_path_dict = dict(
    (bed_path.rsplit("/", 1)[-1].partition("__")[0], bed_path)
    for bed_path in sorted(glob.glob("final_consensus_peaks/*consensus_peaks.bed"))
)
consensus_peaks_path_dict

In [None]:
# For every consensus peak file, report how many peaks fall on chr1-22/chrX,
# on chrM, on any other sequence, and on chrY.
for sample, path in consensus_peaks_path_dict.items():
    print(sample)
    peaks_df = pd.read_csv(path, sep="\t", header=None)

    # Count peaks per chromosome once (column 0 holds the chromosome name).
    chrom_counts = peaks_df[0].value_counts()

    chroms_in_df = sorted(chrom_counts.index)
    chroms_standard = ["chr" + str(x + 1) for x in range(22)] + ["chrX"]
    # NOTE(review): chrY is not in chroms_standard, so it lands in this bucket
    # (and in pct_nonstandard) as well as being reported on its own line below --
    # confirm this is intended.
    chroms_nonstandard = list(set(chroms_in_df) - set(chroms_standard) - set(["chrM"]))

    # reindex/get default to 0 for chromosomes without any peak; plain label
    # indexing raises KeyError when e.g. chrM is absent from the file.
    n_standard = chrom_counts.reindex(chroms_standard, fill_value=0).sum()
    n_contigs = chrom_counts.reindex(chroms_nonstandard, fill_value=0).sum()
    n_chrm = chrom_counts.get("chrM", 0)
    n_chrY = chrom_counts.get("chrY", 0)
    pct_nonstandard = (n_contigs + n_chrm) / len(peaks_df) * 100

    print(f"\tpeaks on standard chromosomes: {n_standard}")
    print(f"\tpeaks on contigs: {n_contigs}")
    print(f"\tpeaks on chrM: {n_chrm}")
    print(f"\t% peaks non standard chromosomes: {pct_nonstandard}%")
    print(f"\tpeaks on chrY: {n_chrY}")