In [1]:
from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling
import pyranges as pr
import requests
import os
import pandas as pd
import glob
import pickle

In [2]:
%load_ext lab_black

In [3]:
# get chromosome sizes (hg38)
if not os.path.exists("chromsizes.txt"):
    target_url = (
        "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes"
    )
    chromsizes = pd.read_csv(target_url, sep="\t", header=None)
    chromsizes.columns = ["Chromosome", "End"]
    chromsizes["Start"] = [0] * chromsizes.shape[0]
    chromsizes = chromsizes.loc[:, ["Chromosome", "Start", "End"]]
    chromsizes = pr.PyRanges(chromsizes)
    chromsizes.to_csv("chromsizes.txt")
    chromsizes
else:
    chromsizes = pd.read_csv("chromsizes.txt")

In [4]:
cto_consensus_path_dict = {
    x.split("/")[-1].split(f"__")[0]: x
    for x in sorted(glob.glob("cistopic_objects/*singlets.model*consensus.pkl"))
}
cto_consensus_path_dict

{'BIO_ddseq_1.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.pkl',
 'BIO_ddseq_2.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_14topics.dimreduc.consensus.pkl',
 'BIO_ddseq_3.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_10topics.dimreduc.consensus.pkl',
 'BIO_ddseq_4.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.pkl',
 'BRO_mtscatac_1.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.pkl',
 'BRO_mtscatac_2.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_17topics.dimreduc.consensus.pkl',
 'CNA_10xmultiome_1.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_10topics.dimreduc.consensus.pkl',
 'CNA_10xm

In [5]:
fragments_path_dict = {
    x.split("/")[-1].split(f".fragments.tsv.gz")[0]: x
    for x in sorted(glob.glob("../1_data_repository/fixedcells_fragments/*.tsv.gz"))
}
fragments_path_dict

{'BIO_ddseq_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_1.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_2.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_3.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_3.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_4.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_4.FIXEDCELLS.fragments.tsv.gz',
 'BRO_mtscatac_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BRO_mtscatac_1.FIXEDCELLS.fragments.tsv.gz',
 'BRO_mtscatac_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BRO_mtscatac_2.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xmultiome_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xmultiome_1.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xmultiome_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xmultiome_2.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xv11_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragmen

In [6]:
import gc
import logging
import os
import re
import subprocess
import sys
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd
import pyBigWig
import pyranges as pr
import ray

from pycisTopic.cistopic_class import *
from pycisTopic.utils import *


def export_pseudobulk(
    input_data: Union["CistopicObject", pd.DataFrame, Dict[str, pd.DataFrame]],
    variable: str,
    chromsizes: Union[pd.DataFrame, pr.PyRanges],
    bed_path: str,
    bigwig_path: str,
    path_to_fragments: Optional[Dict[str, str]] = None,
    sample_id_col: Optional[str] = "sample_id",
    n_cpu: Optional[int] = 1,
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
    use_polars: Optional[bool] = True,
    **kwargs
):
    """
    Create pseudobulks as bed and bigwig from single cell fragments file given a barcode annotation.
    Parameters
    ---------
    input_data: CistopicObject or pd.DataFrame
            A :class:`CistopicObject` containing the specified `variable` as a column in :class:`CistopicObject.cell_data` or a cell metadata
            :class:`pd.DataFrame` containing barcode as rows, containing the specified `variable` as a column (additional columns are
            possible) and a `sample_id` column. Index names must contain the BARCODE (e.g. ATGTCGTC-1), additional tags are possible separating with -
            (e.g. ATGCTGTGCG-1-Sample_1). The levels in the sample_id column must agree with the keys in the path_to_fragments dictionary.
            Alternatively, if the cell metadata contains a column named barcode it will be used instead of the index names.
    variable: str
            A character string indicating the column that will be used to create the different group pseudobulk. It must be included in
            the cell metadata provided as input_data.
    chromsizes: pd.DataFrame or pr.PyRanges
            A data frame or :class:`pr.PyRanges` containing size of each chromosome, containing 'Chromosome', 'Start' and 'End' columns.
    bed_path: str
            Path to folder where the fragments bed files per group will be saved. If None, files will not be generated.
    bigwig_path: str
            Path to folder where the bigwig files per group will be saved. If None, files will not be generated.
    path_to_fragments: str or dict, optional
            A dictionary of character strings, with sample name as names indicating the path to the fragments file/s from which pseudobulk profiles have to
            be created. If a :class:`CistopicObject` is provided as input it will be ignored, but if a cell metadata :class:`pd.DataFrame` is provided it
            is necessary to provide it. The keys of the dictionary need to match with the sample_id tag added to the index names of the input data frame.
    sample_id_col: str, optional
            Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'.
    n_cpu: int, optional
            Number of cores to use. Default: 1.
    normalize_bigwig: bool, optional
            Whether bigwig files should be CPM normalized. Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig.
    split_pattern: str, optional
            Pattern to split cell barcode from sample id. Default: ___ .
    use_polars: bool, optional
            Whether to use polars to read fragments files. Default: True.
    **kwargs
            Additional parameters for ray.init()
    Return
    ------
    dict
            A dictionary containing the paths to the newly created bed fragments files per group a dictionary containing the paths to the
            newly created bigwig files per group.
    """
    # Create logger
    level = logging.INFO
    log_format = "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger("cisTopic")

    # Get fragments file
    if isinstance(input_data, CistopicObject):
        path_to_fragments = input_data.path_to_fragments
        if path_to_fragments is None:
            log.error("No path_to_fragments in this cisTopic object.")
        cell_data = input_data.cell_data
    elif isinstance(input_data, pd.DataFrame):
        if path_to_fragments is None:
            log.error("Please, provide path_to_fragments.")
        cell_data = input_data
    # Check for sample_id column
    try:
        sample_ids = list(set(cell_data[sample_id_col]))
    except ValueError:
        print(
            'Please, include a sample identification column (e.g. "sample_id") in your cell metadata!'
        )

    # Get fragments
    fragments_df_dict = {}
    for sample_id in path_to_fragments.keys():
        if sample_id not in sample_ids:
            log.info(
                "The following path_to_fragments entry is not found in the cell metadata sample_id_col: ",
                sample_id,
                ". It will be ignored.",
            )
        else:
            log.info("Reading fragments from " + path_to_fragments[sample_id])
            fragments_df = read_fragments_from_file(
                path_to_fragments[sample_id], use_polars=use_polars
            ).df
            # Convert to int32 for memory efficiency
            fragments_df.Start = np.int32(fragments_df.Start)
            fragments_df.End = np.int32(fragments_df.End)
            if "Score" in fragments_df:
                fragments_df.Score = np.int32(fragments_df.Score)
            if "barcode" in cell_data:
                fragments_df = fragments_df.loc[
                    fragments_df["Name"].isin(cell_data["barcode"].tolist())
                ]
            else:
                fragments_df = fragments_df.loc[
                    fragments_df["Name"].isin(
                        prepare_tag_cells(cell_data.index.tolist(), split_pattern)
                    )
                ]
            fragments_df_dict[sample_id] = fragments_df

    # Set groups
    if "barcode" in cell_data:
        cell_data = cell_data.loc[:, [variable, sample_id_col, "barcode"]]
    else:
        cell_data = cell_data.loc[:, [variable, sample_id_col]]
    cell_data[variable] = cell_data[variable].replace(" ", "", regex=True)
    cell_data[variable] = cell_data[variable].replace("[^A-Za-z0-9]+", "_", regex=True)
    groups = sorted(list(set(cell_data[variable])))
    # Check chromosome sizes
    if isinstance(chromsizes, pd.DataFrame):
        chromsizes = chromsizes.loc[:, ["Chromosome", "Start", "End"]]
        chromsizes = pr.PyRanges(chromsizes)
    # Check that output dir exist and generate output paths
    if isinstance(bed_path, str):
        if not os.path.exists(bed_path):
            os.makedirs(bed_path)
        bed_paths = {
            group: os.path.join(bed_path, str(group) + ".bed.gz") for group in groups
        }
    else:
        bed_paths = {}
    if isinstance(bigwig_path, str):
        if not os.path.exists(bigwig_path):
            os.makedirs(bigwig_path)
        bw_paths = {
            group: os.path.join(bigwig_path, str(group) + ".bw") for group in groups
        }
    else:
        bw_paths = {}
    # Create pseudobulks
    if n_cpu > 1:
        ray.init(num_cpus=n_cpu, **kwargs)
        ray_handle = ray.wait(
            [
                export_pseudobulk_ray.remote(
                    cell_data,
                    group,
                    fragments_df_dict,
                    chromsizes,
                    bigwig_path,
                    bed_path,
                    sample_id_col,
                    normalize_bigwig,
                    remove_duplicates,
                    split_pattern,
                )
                for group in groups
            ],
            num_returns=len(groups),
        )
        ray.shutdown()
    else:
        [
            export_pseudobulk_one_sample(
                cell_data,
                group,
                fragments_df_dict,
                chromsizes,
                bigwig_path,
                bed_path,
                sample_id_col,
                normalize_bigwig,
                remove_duplicates,
                split_pattern,
            )
            for group in groups
        ]

    return bw_paths, bed_paths


def export_pseudobulk_one_sample(
    cell_data: pd.DataFrame,
    group: str,
    fragments_df_dict: Dict[str, pd.DataFrame],
    chromsizes: pr.PyRanges,
    bigwig_path: str,
    bed_path: str,
    sample_id_col: Optional[str] = "sample_id",
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
):
    """
    Create pseudobulk as bed and bigwig from single cell fragments file given a barcode annotation and a group.
    Parameters
    ---------
    cell_data: pd.DataFrame
            A cell metadata :class:`pd.Dataframe` containing barcodes, their annotation and their sample of origin.
    group: str
            A character string indicating the group for which pseudobulks will be created.
    fragments_df_dict: dict
            A dictionary containing data frames as values with 'Chromosome', 'Start', 'End', 'Name', and 'Score' as columns; and sample label
            as keys. 'Score' indicates the number of times that a fragments is found assigned to that barcode.
    chromsizes: pr.PyRanges
            A :class:`pr.PyRanges` containing size of each column, containing 'Chromosome', 'Start' and 'End' columns.
    bigwig_path: str
            Path to folder where the bigwig file will be saved.
    bed_path: str
            Path to folder where the fragments bed file will be saved.
    sample_id_col: str, optional
            Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'.
    normalize_bigwig: bool, optional
            Whether bigwig files should be CPM normalized. Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig.
    split_pattern: str
            Pattern to split cell barcode from sample id. Default: ___ .
    """
    # Create logger
    level = logging.INFO
    log_format = "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger("cisTopic")

    log.info("Creating pseudobulk for " + str(group))
    group_fragments_list = []
    group_fragments_dict = {}
    for sample_id in fragments_df_dict:
        sample_data = cell_data[cell_data.loc[:, sample_id_col].isin([sample_id])]
        if "barcode" in sample_data:
            sample_data.index = sample_data["barcode"].tolist()
        else:
            sample_data.index = prepare_tag_cells(
                sample_data.index.tolist(), split_pattern
            )
        group_var = sample_data.iloc[:, 0]
        barcodes = group_var[group_var.isin([group])].index.tolist()
        fragments_df = fragments_df_dict[sample_id]
        group_fragments = fragments_df.loc[fragments_df["Name"].isin(barcodes)]
        if len(fragments_df_dict) > 1:
            group_fragments_dict[sample_id] = group_fragments

    if len(fragments_df_dict) > 1:
        group_fragments_list = [
            group_fragments_dict[list(group_fragments_dict.keys())[x]]
            for x in range(len(fragments_df_dict))
        ]
        group_fragments = group_fragments_list[0].append(group_fragments_list[1:])

    group_fragments = group_fragments[
        group_fragments["Chromosome"].isin(chromsizes.Chromosome)
    ]

    del group_fragments_dict
    del group_fragments_list
    del fragments_df
    gc.collect()

    group_pr = pr.PyRanges(group_fragments)
    if isinstance(bigwig_path, str):
        bigwig_path_group = os.path.join(bigwig_path, str(group) + ".bw")
        if remove_duplicates:
            group_pr.to_bigwig(
                path=bigwig_path_group,
                chromosome_sizes=chromsizes,
                rpm=normalize_bigwig,
            )
        else:
            group_pr.to_bigwig(
                path=bigwig_path_group,
                chromosome_sizes=chromsizes,
                rpm=normalize_bigwig,
                value_col="Score",
            )
    if isinstance(bed_path, str):
        bed_path_group = os.path.join(bed_path, str(group) + ".bed.gz")
        group_pr.to_bed(
            path=bed_path_group, keep=False, compression="infer", chain=False
        )

    log.info(str(group) + " done!")


@ray.remote
def export_pseudobulk_ray(
    cell_data: pd.DataFrame,
    group: str,
    fragments_df_dict: Dict[str, pd.DataFrame],
    chromsizes: pr.PyRanges,
    bigwig_path: str,
    bed_path: str,
    sample_id_col: Optional[str] = "sample_id",
    normalize_bigwig: Optional[bool] = True,
    remove_duplicates: Optional[bool] = True,
    split_pattern: Optional[str] = "___",
):
    """
    Create pseudobulk as bed and bigwig from single cell fragments file given a barcode annotation and a group.
    Parameters
    ---------
    cell_data: pd.DataFrame
            A cell metadata :class:`pd.Dataframe` containing barcodes, their annotation and their sample of origin.
    group: str
            A character string indicating the group for which pseudobulks will be created.
    fragments_df_dict: dict
            A dictionary containing data frames as values with 'Chromosome', 'Start', 'End', 'Name', and 'Score' as columns; and sample label
            as keys. 'Score' indicates the number of times that a fragments is found assigned to that barcode.
    chromsizes: pr.PyRanges
            A :class:`pr.PyRanges` containing size of each column, containing 'Chromosome', 'Start' and 'End' columns.
    bed_path: str
            Path to folder where the fragments bed file will be saved.
    bigwig_path: str
            Path to folder where the bigwig file will be saved.
    sample_id_col: str, optional
            Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'.
    normalize_bigwig: bool, optional
            Whether bigwig files should be CPM normalized. Default: True.
    remove_duplicates: bool, optional
            Whether duplicates should be removed before converting the data to bigwig.
    split_pattern: str
            Pattern to split cell barcode from sample id. Default: ___ .
    """
    export_pseudobulk_one_sample(
        cell_data,
        group,
        fragments_df_dict,
        chromsizes,
        bigwig_path,
        bed_path,
        sample_id_col,
        normalize_bigwig,
        remove_duplicates,
        split_pattern,
    )

# consensus

In [7]:
cell_data_path_dict = {
    x.split("/")[-1].split(f"__")[0]: x
    for x in sorted(glob.glob("cistopic_objects/*consensus.cell_data.tsv"))
}
cell_data_path_dict

{'BIO_ddseq_1.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.cell_data.tsv',
 'BIO_ddseq_2.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_14topics.dimreduc.consensus.cell_data.tsv',
 'BIO_ddseq_3.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_10topics.dimreduc.consensus.cell_data.tsv',
 'BIO_ddseq_4.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.cell_data.tsv',
 'BRO_mtscatac_1.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.cell_data.tsv',
 'BRO_mtscatac_2.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_17topics.dimreduc.consensus.cell_data.tsv',
 'CNA_10xmultiome_1.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_1.FIXEDCELLS__cto.scrublet0-4.fmx

In [8]:
bw_path_dict = {
    x.split("/")[-1].split(f"__")[0]: x
    for x in sorted(glob.glob("final_consensus_peaks/*_pseudobulk_bw_files"))
}
bw_path_dict

{'BIO_ddseq_1.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_2.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_3.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_4.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'BRO_mtscatac_1.FIXEDCELLS': 'final_consensus_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'BRO_mtscatac_2.FIXEDCELLS': 'final_consensus_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'CNA_10xmultiome_1.FIXEDCELLS': 'final_consensus_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'CNA_10xmultiome_2.FIXEDCELLS': 'final_consensus_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'CNA_10xv11_1.FIXEDCELLS': 'final_consensus_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'CNA_10xv11_2.FIXEDCELLS': 'final_

In [9]:
bed_path_dict = {
    x.split("/")[-1].split(f"__")[0]: x
    for x in sorted(glob.glob("final_consensus_peaks/*_pseudobulk_bed_files"))
}
bed_path_dict

{'BIO_ddseq_1.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_2.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_3.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_4.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'BRO_mtscatac_1.FIXEDCELLS': 'final_consensus_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'BRO_mtscatac_2.FIXEDCELLS': 'final_consensus_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'CNA_10xmultiome_1.FIXEDCELLS': 'final_consensus_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'CNA_10xmultiome_2.FIXEDCELLS': 'final_consensus_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'CNA_10xv11_1.FIXEDCELLS': 'final_consensus_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'CNA_10xv11_2.FIXEDCELLS'

In [10]:
for sample in bed_path_dict.keys():
    bed_paths = {
        x.split("/")[-1].split("__")[0].split(".bed.gz")[0]: x
        for x in glob.glob(bed_path_dict[sample] + "/*")
    }

In [11]:
from pycisTopic.pseudobulk_peak_calling import *

In [12]:
import ray

# call consensus peaks

In [13]:
from pycisTopic.iterative_peak_calling import get_consensus_peaks

In [14]:
narrow_peaks_path_dict = {
    x.split("/")[-1].split(f"__")[0]: x
    for x in sorted(glob.glob("final_consensus_peaks/*_narrow_peaks_dict.pkl"))
}
narrow_peaks_path_dict

{'BIO_ddseq_1.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 'BIO_ddseq_2.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 'BIO_ddseq_3.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 'BIO_ddseq_4.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 'BRO_mtscatac_1.FIXEDCELLS': 'final_consensus_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 'BRO_mtscatac_2.FIXEDCELLS': 'final_consensus_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 'CNA_10xmultiome_1.FIXEDCELLS': 'final_consensus_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 'CNA_10xmultiome_2.FIXEDCELLS': 'final_consensus_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 'CNA_10xv11_1.FIXEDCELLS': 'final_consensus_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl',
 'CNA_10xv11_2.FI

In [136]:
path_to_blacklist = "../0_resources/regions/hg38-blacklist.v2.bed"

In [15]:
peak_half_width = 250

# Get consensus peaks
consensus_peaks_dict = {}
for sample in narrow_peaks_path_dict.keys():
    print(sample)
    consensus_out_path = narrow_peaks_path_dict[sample].replace(
        "_narrow_peaks_dict.pkl", "_consensus_peaks.bed"
    )
    if not os.path.exists(consensus_out_path):
        cell_data = pd.read_csv(cell_data_path_dict[sample])
        cto_celltypes = set(
            [
                x.replace(" ", "").replace("+", "").replace("_", "")
                for x in set(cell_data["consensus_cell_type"].unique())
            ]
        )

        with open(narrow_peaks_path_dict[sample], "rb") as f:
            narrow_peaks_dict = pickle.load(f)
        peaks_celltypes = set(
            [
                x.replace(" ", "").replace("+", "").replace("_", "")
                for x in set(narrow_peaks_dict.keys())
            ]
        )

        if cto_celltypes == peaks_celltypes:
            consensus_peaks = get_consensus_peaks(
                narrow_peaks_dict,
                peak_half_width,
                chromsizes=chromsizes,
                path_to_blacklist=path_to_blacklist,
            )

            consensus_peaks.to_bed(
                path=consensus_out_path, keep=True, compression="infer", chain=False
            )
        else:
            print("CELL TYPE SETS NOT MATCHING! Rerun peak calling.")
            print(peaks_celltypes - cto_celltypes)
            print(cto_celltypes - peaks_celltypes)
    else:
        print(f"{consensus_out_path} already exists, skipping...")

BIO_ddseq_1.FIXEDCELLS
final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_consensus_peaks.bed already exists, skipping...
BIO_ddseq_2.FIXEDCELLS
final_consensus_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_consensus_peaks.bed already exists, skipping...
BIO_ddseq_3.FIXEDCELLS
final_consensus_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_consensus_peaks.bed already exists, skipping...
BIO_ddseq_4.FIXEDCELLS
final_consensus_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_consensus_peaks.bed already exists, skipping...
BRO_mtscatac_1.FIXEDCELLS
final_consensus_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_consensus_peaks.bed already exists, skipping...
BRO_mtscatac_2.FIXEDCELLS
final_consensus_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_consensus_peaks.bed already exists, skipping...
CNA_10xmultiome_1.FIXEDCELLS
final_consensus_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_consensus_peaks.bed already exists, skipping...
CNA_10xmultiome_2.FIXEDCELLS
final_consensus_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_consensus_peaks.bed 

# Check % chrM in consensus peaks

In [138]:
consensus_peaks_path_dict = {
    x.split("/")[-1].split(f"__")[0]: x
    for x in sorted(glob.glob("final_consensus_peaks/*consensus_peaks.bed"))
}
consensus_peaks_path_dict

{'BIO_ddseq_1.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_2.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_3.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_4.FIXEDCELLS': 'final_consensus_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_1.FIXEDCELLS': 'final_consensus_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_2.FIXEDCELLS': 'final_consensus_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_1.FIXEDCELLS': 'final_consensus_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_2.FIXEDCELLS': 'final_consensus_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_1.FIXEDCELLS': 'final_consensus_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_2.FIXEDCELLS': 'final_

In [139]:
for sample, path in consensus_peaks_path_dict.items():
    print(sample)
    peaks_df = pd.read_csv(path, sep="\t", header=None)

    chroms_in_df = list(sorted(peaks_df[0].unique()))
    chroms_standard = ["chr" + str(x + 1) for x in range(22)] + ["chrX"]
    chroms_nonstandard = list(set(chroms_in_df) - set(chroms_standard) - set(["chrM"]))

    n_standard = peaks_df[0].value_counts()[chroms_standard].sum()
    n_contigs = peaks_df[0].value_counts()[chroms_nonstandard].sum()
    n_chrm = peaks_df[0].value_counts()["chrM"].sum()
    pct_nonstandard = (n_contigs + n_chrm) / len(peaks_df) * 100

    print(f"\tpeaks on standard chromosomes: {n_standard}")
    print(f"\tpeaks on contigs: {n_contigs}")
    print(f"\tpeaks on chrM: {n_chrm}")
    print(f"\t% peaks non standard chromosomes: {pct_nonstandard}%")
    if "chrY" in chroms_in_df:
        n_chrY = peaks_df[0].value_counts()["chrY"].sum()
        print(f"\tpeaks on chrY: {n_chrY}")
    else:
        print(f"\tpeaks on chrY: 0")

BIO_ddseq_1.FIXEDCELLS
	peaks on standard chromosomes: 128186
	peaks on contigs: 115
	peaks on chrM: 23
	% peaks non standard chromosomes: 0.10754028864436893%
	peaks on chrY: 0
BIO_ddseq_2.FIXEDCELLS
	peaks on standard chromosomes: 126797
	peaks on contigs: 55
	peaks on chrM: 22
	% peaks non standard chromosomes: 0.060690133518293735%
	peaks on chrY: 0
BIO_ddseq_3.FIXEDCELLS
	peaks on standard chromosomes: 167832
	peaks on contigs: 242
	peaks on chrM: 23
	% peaks non standard chromosomes: 0.15764707282104976%
	peaks on chrY: 170
BIO_ddseq_4.FIXEDCELLS
	peaks on standard chromosomes: 154291
	peaks on contigs: 245
	peaks on chrM: 23
	% peaks non standard chromosomes: 0.1733965670067741%
	peaks on chrY: 176
BRO_mtscatac_1.FIXEDCELLS
	peaks on standard chromosomes: 202745
	peaks on contigs: 254
	peaks on chrM: 21
	% peaks non standard chromosomes: 0.13545463501132893%
	peaks on chrY: 147
BRO_mtscatac_2.FIXEDCELLS
	peaks on standard chromosomes: 185765
	peaks on contigs: 224
	peaks on chrM

# run seurat

In [6]:
frags_path_dict = {
    x.split("/")[-1].split(f".fragments.tsv.gz")[0]: x
    for x in sorted(
        glob.glob(f"*k/*k_preprocessing_out/data/fragments/*fragments.tsv.gz")
    )
}
frags_path_dict

{}

In [7]:
scrub_name_suffix = "0-4"
frags_path_dict  = {x.split('/')[-1].split(f'.fragments.tsv.gz')[0] :x for x in sorted(glob.glob(f'../1_data_repository/fixedcells_fragments/*.fragments.tsv.gz'))}
frags_path_dict

{'BIO_ddseq_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_1.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_2.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_3.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_3.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_4.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_4.FIXEDCELLS.fragments.tsv.gz',
 'BRO_mtscatac_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BRO_mtscatac_1.FIXEDCELLS.fragments.tsv.gz',
 'BRO_mtscatac_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BRO_mtscatac_2.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xmultiome_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xmultiome_1.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xmultiome_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xmultiome_2.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xv11_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragmen

In [8]:
scrub_name_suffix = "0-4"
loom_path_dict  = {x.split('/')[-1].split(f'__')[0]:x for x in sorted(glob.glob(f'cell_region_looms/*singlets.loom'))}
loom_path_dict

{'BIO_ddseq_1.FIXEDCELLS': 'cell_region_looms/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom',
 'BIO_ddseq_2.FIXEDCELLS': 'cell_region_looms/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom',
 'BIO_ddseq_3.FIXEDCELLS': 'cell_region_looms/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom',
 'BIO_ddseq_4.FIXEDCELLS': 'cell_region_looms/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom',
 'BRO_mtscatac_1.FIXEDCELLS': 'cell_region_looms/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom',
 'BRO_mtscatac_2.FIXEDCELLS': 'cell_region_looms/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom',
 'CNA_10xmultiome_1.FIXEDCELLS': 'cell_region_looms/CNA_10xmultiome_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom',
 'CNA_10xmultiome_2.FIXEDCELLS': 'cell_region_looms/CNA_10xmultiome_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom',
 'CNA_10xv11_1.FIXEDCELLS': 'cell_region_looms/CNA_10xv11_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom',
 'CNA_10

In [16]:
!cat ../0_resources/scripts/seurat_label_transfer_consensus.R

#!/usr/bin/env Rscript
library(Seurat)
library(SeuratDisk)
library(Signac)
# library(EnsDb.Hsapiens.v86)
library(ggplot2)
library(stringr)

args = commandArgs(trailingOnly=TRUE)
sample_id = args[1]
f_loom = args[2]
f_frag = args[3]
f_reference = args[4]
f_annotation = args[5]
f_out = args[6]

print(paste0("Processing sample ", args[1]))

# load pbmc object
pbmc.rna <- readRDS("../0_resources/seurat_references/pbmc_integrated.RDS")
#pbmc.rna <- readRDS('/lustre1/project/stg_00090/scatac_benchmark/0_resources/seurat_references/pbmc_ssc_mat__integrated.rds')

################################################################################
# ATAC
################################################################################

### get data from loom:
atacloomcon <- Connect(filename = f_loom, mode = "r")
atacloomcon
atac_tmp <- as.Seurat(atacloomcon, assay='ATAC')
atacloomcon$close_all()

# subset by removing contig chromosomes
rawregions = rownames(GetAssayData(atac_tmp, slot = "counts", a

In [9]:
parallel_filename = "seurat_label_transfer.parallel"
script_path = "../0_resources/scripts/seurat_label_transfer_consensus.R"
img_path = "../0_resources/vsn_cache/cflerin-seurat-4.0.3-plus.sif"
reference_path = "../0_resources/seurat_references/pbmc_ref.rds"
annotation_path = "../0_resources/seurat_references/granges_annotation.rds"
bind_mounts = "/dodrio,/readonly/dodrio,/tmp"
#bind_mounts = "/lustre1,/staging,${VSC_SCRATCH}/tmp:/tmp"

with open(parallel_filename, 'w') as f:
    for sample in loom_path_dict.keys():
        outfile = f"cell_type_classification/{sample}__cell_type_seurat.txt"
        if not os.path.exists(outfile):
            loomfile = loom_path_dict[sample]
            fragfile = frags_path_dict[sample]
            command=f"img_path={img_path} && singularity exec --cleanenv -H $PWD -B {bind_mounts} $img_path Rscript {script_path} {sample} {loomfile} {fragfile} {reference_path} {annotation_path} {outfile}"
            f.write(f"{command}\n")
            print(command)

        else:
            f.write(f"#{outfile} already exists!\n")
            print(f"#{outfile} already exists!")

#cell_type_classification/BIO_ddseq_1.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BIO_ddseq_2.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BIO_ddseq_3.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BIO_ddseq_4.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BRO_mtscatac_1.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BRO_mtscatac_2.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xmultiome_1.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xmultiome_2.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xv11_1.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xv11_2.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xv11_3.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classificat

In [10]:
parallel_filename = "seurat_label_transfer.parallel"
script_path = "../0_resources/scripts/seurat_label_transfer_consensus.R"
img_path = "../0_resources/vsn_cache/cflerin-seurat-4.0.3-plus.sif"
reference_path = "../0_resources/seurat_references/pbmc_ref.rds"
annotation_path = "../0_resources/seurat_references/granges_annotation.rds"
bind_mounts = "/dodrio,/readonly/dodrio,/tmp"
#bind_mounts = "/lustre1,/staging,${VSC_SCRATCH}/tmp:/tmp"

with open(parallel_filename, 'w') as f:
    for sample in loom_path_dict.keys():
        outfile = f"cell_type_classification/{sample}__cell_type_seurat.txt"
        if not os.path.exists(outfile):
            loomfile = loom_path_dict[sample]
            fragfile = frags_path_dict[sample]
            command=f"Rscript {script_path} {sample} {loomfile} {fragfile} {reference_path} {annotation_path} {outfile}"
            f.write(f"{command}\n")
            print(command)

        else:
            f.write(f"#{outfile} already exists!\n")
            print(f"#{outfile} already exists!")

#cell_type_classification/BIO_ddseq_1.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BIO_ddseq_2.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BIO_ddseq_3.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BIO_ddseq_4.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BRO_mtscatac_1.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BRO_mtscatac_2.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xmultiome_1.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xmultiome_2.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xv11_1.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xv11_2.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xv11_3.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classificat

In [19]:
parallel_filename = "seurat_label_transfer.parallel"
script_path = "../0_resources/scripts/seurat_label_transfer_consensus.R"
img_path = "../0_resources/vsn_cache/cflerin-seurat-4.0.3-plus.sif"
reference_path = "../0_resources/seurat_references/pbmc_ref.rds"
annotation_path = "../0_resources/seurat_references/granges_annotation.rds"
bind_mounts = "/dodrio,/readonly/dodrio,/tmp"
#bind_mounts = "/lustre1,/staging,${VSC_SCRATCH}/tmp:/tmp"

with open(parallel_filename, 'w') as f:
    for sample in loom_path_dict.keys():
        outfile = f"cell_type_classification/{sample}__cell_type_seurat.txt"
        if not os.path.exists(outfile):
            loomfile = loom_path_dict[sample]
            fragfile = frags_path_dict[sample]
            command=f"Rscript {script_path} {sample} {loomfile} {fragfile} {reference_path} {annotation_path} {outfile}"
            f.write(f"{command}\n")
            print(command)

        else:
            f.write(f"#{outfile} already exists!\n")
            print(f"#{outfile} already exists!")

#cell_type_classification/BIO_ddseq_1.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BIO_ddseq_2.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BIO_ddseq_3.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BIO_ddseq_4.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BRO_mtscatac_1.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/BRO_mtscatac_2.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xmultiome_1.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xmultiome_2.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xv11_1.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xv11_2.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classification/CNA_10xv11_3.FIXEDCELLS__cell_type_seurat.txt already exists!
#cell_type_classificat

# Run the Rscript

```
mkdir cell_type_classification
export OMP_THREAD_LIMIT=80
cat seurat_label_transfer.parallel | parallel -j 2 --progress
```

In [20]:
kill $(jobs -p)

SyntaxError: invalid syntax (4107442686.py, line 1)