In [15]:
from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling
import pyranges as pr
import requests
import os
import pandas as pd
import glob
import pickle

In [16]:
# Load the lab_black IPython extension for this kernel session
# (presumably auto-formats executed cells with Black -- confirm).
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [17]:
# Get chromosome sizes (hg38), cached locally in "chromsizes.txt".
# First run: download the UCSC chrom.sizes table, reshape it into
# Chromosome/Start/End columns, wrap it in PyRanges and write it to disk.
# Later runs: read the cached file instead of going to the network.
if not os.path.exists("chromsizes.txt"):
    target_url = (
        "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes"
    )
    chromsizes = pd.read_csv(target_url, sep="\t", header=None)
    chromsizes.columns = ["Chromosome", "End"]
    # Every chromosome interval starts at 0; the scalar is broadcast to all rows.
    chromsizes["Start"] = 0
    chromsizes = chromsizes.loc[:, ["Chromosome", "Start", "End"]]
    chromsizes = pr.PyRanges(chromsizes)
    chromsizes.to_csv("chromsizes.txt")
else:
    # Wrap the cached table in PyRanges as well, so `chromsizes` has the same
    # type whether it was just downloaded or read back from the cache
    # (previously the cached branch left a plain pandas DataFrame).
    chromsizes = pr.PyRanges(pd.read_csv("chromsizes.txt"))

# Write cell data (export each sample's cell metadata table to CSV)

In [18]:
# Map each sample name (the part of the file name before "__") to the path of
# its consensus cisTopic object pickle. Paths are visited in sorted order; if
# two files share a sample name, the later one wins.
cto_consensus_path_dict = dict(
    (pkl_path.rsplit("/", 1)[-1].partition("__")[0], pkl_path)
    for pkl_path in sorted(
        glob.glob("cistopic_objects/*singlets.model*consensus.pkl")
    )
)
cto_consensus_path_dict

{'BIO_ddseq_1.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_9topics.dimreduc.consensus.pkl',
 'BIO_ddseq_2.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_14topics.dimreduc.consensus.pkl',
 'BIO_ddseq_3.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.pkl',
 'BIO_ddseq_4.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_7topics.dimreduc.consensus.pkl',
 'BRO_mtscatac_1.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_12topics.dimreduc.consensus.pkl',
 'BRO_mtscatac_2.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_10topics.dimreduc.consensus.pkl',
 'CNA_10xmultiome_1.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_12topics.dimreduc.consensus.pkl',
 'CNA_10xmul

In [19]:
# Export the cell metadata of every consensus cisTopic object to a CSV that
# sits next to its pickle (".pkl" -> ".cell_data.csv"). Samples whose CSV is
# already on disk are reported and skipped, so re-running the cell is cheap.
for sample, cto_path in cto_consensus_path_dict.items():
    cell_data_path = cto_path.replace(".pkl", ".cell_data.csv")
    if os.path.exists(cell_data_path):
        print(f"{cell_data_path} already exists, skipping...")
        continue

    # NOTE: unpickling runs arbitrary code from the file; only load pickles
    # produced by a trusted pipeline.
    with open(cto_path, "rb") as f:
        cto = pickle.load(f)

    cto.cell_data.to_csv(cell_data_path)
    print(f"written {cell_data_path}")

cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_9topics.dimreduc.consensus.cell_data.csv already exists, skipping...
cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_14topics.dimreduc.consensus.cell_data.csv already exists, skipping...
cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.cell_data.csv already exists, skipping...
cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_7topics.dimreduc.consensus.cell_data.csv already exists, skipping...
cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_12topics.dimreduc.consensus.cell_data.csv already exists, skipping...
cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_10topics.dimreduc.consensus.cell_data.csv already exists, skipping...
cistopic_objects/CNA_10xmultiome_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_12topics.dimreduc.consensus.cell_data.

# consensus

In [20]:
# Index the exported cell-metadata CSVs by sample name, i.e. by the part of
# the file name that precedes the first "__".
cell_data_path_dict = {
    csv_path.rsplit("/", 1)[-1].partition("__")[0]: csv_path
    for csv_path in sorted(glob.glob("cistopic_objects/*.cell_data.csv"))
}
cell_data_path_dict

{'BIO_ddseq_1.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_9topics.dimreduc.consensus.cell_data.csv',
 'BIO_ddseq_2.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_14topics.dimreduc.consensus.cell_data.csv',
 'BIO_ddseq_3.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_11topics.dimreduc.consensus.cell_data.csv',
 'BIO_ddseq_4.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_7topics.dimreduc.consensus.cell_data.csv',
 'BRO_mtscatac_1.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_12topics.dimreduc.consensus.cell_data.csv',
 'BRO_mtscatac_2.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_10topics.dimreduc.consensus.cell_data.csv',
 'CNA_10xmultiome_1.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_1.FIXEDCELLS__cto.scrublet0-4.fmx.s

In [21]:
# Sample name -> directory holding that sample's pseudobulk bigWig files.
# The sample name is the directory name up to the first "__".
bw_path_dict = dict(
    (bw_dir.rsplit("/", 1)[-1].partition("__")[0], bw_dir)
    for bw_dir in sorted(glob.glob("SCREEN_peaks/*_pseudobulk_bw_files"))
)
bw_path_dict

{'BIO_ddseq_1.FIXEDCELLS': 'SCREEN_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_2.FIXEDCELLS': 'SCREEN_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_3.FIXEDCELLS': 'SCREEN_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'BIO_ddseq_4.FIXEDCELLS': 'SCREEN_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'BRO_mtscatac_1.FIXEDCELLS': 'SCREEN_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'BRO_mtscatac_2.FIXEDCELLS': 'SCREEN_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'CNA_10xmultiome_1.FIXEDCELLS': 'SCREEN_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'CNA_10xmultiome_2.FIXEDCELLS': 'SCREEN_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'CNA_10xv11_1.FIXEDCELLS': 'SCREEN_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'CNA_10xv11_2.FIXEDCELLS': 'SCREEN_peaks/CNA_10xv11_2.FIXEDCELLS__SCREEN_pseudobulk_bw_files',
 'CNA_10xv11_3.FIXED

In [22]:
# Sample name -> directory holding that sample's pseudobulk BED files.
# The sample name is the directory name up to the first "__".
bed_path_dict = dict(
    (bed_dir.rsplit("/", 1)[-1].partition("__")[0], bed_dir)
    for bed_dir in sorted(glob.glob("SCREEN_peaks/*_pseudobulk_bed_files"))
)
bed_path_dict

{'BIO_ddseq_1.FIXEDCELLS': 'SCREEN_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_2.FIXEDCELLS': 'SCREEN_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_3.FIXEDCELLS': 'SCREEN_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'BIO_ddseq_4.FIXEDCELLS': 'SCREEN_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'BRO_mtscatac_1.FIXEDCELLS': 'SCREEN_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'BRO_mtscatac_2.FIXEDCELLS': 'SCREEN_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'CNA_10xmultiome_1.FIXEDCELLS': 'SCREEN_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'CNA_10xmultiome_2.FIXEDCELLS': 'SCREEN_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'CNA_10xv11_1.FIXEDCELLS': 'SCREEN_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'CNA_10xv11_2.FIXEDCELLS': 'SCREEN_peaks/CNA_10xv11_2.FIXEDCELLS__SCREEN_pseudobulk_bed_files',
 'CNA_10xv

In [23]:
# Build, per sample, a mapping from BED file stem (file name cut at the first
# "__" and at ".bed.gz") to the file's path.
# NOTE: `bed_paths` is rebound on every iteration, so after the loop it only
# holds the mapping for the last sample in `bed_path_dict`.
for sample in bed_path_dict:
    bed_paths = dict(
        (
            bed_file.rsplit("/", 1)[-1].partition("__")[0].partition(".bed.gz")[0],
            bed_file,
        )
        for bed_file in glob.glob(bed_path_dict[sample] + "/*")
    )

In [24]:
from pycisTopic.pseudobulk_peak_calling import *

In [25]:
import ray

In [26]:
# Call peaks with MACS2 on the pseudobulk BED files of every sample.
#
# For each sample in `bed_path_dict`:
#   * results are cached in "<sample>__SCREEN_narrow_peaks_dict.pkl"; samples
#     whose pickle already exists are reported and skipped;
#   * before calling peaks, the cell types in the sample's cell-data CSV are
#     compared (after stripping " ", "+" and "_") with the cell types implied
#     by the BED file names; on a mismatch the sample is reported and skipped;
#   * otherwise `peak_calling` is run and its return value is pickled.
#
# `narrow_peaks_dict` is rebound to the result of each `peak_calling` call, so
# after the loop it holds the result for the last sample that was actually
# processed (or stays {} if every sample was skipped).
narrow_peaks_dict = {}
# Presumably clears any Ray session left over from an earlier run -- confirm.
ray.shutdown()
for sample in bed_path_dict.keys():
    narrow_peaks_dict_path = bed_path_dict[sample].replace(
        "_pseudobulk_bed_files", "_narrow_peaks_dict.pkl"
    )
    peak_path = os.path.join("SCREEN_peaks", f"{sample}__SCREEN_consensus_peaks")
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # exists()+mkdir() and also creates a missing parent directory.
    os.makedirs(peak_path, exist_ok=True)

    if os.path.exists(narrow_peaks_dict_path):
        print(f"{narrow_peaks_dict_path} already exists")
        continue

    # A sample can have BED files but no exported cell data; report it and
    # move on instead of aborting all remaining samples with a KeyError.
    if sample not in cell_data_path_dict:
        print(f"{sample} has no cell data csv!! Rerun cell data writing.")
        continue

    cell_data = pd.read_csv(cell_data_path_dict[sample])
    cto_celltypes = {
        x.replace(" ", "").replace("+", "").replace("_", "")
        for x in cell_data["consensus_cell_type"].unique()
    }

    # Cell type implied by a BED file name: everything before the first ".".
    bed_celltypes = {
        x.split(".")[0].replace("+", "").replace("_", "")
        for x in os.listdir(bed_path_dict[sample])
    }

    if cto_celltypes != bed_celltypes:
        print(f"{sample} cell types not matching!! Rerun bed file writing.")
        print(f"\t{bed_celltypes}")
        print(f"\t{cto_celltypes}")
        continue

    print(f"Starting {narrow_peaks_dict_path}")
    bed_paths = {
        x.split("/")[-1].split("__")[0].split(".bed.gz")[0]: x
        for x in glob.glob(bed_path_dict[sample] + "/*")
    }
    narrow_peaks_dict = peak_calling(
        macs_path="macs2",
        bed_paths=bed_paths,
        outdir=peak_path,
        genome_size="hs",
        n_cpu=20,
        input_format="BEDPE",
        shift=73,
        ext_size=146,
        keep_dup="all",
        q_value=0.05,
    )
    with open(narrow_peaks_dict_path, "wb") as f:
        pickle.dump(narrow_peaks_dict, f)

SCREEN_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/CNA_10xv11_2.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/CNA_10xv11_3.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN_peaks/CNA_10xv11_4.FIXEDCELLS__SCREEN_narrow_peaks_dict.pkl already exists
SCREEN

In [27]:
# Debug inspection: shows the value `sample` was left with by the most
# recently executed `for sample in ...` loop.
sample

'VIB_hydrop_22.FIXEDCELLS'

In [28]:
# Debug inspection: distinct consensus cell types in the current `cell_data`.
# NOTE(review): `cell_data` is only assigned inside the peak-calling loop, and
# only for samples whose narrow-peaks pickle is missing, so on a fresh kernel
# with every pickle already cached this cell raises NameError.
cell_data["consensus_cell_type"].unique()

array(['B cell', 'CD14+ monocyte', 'CD4+ T cell', 'Cytotoxic T cell'],
      dtype=object)