In [20]:
import pycisTopic
import glob
import os
import pybiomart as pbm
import pandas as pd
import pickle
from pycisTopic.qc import *
from IPython.display import Image, display
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import multiprocess as mp  # for kde multithreading calculation
from multiprocess import Pool

%matplotlib inline
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


# Download annotation

In [3]:
# Print the current working directory; the next cell derives depth_str from
# its last path component.
!pwd

/dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/35k


In [2]:
# Name of the directory the notebook runs in (e.g. "35k"); used below as the
# sample suffix and to build paths.
depth_str = os.path.basename(os.getcwd())

In [4]:
# Root directory holding one sub-directory per downsampling depth.
downsample_series_root = "/dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series"

# Working directory for this depth; all relative paths below resolve against it.
wdir = f"{downsample_series_root}/{depth_str}"
os.chdir(wdir)

In [5]:
# Build the TSS annotation table used for the QC statistics, loading it from a
# TSV cache when one exists and downloading it from Ensembl BioMart otherwise.
genome = "hg38"

# BioMart dataset name for each supported genome build.
pbm_genome_name_dict = {
    "hg38": "hsapiens_gene_ensembl",
    "hg37": "hsapiens_gene_ensembl",
    "mm10": "mmusculus_gene_ensembl",
    "dm6": "dmelanogaster_gene_ensembl",
}

# Ensembl host queried for each supported genome build.
pbm_host_dict = {
    "hg38": "http://www.ensembl.org",
    "hg37": "http://grch37.ensembl.org/",
    "mm10": "http://nov2020.archive.ensembl.org/",
    "dm6": "http://www.ensembl.org",
}

# One cache location for both branches. Previously the cache was looked up at
# "../annotation.tsv" but saved to "annotation.tsv", so a freshly downloaded
# table was never found by the next run.
annotation_cache_path = "../annotation.tsv"

if os.path.exists(annotation_cache_path):
    print("Loading cached genome annotation...")
    annotation = pd.read_csv(annotation_cache_path, sep="\t", header=0, index_col=0)
else:
    dataset = pbm.Dataset(name=pbm_genome_name_dict[genome], host=pbm_host_dict[genome])

    annotation = dataset.query(
        attributes=[
            "chromosome_name",
            "transcription_start_site",
            "strand",
            "external_gene_name",
            "transcript_biotype",
        ]
    )
    # Drop rows whose chromosome/scaffold name contains CHR, GL, JH or MT.
    # (Named mask instead of `filter`, which shadowed the builtin.)
    excluded_scaffold_mask = annotation["Chromosome/scaffold name"].str.contains(
        "CHR|GL|JH|MT"
    )
    # .copy() so the column assignment below writes to an independent frame
    # rather than to a slice of the query result.
    annotation = annotation[~excluded_scaffold_mask].copy()
    # The pattern is a regular expression; pandas >= 2.0 treats the pattern as
    # a literal string unless regex=True is passed, which would silently leave
    # the names without the "chr" prefix.
    annotation["Chromosome/scaffold name"] = annotation[
        "Chromosome/scaffold name"
    ].str.replace(r"(\b\S)", r"chr\1", regex=True)
    annotation.columns = ["Chromosome", "Start", "Strand", "Gene", "Transcript_type"]
    annotation = annotation[annotation.Transcript_type == "protein_coding"]
    annotation.to_csv(annotation_cache_path, sep="\t")

Loading cached genome annotation...


In [6]:
# All fragment files produced by the preprocessing step for this depth.
fragments_list = sorted(
    glob.glob(f"{depth_str}_preprocessing_out/data/fragments/*fragments.tsv.gz")
)

# Map sample name (file name without the ".fragments.tsv.gz" suffix) -> path.
fragments_dict = {
    path.split("/")[-1].split(".fragments.tsv.gz")[0]: path
    for path in fragments_list
}

fragments_dict

{'BIO_ddseq_1.35k': '35k_preprocessing_out/data/fragments/BIO_ddseq_1.35k.fragments.tsv.gz',
 'BIO_ddseq_2.35k': '35k_preprocessing_out/data/fragments/BIO_ddseq_2.35k.fragments.tsv.gz',
 'BIO_ddseq_3.35k': '35k_preprocessing_out/data/fragments/BIO_ddseq_3.35k.fragments.tsv.gz',
 'BIO_ddseq_4.35k': '35k_preprocessing_out/data/fragments/BIO_ddseq_4.35k.fragments.tsv.gz',
 'BRO_mtscatac_1.35k': '35k_preprocessing_out/data/fragments/BRO_mtscatac_1.35k.fragments.tsv.gz',
 'BRO_mtscatac_2.35k': '35k_preprocessing_out/data/fragments/BRO_mtscatac_2.35k.fragments.tsv.gz',
 'CNA_10xmultiome_1.35k': '35k_preprocessing_out/data/fragments/CNA_10xmultiome_1.35k.fragments.tsv.gz',
 'CNA_10xmultiome_2.35k': '35k_preprocessing_out/data/fragments/CNA_10xmultiome_2.35k.fragments.tsv.gz',
 'CNA_10xv11_1.35k': '35k_preprocessing_out/data/fragments/CNA_10xv11_1.35k.fragments.tsv.gz',
 'CNA_10xv11_2.35k': '35k_preprocessing_out/data/fragments/CNA_10xv11_2.35k.fragments.tsv.gz',
 'CNA_10xv11_3.35k': '35k_prep

In [7]:
# Map sample name -> consensus peak BED file. The sample name is the part of
# the file name before "__", with the "FIXEDCELLS" placeholder replaced by the
# current depth so the keys line up with those of fragments_dict.
regions_paths_dict = {}
for bed_path in sorted(glob.glob("../SCREEN_peaks/*consensus_peaks.bed")):
    bed_name = bed_path.split("/")[-1]
    sample_key = bed_name.split("__")[0].replace("FIXEDCELLS", depth_str)
    regions_paths_dict[sample_key] = bed_path
regions_paths_dict

{'BIO_ddseq_1.35k': '../SCREEN_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_2.35k': '../SCREEN_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_3.35k': '../SCREEN_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_4.35k': '../SCREEN_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_1.35k': '../SCREEN_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_2.35k': '../SCREEN_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_1.35k': '../SCREEN_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_2.35k': '../SCREEN_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_1.35k': '../SCREEN_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_2.35k': '../SCREEN_peaks/CNA_10xv11_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_3.35k': '../SCREEN_peaks/CNA_10xv11_3.FIXEDCELL

In [12]:
# Register additional VIB_hydrop sample names that reuse the peak file of an
# existing sample: (new key prefix, existing key prefix).
hydrop_region_aliases = (
    ("VIB_hydrop_11.", "VIB_hydrop_1."),
    ("VIB_hydrop_12.", "VIB_hydrop_1."),
    ("VIB_hydrop_21.", "VIB_hydrop_2."),
    ("VIB_hydrop_22.", "VIB_hydrop_2."),
)
for alias_prefix, source_prefix in hydrop_region_aliases:
    regions_paths_dict[alias_prefix + depth_str] = regions_paths_dict[
        source_prefix + depth_str
    ]

In [14]:
# Display the mapping again to check the VIB_hydrop_11/12/21/22 entries added
# in the previous cell.
regions_paths_dict

{'BIO_ddseq_1.35k': '../SCREEN_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_2.35k': '../SCREEN_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_3.35k': '../SCREEN_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_4.35k': '../SCREEN_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_1.35k': '../SCREEN_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_2.35k': '../SCREEN_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_1.35k': '../SCREEN_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_2.35k': '../SCREEN_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_1.35k': '../SCREEN_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_2.35k': '../SCREEN_peaks/CNA_10xv11_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_3.35k': '../SCREEN_peaks/CNA_10xv11_3.FIXEDCELL

Now, make a sub dictionary of all samples within the fragments dict that have not been run yet (good for resuming a stopped cistopic run):

In [15]:
# Output directory for the per-sample QC pickles written further below.
cistopic_qc_out = os.path.join(wdir, "cistopic_qc_out_CONSENSUS")
# exist_ok=True replaces the separate exists() check: one call, and no window
# in which the directory can appear between the check and the creation.
os.makedirs(cistopic_qc_out, exist_ok=True)

In [16]:
# Collect the samples that still need QC: a sample is queued when its
# "<sample>__metadata_bc.pkl" is not yet present in cistopic_qc_out.
fragments_sub_dict = {}
regions_sub_dict = {}
for sample, fragments_file in fragments_dict.items():
    metadata_file = os.path.join(cistopic_qc_out, sample + "__metadata_bc.pkl")
    print(f"Checking if {metadata_file} exist...")
    if not os.path.exists(metadata_file):
        fragments_sub_dict[sample] = fragments_file
        print("\tMetadata does not exist, adding to subdict to generate")
    else:
        print("\tMetadata exists! Skipping...")

Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/35k/cistopic_qc_out_CONSENSUS/BIO_ddseq_1.35k__metadata_bc.pkl exist...
	Metadata does not exist, adding to subdict to generate
Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/35k/cistopic_qc_out_CONSENSUS/BIO_ddseq_2.35k__metadata_bc.pkl exist...
	Metadata does not exist, adding to subdict to generate
Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/35k/cistopic_qc_out_CONSENSUS/BIO_ddseq_3.35k__metadata_bc.pkl exist...
	Metadata does not exist, adding to subdict to generate
Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/35k/cistopic_qc_out_CONSENSUS/BIO_ddseq_4.35k__metadata_bc.pkl exist...
	Metadata does not exist, adding to subdict to generate
Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsa

In [17]:
# Copy of regions_paths_dict with its entries ordered by sample name.
regions_sub_dict = dict(sorted(regions_paths_dict.items()))

In [18]:
# Inspect the key-sorted copy of regions_paths_dict built in the previous cell.
regions_sub_dict

{'BIO_ddseq_1.35k': '../SCREEN_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_2.35k': '../SCREEN_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_3.35k': '../SCREEN_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_4.35k': '../SCREEN_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_1.35k': '../SCREEN_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_2.35k': '../SCREEN_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_1.35k': '../SCREEN_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_2.35k': '../SCREEN_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_1.35k': '../SCREEN_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_2.35k': '../SCREEN_peaks/CNA_10xv11_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_3.35k': '../SCREEN_peaks/CNA_10xv11_3.FIXEDCELL

In [19]:
# Do both dicts cover exactly the same samples? Shown via display() because a
# bare expression that is not the last line of a cell is evaluated and then
# discarded, so this check previously produced no output.
display(set(regions_sub_dict) == set(fragments_sub_dict))
# Samples that have a regions file but are absent from fragments_sub_dict
# (no fragments file was found for them, or their metadata already exists).
set(regions_sub_dict) - set(fragments_sub_dict)

{'UCS_ddseq_1.35k', 'UCS_ddseq_2.35k', 'VIB_hydrop_1.35k', 'VIB_hydrop_2.35k'}

In [14]:
# Samples for which both a fragments file and a regions file are available.
common = set(fragments_sub_dict) & set(regions_sub_dict)

In [15]:
# Number of samples present in both fragments_sub_dict and regions_sub_dict.
len(common)

45

In [16]:
# Restrict both dicts to the shared samples. Iterate in sorted order: the
# iteration order of a set of strings is not stable between interpreter runs,
# and the order of these dicts decides which samples end up in the same batch
# when the QC run below slices them into blocks.
regions_sub_dict = {x: regions_sub_dict[x] for x in sorted(common)}
fragments_sub_dict = {x: fragments_sub_dict[x] for x in sorted(common)}

In [17]:
# Shut down Ray before starting the QC run below.
# NOTE(review): `ray` is not imported by name in the import cell; presumably it
# enters the namespace through `from pycisTopic.qc import *` — confirm.
ray.shutdown()

In [12]:
# Run the pycisTopic QC on the queued samples in batches of `n_cores` samples
# and pickle the per-sample results into cistopic_qc_out.
n_cores = 16

# Statistics requested from compute_qc_stats for every sample.
qc_stats_to_compute = [
    "barcode_rank_plot",
    "duplicate_rate",
    "insert_size_distribution",
    "profile_tss",
    "frip",
]


def _dump_pickle(obj, path):
    """Serialize `obj` to `path` with pickle protocol 4."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle, protocol=4)


if regions_sub_dict == {}:
    print("All samples already processed.")
else:
    samples_sub = list(regions_sub_dict.keys())
    # Consecutive slices of at most n_cores samples each.
    blocks = [
        samples_sub[start : start + n_cores]
        for start in range(0, len(samples_sub), n_cores)
    ]
    for samples_torun_in_block in blocks:
        fragments_sub_dict_block = {}
        regions_sub_dict_block = {}
        for key in samples_torun_in_block:
            fragments_sub_dict_block[key] = fragments_sub_dict[key]
            regions_sub_dict_block[key] = regions_sub_dict[key]

        # The min_norm argument is intentionally left at its library default
        # (it was commented out as `min_norm=0.2` in the original call).
        metadata_bc_dict, profile_data_dict = compute_qc_stats(
            fragments_dict=fragments_sub_dict_block,
            tss_annotation=annotation,
            stats=qc_stats_to_compute,
            label_list=None,
            path_to_regions=regions_sub_dict_block,
            n_cpu=n_cores,
            valid_bc=None,
            n_frag=10,
            n_bc=None,
            tss_flank_window=2000,
            tss_window=50,
            tss_minimum_signal_window=100,
            tss_rolling_window=10,
            remove_duplicates=True,
        )

        ray.shutdown()
        print(f"Dumping files in {cistopic_qc_out}...")
        for sample in sorted(metadata_bc_dict.keys()):
            sample_metadata = metadata_bc_dict[sample]
            sample_metadata["sample_id"] = sample
            # Append "___<sample>" to every barcode in the index.
            sample_metadata.index = [
                barcode + "___" + sample for barcode in list(sample_metadata.index)
            ]
            _dump_pickle(
                sample_metadata,
                os.path.join(cistopic_qc_out, f"{sample}__metadata_bc.pkl"),
            )
            _dump_pickle(
                profile_data_dict[sample],
                os.path.join(cistopic_qc_out, f"{sample}__profile_data.pkl"),
            )

All samples already processed.


# Plot

Calculating a KDE is expensive and scales poorly with increasing n. Therefore, I wrote a parallelized script that divides the QC array into equal parts (interleaved to avoid biases in the order!) and performs a KDE calculation on each part. Here, Otsu thresholding is used to find the right threshold for minimum fragments and minimum TSS enrichment. ddseq samples have a significantly higher noise floor than the other samples when it comes to fragment distribution. Therefore, the Otsu algorithm is only performed on barcodes above a per-sample minimum fragment count: 300 fragments for the ddseq and hydrop samples (600 for BIO_ddseq_1), and 100 fragments for all the other samples. I tried to perform this filtering completely independently of sample/technique (e.g. using Gaussian mixture modeling, Jenks natural breaks, or multi-step Otsu thresholding) but found that no solution worked perfectly for all samples.

This is regulated by the code below in qc_plots.py:
```
min_otsu_frags_dict = {}
for fragments_file in fragments_list:
    sample = fragments_file.split("/")[-1].split(".")[0]
    tech = sample.split('_')[1]
    if tech == "ddseq":
        if sample == "BIO_ddseq_1":
            min_otsu_frags_dict[sample] = 600
        else:
            min_otsu_frags_dict[sample] = 300
    elif tech == "hydrop":
        min_otsu_frags_dict[sample] = 300
    else:
        min_otsu_frags_dict[sample] = 100
```

In [13]:
# Print the plotting script for reference.
# NOTE(review): in the recorded run this relative path did not resolve from the
# notebook's working directory — confirm where qc_plots.py lives.
!cat ../0_resources/scripts/qc_plots.py

cat: ../0_resources/scripts/qc_plots.py: No such file or directory


Since multiprocessing does not work within Jupyter notebooks, run the following commands in a terminal:

```
mkdir plots_qc
mkdir selected_barcodes
SIF=../0_resources/cistopic_image/20220722_pycistopic.sif
singularity exec \
    --cleanenv \
    -H $PWD:/home \
    $SIF \
    python ../0_resources/scripts/qc_plots.py
```

And then open the plots:

In [14]:
# List the per-sample QC metadata pickles and, for every sample whose selected
# barcodes already exist, show the QC plot generated by qc_plots.py.
#
# The pickles are globbed from `cistopic_qc_out`, the directory this notebook
# dumps them into. The previous literal "cistopic_qc_out/" pointed at a
# different directory, so the loop below had no samples to iterate over.
metadata_bc_pkl_list = sorted(
    glob.glob(os.path.join(cistopic_qc_out, "*metadata_bc.pkl"))
)

# Map sample name (file name up to the first "__") -> metadata pickle path.
metadata_bc_pkl_path_dict = {}
for metadata_bc_pkl_path in metadata_bc_pkl_list:
    sample = os.path.basename(metadata_bc_pkl_path).split("__")[0]
    metadata_bc_pkl_path_dict[sample] = metadata_bc_pkl_path

for sample in metadata_bc_pkl_path_dict:
    if os.path.exists(f"selected_barcodes/{sample}_bc_passing_filters_otsu.pkl"):
        print(f"{sample} bc passing filters exists, printing img and skipping")
        display(Image(f"plots_qc/{sample}_qc_otsu.png"))
    else:
        print(
            f"{sample} bc passing filters does not exist yet, generate using qc_plots.py script!"
        )

# Write raw barcode file

Freemuxlet reads the barcodes from the bam file. These barcodes (tag: DB) do not contain the `___<sample>` suffix that is appended to every barcode in the `*_bc_passing_filters_otsu.txt` files:

In [15]:
!head selected_barcodes/BIO_ddseq_1.LIBDS_bc_passing_filters_otsu.txt

head: cannot open 'selected_barcodes/BIO_ddseq_1.LIBDS_bc_passing_filters_otsu.txt' for reading: No such file or directory


In [16]:
# For every selected-barcode list, write a companion ".RAW.txt" file holding
# the same barcodes with everything from the first "___" onwards removed.
for barcode_list_path in glob.glob("selected_barcodes/*_bc_passing_filters_otsu.txt"):
    print(barcode_list_path)
    sample = os.path.basename(barcode_list_path).split("_bc_passing_filters_otsu.txt")[0]
    print(sample)
    df = pd.read_csv(barcode_list_path, header=None, index_col=None)

    raw_path = f"selected_barcodes/{sample}_bc_passing_filters_otsu.RAW.txt"
    with open(raw_path, "w") as raw_file:
        # One barcode per line, keeping only the part before the "___" separator.
        raw_file.writelines(barcode.split("___")[0] + "\n" for barcode in df[0])
        print("Done")

In [17]:
!head selected_barcodes/BIO_ddseq_1.LIBDS_bc_passing_filters_otsu.RAW.txt

head: cannot open 'selected_barcodes/BIO_ddseq_1.LIBDS_bc_passing_filters_otsu.RAW.txt' for reading: No such file or directory
