In [1]:
import pycisTopic
import glob
import os
import pybiomart as pbm
import pandas as pd
import pickle
from pycisTopic.qc import *
from IPython.display import Image, display
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import multiprocess as mp  # for kde multithreading calculation
from multiprocess import Pool

%matplotlib inline
%load_ext lab_black

# Download annotation

In [2]:
!pwd

/dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/25k


In [3]:
# The depth label is the name of the directory the notebook is running in.
depth_str = os.path.basename(os.getcwd())

In [4]:
# Absolute working directory for this depth; all relative paths used below
# are resolved against it after the chdir.
# NOTE(review): depth_str was itself derived from the current directory, so
# this only lands in the intended place when the notebook is started from
# inside that directory — confirm.
wdir = f"/dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/{depth_str}"
os.chdir(wdir)

In [5]:
# Genome build whose TSS annotation is loaded into `annotation`.
genome = "hg38"

# Genome build -> BioMart dataset name passed to pybiomart.
pbm_genome_name_dict = {
    "hg38": "hsapiens_gene_ensembl",
    "hg37": "hsapiens_gene_ensembl",
    "mm10": "mmusculus_gene_ensembl",
    "dm6": "dmelanogaster_gene_ensembl",
}

# Genome build -> BioMart host passed to pybiomart.
pbm_host_dict = {
    "hg38": "http://www.ensembl.org",
    "hg37": "http://grch37.ensembl.org/",
    "mm10": "http://nov2020.archive.ensembl.org/",
    "dm6": "http://www.ensembl.org",
}

# One cache location for the existence check, the read AND the write.
# The original checked/read "../annotation.tsv" but wrote "annotation.tsv",
# so a freshly downloaded annotation was never found again by this cell.
annotation_cache = "../annotation.tsv"

if os.path.exists(annotation_cache):
    print("Loading cached genome annotation...")
    annotation = pd.read_csv(annotation_cache, sep="\t", header=0, index_col=0)
else:
    dataset = pbm.Dataset(name=pbm_genome_name_dict[genome], host=pbm_host_dict[genome])

    annotation = dataset.query(
        attributes=[
            "chromosome_name",
            "transcription_start_site",
            "strand",
            "external_gene_name",
            "transcript_biotype",
        ]
    )
    # Drop rows whose chromosome name contains CHR, GL, JH or MT.
    # (Named mask instead of `filter`, which shadowed the builtin.)
    unwanted_contig = annotation["Chromosome/scaffold name"].str.contains(
        "CHR|GL|JH|MT"
    )
    # .copy() so the column assignment below acts on an independent frame
    # rather than on a slice of the query result.
    annotation = annotation[~unwanted_contig].copy()
    # regex=True is required: the pattern uses a capture group, and pandas >= 2
    # treats the pattern as a literal string unless told otherwise.
    annotation["Chromosome/scaffold name"] = annotation[
        "Chromosome/scaffold name"
    ].str.replace(r"(\b\S)", r"chr\1", regex=True)
    annotation.columns = ["Chromosome", "Start", "Strand", "Gene", "Transcript_type"]
    annotation = annotation[annotation.Transcript_type == "protein_coding"]
    annotation.to_csv(annotation_cache, sep="\t")

Loading cached genome annotation...


In [6]:
# All fragments files of this depth, in sorted path order.
fragments_list = sorted(
    glob.glob(f"{depth_str}_preprocessing_out/data/fragments/*fragments.tsv.gz")
)

# Sample name (file name up to ".fragments.tsv.gz") -> fragments file path.
fragments_dict = {
    path.split("/")[-1].split(".fragments.tsv.gz")[0]: path for path in fragments_list
}

fragments_dict

{'BIO_ddseq_1.25k': '25k_preprocessing_out/data/fragments/BIO_ddseq_1.25k.fragments.tsv.gz',
 'BIO_ddseq_2.25k': '25k_preprocessing_out/data/fragments/BIO_ddseq_2.25k.fragments.tsv.gz',
 'BIO_ddseq_3.25k': '25k_preprocessing_out/data/fragments/BIO_ddseq_3.25k.fragments.tsv.gz',
 'BIO_ddseq_4.25k': '25k_preprocessing_out/data/fragments/BIO_ddseq_4.25k.fragments.tsv.gz',
 'BRO_mtscatac_1.25k': '25k_preprocessing_out/data/fragments/BRO_mtscatac_1.25k.fragments.tsv.gz',
 'BRO_mtscatac_2.25k': '25k_preprocessing_out/data/fragments/BRO_mtscatac_2.25k.fragments.tsv.gz',
 'CNA_10xmultiome_1.25k': '25k_preprocessing_out/data/fragments/CNA_10xmultiome_1.25k.fragments.tsv.gz',
 'CNA_10xmultiome_2.25k': '25k_preprocessing_out/data/fragments/CNA_10xmultiome_2.25k.fragments.tsv.gz',
 'CNA_10xv11_1.25k': '25k_preprocessing_out/data/fragments/CNA_10xv11_1.25k.fragments.tsv.gz',
 'CNA_10xv11_2.25k': '25k_preprocessing_out/data/fragments/CNA_10xv11_2.25k.fragments.tsv.gz',
 'CNA_10xv11_3.25k': '25k_prep

In [7]:
# Sample name -> consensus peak BED file. The sample name is the part of the
# file name before "__", with the "FIXEDCELLS" placeholder swapped for the
# depth label of this run.
regions_paths_dict = {}
for bed_path in sorted(glob.glob("../SCREEN_peaks/*consensus_peaks.bed")):
    bed_sample = bed_path.split("/")[-1].split("__")[0]
    regions_paths_dict[bed_sample.replace("FIXEDCELLS", depth_str)] = bed_path
regions_paths_dict

{'BIO_ddseq_1.25k': '../SCREEN_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_2.25k': '../SCREEN_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_3.25k': '../SCREEN_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_4.25k': '../SCREEN_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_1.25k': '../SCREEN_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_2.25k': '../SCREEN_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_1.25k': '../SCREEN_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_2.25k': '../SCREEN_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_1.25k': '../SCREEN_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_2.25k': '../SCREEN_peaks/CNA_10xv11_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_3.25k': '../SCREEN_peaks/CNA_10xv11_3.FIXEDCELL

In [8]:
# Alias entries: VIB_hydrop_11/12 reuse the regions file registered for
# VIB_hydrop_1, and VIB_hydrop_21/22 the one registered for VIB_hydrop_2.
for alias_id, source_id in (("11", "1"), ("12", "1"), ("21", "2"), ("22", "2")):
    regions_paths_dict[f"VIB_hydrop_{alias_id}." + depth_str] = regions_paths_dict[
        f"VIB_hydrop_{source_id}." + depth_str
    ]

In [9]:
regions_paths_dict

{'BIO_ddseq_1.25k': '../SCREEN_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_2.25k': '../SCREEN_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_3.25k': '../SCREEN_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_4.25k': '../SCREEN_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_1.25k': '../SCREEN_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_2.25k': '../SCREEN_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_1.25k': '../SCREEN_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_2.25k': '../SCREEN_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_1.25k': '../SCREEN_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_2.25k': '../SCREEN_peaks/CNA_10xv11_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_3.25k': '../SCREEN_peaks/CNA_10xv11_3.FIXEDCELL

Now, build a sub-dictionary of all samples in the fragments dict that have not been run yet (useful for resuming a stopped cisTopic run):

In [10]:
# Directory that receives the per-sample QC pickles written by this notebook.
cistopic_qc_out = os.path.join(wdir, "cistopic_qc_out_CONSENSUS")
# exist_ok=True keeps the cell re-runnable and removes the gap between
# checking for the directory and creating it.
os.makedirs(cistopic_qc_out, exist_ok=True)

In [11]:
# Keep only the samples that still lack a metadata pickle in the output
# directory; that pickle is what marks a sample as already processed.
fragments_sub_dict = {}
regions_sub_dict = {}
for sample, fragments_path in fragments_dict.items():
    metadata_file = os.path.join(cistopic_qc_out, sample + "__metadata_bc.pkl")
    print(f"Checking if {metadata_file} exist...")
    if not os.path.exists(metadata_file):
        fragments_sub_dict[sample] = fragments_path
        print("\tMetadata does not exist, adding to subdict to generate")
        continue
    print("\tMetadata exists! Skipping...")

Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/25k/cistopic_qc_out_CONSENSUS/BIO_ddseq_1.25k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/25k/cistopic_qc_out_CONSENSUS/BIO_ddseq_2.25k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/25k/cistopic_qc_out_CONSENSUS/BIO_ddseq_3.25k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/25k/cistopic_qc_out_CONSENSUS/BIO_ddseq_4.25k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/25k/cistopic_qc_out_CONSENSUS/BRO_mtscatac_1.25k__metadata_bc.pkl exist...
	Metadata exists!

	Metadata exists! Skipping...
Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/25k/cistopic_qc_out_CONSENSUS/CNA_10xmultiome_1.25k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/25k/cistopic_qc_out_CONSENSUS/CNA_10xmultiome_2.25k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/25k/cistopic_qc_out_CONSENSUS/CNA_10xv11_1.25k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/25k/cistopic_qc_out_CONSENSUS/CNA_10xv11_2.25k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/25k/cistopic_qc_out_CONSENSUS/CNA_10xv11_3.25k__

In [12]:
# Copy of the full regions mapping with its keys in sorted order; it is
# narrowed to the samples that still need processing further down.
regions_sub_dict = dict(sorted(regions_paths_dict.items()))

In [13]:
regions_sub_dict

{'BIO_ddseq_1.25k': '../SCREEN_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_2.25k': '../SCREEN_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_3.25k': '../SCREEN_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_4.25k': '../SCREEN_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_1.25k': '../SCREEN_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_2.25k': '../SCREEN_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_1.25k': '../SCREEN_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_2.25k': '../SCREEN_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_1.25k': '../SCREEN_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_2.25k': '../SCREEN_peaks/CNA_10xv11_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_3.25k': '../SCREEN_peaks/CNA_10xv11_3.FIXEDCELL

In [14]:
# A cell only echoes its last expression, so the equality check has to be
# displayed explicitly or its result is silently dropped.
display(set(regions_sub_dict) == set(fragments_sub_dict))
# Samples that have a regions file but are not queued for processing.
set(regions_sub_dict) - set(fragments_sub_dict)

{'BIO_ddseq_1.25k',
 'BIO_ddseq_2.25k',
 'BIO_ddseq_3.25k',
 'BIO_ddseq_4.25k',
 'BRO_mtscatac_1.25k',
 'BRO_mtscatac_2.25k',
 'CNA_10xmultiome_1.25k',
 'CNA_10xmultiome_2.25k',
 'CNA_10xv11_1.25k',
 'CNA_10xv11_2.25k',
 'CNA_10xv11_3.25k',
 'CNA_10xv11_4.25k',
 'CNA_10xv11_5.25k',
 'CNA_10xv2_1.25k',
 'CNA_10xv2_2.25k',
 'CNA_hydrop_1.25k',
 'CNA_hydrop_2.25k',
 'CNA_hydrop_3.25k',
 'CNA_mtscatac_1.25k',
 'CNA_mtscatac_2.25k',
 'EPF_hydrop_1.25k',
 'EPF_hydrop_2.25k',
 'EPF_hydrop_3.25k',
 'EPF_hydrop_4.25k',
 'HAR_ddseq_1.25k',
 'HAR_ddseq_2.25k',
 'MDC_mtscatac_1.25k',
 'MDC_mtscatac_2.25k',
 'OHS_s3atac_1.25k',
 'OHS_s3atac_2.25k',
 'SAN_10xmultiome_1.25k',
 'SAN_10xmultiome_2.25k',
 'STA_10xv11_1.25k',
 'STA_10xv11_2.25k',
 'UCS_ddseq_1.25k',
 'UCS_ddseq_2.25k',
 'VIB_10xmultiome_1.25k',
 'VIB_10xmultiome_2.25k',
 'VIB_10xv1_1.25k',
 'VIB_10xv1_2.25k',
 'VIB_10xv2_1.25k',
 'VIB_10xv2_2.25k',
 'VIB_hydrop_1.25k',
 'VIB_hydrop_11.25k',
 'VIB_hydrop_12.25k',
 'VIB_hydrop_2.25k',
 'VI

In [15]:
# Samples that are both still to be processed and have a regions file.
common = set(fragments_sub_dict) & set(regions_sub_dict)

In [16]:
len(common)

3

In [17]:
# Narrow both mappings to the shared samples so they carry identical keys.
regions_sub_dict = {sample_id: regions_sub_dict[sample_id] for sample_id in common}
fragments_sub_dict = {
    sample_id: fragments_sub_dict[sample_id] for sample_id in common
}

In [18]:
# Shut down Ray before the QC run in the next cell.
# NOTE(review): `ray` is never imported by name in this notebook; presumably
# it is pulled in by `from pycisTopic.qc import *` — confirm, or add an
# explicit `import ray` to the import cell.
ray.shutdown()

In [19]:
# Run the QC statistics for every sample still to be processed, in blocks of
# at most `n_cores` samples, and pickle the results per sample.
n_cores = 16
if regions_sub_dict:
    samples_sub = list(regions_sub_dict.keys())
    blocks = [samples_sub[i : i + n_cores] for i in range(0, len(samples_sub), n_cores)]
    for samples_torun_in_block in blocks:
        # Per-block views of the two mappings; both carry the same keys.
        fragments_sub_dict_block = {
            key: fragments_sub_dict[key] for key in samples_torun_in_block
        }
        regions_sub_dict_block = {
            key: regions_sub_dict[key] for key in samples_torun_in_block
        }

        metadata_bc_dict, profile_data_dict = compute_qc_stats(
            fragments_dict=fragments_sub_dict_block,
            tss_annotation=annotation,
            stats=[
                "barcode_rank_plot",
                "duplicate_rate",
                "insert_size_distribution",
                "profile_tss",
                "frip",
            ],
            label_list=None,
            path_to_regions=regions_sub_dict_block,
            n_cpu=n_cores,
            valid_bc=None,
            n_frag=10,
            n_bc=None,
            tss_flank_window=2000,
            tss_window=50,
            tss_minimum_signal_window=100,
            tss_rolling_window=10,
            # min_norm=0.2,
            remove_duplicates=True,
        )

        ray.shutdown()
        print(f"Dumping files in {cistopic_qc_out}...")
        for sample in sorted(metadata_bc_dict.keys()):
            metadata_bc = metadata_bc_dict[sample]
            metadata_bc["sample_id"] = sample
            # Suffix every barcode with "___<sample>" so barcodes stay unique
            # once several samples are combined.
            metadata_bc.index = [x + "___" + sample for x in list(metadata_bc.index)]

            # Write the profile data first and the metadata last: the resume
            # check earlier in this notebook treats "<sample>__metadata_bc.pkl"
            # as the marker that a sample is finished, so it must only appear
            # once everything else for that sample is on disk.
            with open(
                os.path.join(cistopic_qc_out, f"{sample}__profile_data.pkl"), "wb"
            ) as f:
                pickle.dump(profile_data_dict[sample], f, protocol=4)

            with open(
                os.path.join(cistopic_qc_out, f"{sample}__metadata_bc.pkl"), "wb"
            ) as f:
                pickle.dump(metadata_bc, f, protocol=4)
else:
    print("All samples already processed.")

2022-09-15 12:40:02,679 cisTopic     INFO     n_cpu is larger than the number of samples. Setting n_cpu to the number of samples


2022-09-15 12:40:06,738	INFO services.py:1470 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:40:17,810 cisTopic     INFO     Reading TXG_10xv11_1.25k
[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:40:17,826 cisTopic     INFO     Reading TXG_10xv2_1.25k
[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:40:17,811 cisTopic     INFO     Reading TXG_10xv2_2.25k


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:45:06,561 cisTopic     INFO     Computing barcode rank plot for TXG_10xv2_2.25k
[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:45:06,561 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:45:09,702 cisTopic     INFO     Computing barcode rank plot for TXG_10xv2_1.25k
[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:45:09,703 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:45:22,822 cisTopic     INFO     Marking barcodes with more than 10


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:45:23,107 cisTopic     INFO     Returning plot data
[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:45:23,147 cisTopic     INFO     Returning valid barcodes


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:45:25,848 cisTopic     INFO     Marking barcodes with more than 10


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:45:26,123 cisTopic     INFO     Returning plot data
[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:45:26,159 cisTopic     INFO     Returning valid barcodes


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:45:35,298 cisTopic     INFO     Computing barcode rank plot for TXG_10xv11_1.25k
[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:45:35,299 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:45:39,715 cisTopic     INFO     Computing duplicate rate plot for TXG_10xv2_2.25k


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:45:43,104 cisTopic     INFO     Computing duplicate rate plot for TXG_10xv2_1.25k


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:45:52,886 cisTopic     INFO     Marking barcodes with more than 10


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:45:53,202 cisTopic     INFO     Returning plot data
[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:45:53,238 cisTopic     INFO     Returning valid barcodes


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:46:04,596 cisTopic     INFO     Return plot data


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:46:04,838 cisTopic     INFO     Computing insert size distribution for TXG_10xv2_2.25k
[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:46:04,838 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:46:07,947 cisTopic     INFO     Return plot data


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:46:08,187 cisTopic     INFO     Computing insert size distribution for TXG_10xv2_1.25k
[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:46:08,187 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:46:10,432 cisTopic     INFO     Computing duplicate rate plot for TXG_10xv11_1.25k


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:46:11,872 cisTopic     INFO     Returning plot data


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:46:15,239 cisTopic     INFO     Returning plot data


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:46:37,777 cisTopic     INFO     Return plot data


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:46:38,031 cisTopic     INFO     Computing insert size distribution for TXG_10xv11_1.25k
[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:46:38,031 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:46:45,146 cisTopic     INFO     Returning plot data


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:48:21,245 cisTopic     INFO     Computing TSS profile for TXG_10xv2_2.25k


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:48:25,868 cisTopic     INFO     Computing TSS profile for TXG_10xv2_1.25k


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:48:52,457 cisTopic     INFO     Formatting annnotation


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:48:52,710 cisTopic     INFO     Creating coverage matrix


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:48:57,395 cisTopic     INFO     Formatting annnotation


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:48:57,584 cisTopic     INFO     Creating coverage matrix


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:49:00,870 cisTopic     INFO     Computing TSS profile for TXG_10xv11_1.25k


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:49:43,063 cisTopic     INFO     Formatting annnotation


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:49:43,250 cisTopic     INFO     Creating coverage matrix


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 12:57:22,865 cisTopic     INFO     Coverage matrix done


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 12:59:03,492 cisTopic     INFO     Coverage matrix done


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 12:59:05,395 cisTopic     INFO     Coverage matrix done


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 13:01:44,824 cisTopic     INFO     Returning normalized TSS coverage matrix per barcode


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 13:02:50,665 cisTopic     INFO     Returning normalized TSS coverage matrix per barcode


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 13:02:52,613 cisTopic     INFO     Returning normalized TSS coverage matrix per barcode


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 13:04:36,027 cisTopic     INFO     Returning normalized sample TSS enrichment data


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 13:04:36,283 cisTopic     INFO     Computing FRIP profile for TXG_10xv11_1.25k


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 13:04:38,319 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 13:05:00,923 cisTopic     INFO     Intersecting fragments with regions


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 13:05:21,811 cisTopic     INFO     Returning normalized sample TSS enrichment data


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 13:05:22,073 cisTopic     INFO     Computing FRIP profile for TXG_10xv2_1.25k


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 13:05:25,032 cisTopic     INFO     Returning normalized sample TSS enrichment data


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 13:05:25,298 cisTopic     INFO     Computing FRIP profile for TXG_10xv2_2.25k
[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 13:05:25,391 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 13:05:28,361 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 13:05:48,156 cisTopic     INFO     Intersecting fragments with regions


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 13:05:50,307 cisTopic     INFO     Intersecting fragments with regions


[2m[36m(compute_qc_stats_ray pid=903755)[0m 2022-09-15 13:06:30,176 cisTopic     INFO     Sample TXG_10xv11_1.25k done!


[2m[36m(compute_qc_stats_ray pid=903756)[0m 2022-09-15 13:07:31,819 cisTopic     INFO     Sample TXG_10xv2_2.25k done!


[2m[36m(compute_qc_stats_ray pid=903757)[0m 2022-09-15 13:07:32,456 cisTopic     INFO     Sample TXG_10xv2_1.25k done!


Dumping files in /dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/downsample_series/25k/cistopic_qc_out_CONSENSUS...


# Plot

Calculating a KDE is expensive and scales poorly with increasing n. Therefore, I wrote a parallelised (multiprocess) script that divides the QC array into equal parts (interleaved to avoid biases in the order!) and performs a KDE calculation on each part. Otsu thresholding is then used to find the thresholds for minimum fragments and minimum TSS enrichment. ddseq samples have a significantly higher noise floor than the other samples when it comes to fragment distribution. Therefore, the Otsu algorithm is only applied to barcodes above a per-sample minimum fragment count: 300 fragments for the ddseq and hydrop samples (600 for BIO_ddseq_1), and 100 fragments for all other samples. I tried to perform this filtering completely independently of sample/technique (e.g. using Gaussian mixture modeling, Jenks natural breaks, or multiple-step Otsu thresholding) but found that no solution worked perfectly for all samples.

This is regulated by the code below in qc_plots.py:
```
min_otsu_frags_dict = {}
for fragments_file in fragments_list:
    sample = fragments_file.split("/")[-1].split(".")[0]
    tech = sample.split('_')[1]
    if tech == "ddseq":
        if sample == "BIO_ddseq_1":
            min_otsu_frags_dict[sample] = 600
        else:
            min_otsu_frags_dict[sample] = 300
    elif tech == "hydrop":
        min_otsu_frags_dict[sample] = 300
    else:
        min_otsu_frags_dict[sample] = 100
```

In [20]:
# Print the external plotting script for reference.
# NOTE(review): the recorded output reports this relative path as missing
# from the working directory — confirm where qc_plots.py lives relative to
# the depth directory.
!cat ../0_resources/scripts/qc_plots.py

cat: ../0_resources/scripts/qc_plots.py: No such file or directory


Since multiprocessing does not work inside Jupyter notebooks, run the following in a terminal:

```
mkdir plots_qc
mkdir selected_barcodes
SIF=../0_resources/cistopic_image/20220722_pycistopic.sif
singularity exec \
    --cleanenv \
    -H $PWD:/home \
    $SIF \
    python ../0_resources/scripts/qc_plots.py
```

And then open the plots:

In [21]:
# Look for the metadata pickles in the directory this notebook writes them to
# (`cistopic_qc_out`). The original globbed the literal "cistopic_qc_out/",
# which is a different directory from the "cistopic_qc_out_CONSENSUS" output
# folder used above.
# NOTE(review): if reading the non-consensus QC output was intended here,
# revert to the literal path.
metadata_bc_pkl_list = sorted(
    glob.glob(os.path.join(cistopic_qc_out, "*metadata_bc.pkl"))
)

# Sample name (file name up to "__") -> metadata pickle path.
metadata_bc_pkl_path_dict = {}
for metadata_bc_pkl_path in metadata_bc_pkl_list:
    sample = metadata_bc_pkl_path.split("/")[-1].split("__")[0]
    metadata_bc_pkl_path_dict[sample] = metadata_bc_pkl_path

# Show the QC plot of every sample whose selected-barcode pickle exists;
# otherwise point the reader at the external script.
for sample in metadata_bc_pkl_path_dict.keys():
    if os.path.exists(f"selected_barcodes/{sample}_bc_passing_filters_otsu.pkl"):
        print(f"{sample} bc passing filters exists, printing img and skipping")
        display(Image(f"plots_qc/{sample}_qc_otsu.png"))
    else:
        print(
            f"{sample} bc passing filters does not exist yet, generate using qc_plots.py script!"
        )

# Write raw barcode file

Freemuxlet reads barcodes from the BAM file. These barcodes (tag: DB) do not carry the `___<sample>` suffix that is present in the `*_bc_passing_filters_otsu.txt` files, so the suffix is stripped below:

In [22]:
!head selected_barcodes/BIO_ddseq_1.LIBDS_bc_passing_filters_otsu.txt

head: cannot open 'selected_barcodes/BIO_ddseq_1.LIBDS_bc_passing_filters_otsu.txt' for reading: No such file or directory


In [23]:
# For every selected-barcode list, write a sibling ".RAW.txt" file holding the
# same barcodes with everything from "___" onwards removed.
for txt_path in glob.glob("selected_barcodes/*_bc_passing_filters_otsu.txt"):
    print(txt_path)
    sample = txt_path.split("/")[-1].split("_bc_passing_filters_otsu.txt")[0]
    print(sample)
    df = pd.read_csv(txt_path, header=None, index_col=None)

    raw_path = f"selected_barcodes/{sample}_bc_passing_filters_otsu.RAW.txt"
    with open(raw_path, "w") as fp:
        # one barcode per line
        fp.writelines(barcode.split("___")[0] + "\n" for barcode in df[0])
        print("Done")

In [24]:
!head selected_barcodes/BIO_ddseq_1.LIBDS_bc_passing_filters_otsu.RAW.txt

head: cannot open 'selected_barcodes/BIO_ddseq_1.LIBDS_bc_passing_filters_otsu.RAW.txt' for reading: No such file or directory
