In [1]:
import pycisTopic
import glob
import os
import pybiomart as pbm
import pandas as pd
import pickle
from pycisTopic.qc import *
from IPython.display import Image, display
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import multiprocess as mp  # for kde multithreading calculation
from multiprocess import Pool

%matplotlib inline
%load_ext lab_black

# Download annotation

In [2]:
!pwd

/lustre1/project/stg_00090/scatac_benchmark/public_downsample_series


In [3]:
wdir = f"/lustre1/project/stg_00090/scatac_benchmark/public_downsample_series"
os.chdir(wdir)

In [4]:
genome = "mm10"

pbm_genome_name_dict = {
    "hg38": "hsapiens_gene_ensembl",
    "hg37": "hsapiens_gene_ensembl",
    "mm10": "mmusculus_gene_ensembl",
    "dm6": "dmelanogaster_gene_ensembl",
}

pbm_host_dict = {
    "hg38": "http://www.ensembl.org",
    "hg37": "http://grch37.ensembl.org/",
    "mm10": "http://nov2020.archive.ensembl.org/",
    "dm6": "http://www.ensembl.org",
}

if os.path.exists(f"../annotation.tsv"):
    print(f"Loading cached genome annotation...")
    annotation = pd.read_csv("../annotation.tsv", sep="\t", header=0, index_col=0)
else:
    dataset = pbm.Dataset(name=pbm_genome_name_dict[genome], host=pbm_host_dict[genome])

    annotation = dataset.query(
        attributes=[
            "chromosome_name",
            "transcription_start_site",
            "strand",
            "external_gene_name",
            "transcript_biotype",
        ]
    )
    filter = annotation["Chromosome/scaffold name"].str.contains("CHR|GL|JH|MT")
    annotation = annotation[~filter]
    annotation["Chromosome/scaffold name"] = annotation[
        "Chromosome/scaffold name"
    ].str.replace(r"(\b\S)", r"chr\1")
    annotation.columns = ["Chromosome", "Start", "Strand", "Gene", "Transcript_type"]
    annotation = annotation[annotation.Transcript_type == "protein_coding"]
    annotation.to_csv("annotation.tsv", sep="\t")

  annotation["Chromosome/scaffold name"] = annotation[


In [5]:
fragments_list = sorted(
    glob.glob(f"preprocessing_out/data/fragments/*fragments.tsv.gz")
)
fragments_dict = {}
for fragments_file in fragments_list:
    sample = fragments_file.split("/")[-1].split(".fragments.tsv.gz")[0]
    fragments_dict[sample] = fragments_file

fragments_dict

{'BIO_ddseq_m1c1.10k': 'preprocessing_out/data/fragments/BIO_ddseq_m1c1.10k.fragments.tsv.gz',
 'BIO_ddseq_m1c1.15k': 'preprocessing_out/data/fragments/BIO_ddseq_m1c1.15k.fragments.tsv.gz',
 'BIO_ddseq_m1c1.20k': 'preprocessing_out/data/fragments/BIO_ddseq_m1c1.20k.fragments.tsv.gz',
 'BIO_ddseq_m1c1.5k': 'preprocessing_out/data/fragments/BIO_ddseq_m1c1.5k.fragments.tsv.gz',
 'BIO_ddseq_m1c2.10k': 'preprocessing_out/data/fragments/BIO_ddseq_m1c2.10k.fragments.tsv.gz',
 'BIO_ddseq_m1c2.15k': 'preprocessing_out/data/fragments/BIO_ddseq_m1c2.15k.fragments.tsv.gz',
 'BIO_ddseq_m1c2.20k': 'preprocessing_out/data/fragments/BIO_ddseq_m1c2.20k.fragments.tsv.gz',
 'BIO_ddseq_m1c2.25k': 'preprocessing_out/data/fragments/BIO_ddseq_m1c2.25k.fragments.tsv.gz',
 'BIO_ddseq_m1c2.30k': 'preprocessing_out/data/fragments/BIO_ddseq_m1c2.30k.fragments.tsv.gz',
 'BIO_ddseq_m1c2.5k': 'preprocessing_out/data/fragments/BIO_ddseq_m1c2.5k.fragments.tsv.gz',
 'BIO_ddseq_m1c3.10k': 'preprocessing_out/data/fragmen

In [6]:
regions_paths_dict = {
    x.split("/")[-1].split(f"__")[0].replace(".FULL", ""): x
    for x in sorted(glob.glob("../public_3_cistopic_qc/final_consensus_peaks/*.bed"))
}
regions_paths_dict

{'BIO_ddseq_m1c1': '../public_3_cistopic_qc/final_consensus_peaks/BIO_ddseq_m1c1.FULL__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_m1c2': '../public_3_cistopic_qc/final_consensus_peaks/BIO_ddseq_m1c2.FULL__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_m1c3': '../public_3_cistopic_qc/final_consensus_peaks/BIO_ddseq_m1c3.FULL__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_m1c4': '../public_3_cistopic_qc/final_consensus_peaks/BIO_ddseq_m1c4.FULL__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_m1c5': '../public_3_cistopic_qc/final_consensus_peaks/BIO_ddseq_m1c5.FULL__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_m1c6': '../public_3_cistopic_qc/final_consensus_peaks/BIO_ddseq_m1c6.FULL__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_m1c7': '../public_3_cistopic_qc/final_consensus_peaks/BIO_ddseq_m1c7.FULL__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_m1c8': '../public_3_cistopic_qc/final_consensus_peaks/BIO_ddseq_m1c8.FULL__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_m2c1': '../public_3_cistopic_qc/final_consensus_peaks/BIO_dd

In [7]:
regions_paths_dict = {x: regions_paths_dict[x.split(".")[0]] for x in fragments_dict}

Now, make a sub dictionary of all samples within the fragments dict that have not been run yet (good for resuming a stopped cistopic run):

In [8]:
fragments_sub_dict = {}
regions_sub_dict = {}
metadata_out_dict = {}
for sample in fragments_dict.keys():
    depth = sample.split(".")[-1]
    metadata_file = (
        fragments_dict[sample]
        .replace(f"preprocessing_out/data/fragments/", "cistopic_qc_out/")
        .replace(".fragments.tsv.gz", "__metadata_bc.pkl")
    )
    # metadata_file = os.path.join(cistopic_qc_out, sample + "__metadata_bc.pkl")
    print(f"Checking if {metadata_file} exist...")
    if os.path.exists(metadata_file):
        print("\tMetadata exists! Skipping...")
    else:
        fragments_sub_dict[sample] = fragments_dict[sample]
        print("\tMetadata does not exist, adding to subdict to generate")
        metadata_out_dict[sample] = metadata_file

Checking if cistopic_qc_out/BIO_ddseq_m1c1.10k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if cistopic_qc_out/BIO_ddseq_m1c1.15k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if cistopic_qc_out/BIO_ddseq_m1c1.20k__metadata_bc.pkl exist...
	Metadata does not exist, adding to subdict to generate
Checking if cistopic_qc_out/BIO_ddseq_m1c1.5k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if cistopic_qc_out/BIO_ddseq_m1c2.10k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if cistopic_qc_out/BIO_ddseq_m1c2.15k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if cistopic_qc_out/BIO_ddseq_m1c2.20k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if cistopic_qc_out/BIO_ddseq_m1c2.25k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if cistopic_qc_out/BIO_ddseq_m1c2.30k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if cistopic_qc_out/BIO_ddseq_m1c2.5k__m

In [9]:
regions_sub_dict = {x: regions_paths_dict[x] for x in sorted(fragments_sub_dict.keys())}

In [10]:
regions_sub_dict

{'BIO_ddseq_m1c1.20k': '../public_3_cistopic_qc/final_consensus_peaks/BIO_ddseq_m1c1.FULL__SCREEN_consensus_peaks.bed',
 'TXG_10xv2_adultmousecortex.50k': '../public_3_cistopic_qc/final_consensus_peaks/TXG_10xv2_adultmousecortex.FULL__SCREEN_consensus_peaks.bed',
 'TXG_10xv2_adultmousecortexchromiumx.50k': '../public_3_cistopic_qc/final_consensus_peaks/TXG_10xv2_adultmousecortexchromiumx.FULL__SCREEN_consensus_peaks.bed'}

In [11]:
set(regions_sub_dict) == set(fragments_sub_dict)
set(regions_sub_dict) - set(fragments_sub_dict)

set()

In [12]:
common = set(fragments_sub_dict).intersection(set(regions_sub_dict))

In [13]:
metadata_out_dict

{'BIO_ddseq_m1c1.20k': 'cistopic_qc_out/BIO_ddseq_m1c1.20k__metadata_bc.pkl',
 'TXG_10xv2_adultmousecortex.50k': 'cistopic_qc_out/TXG_10xv2_adultmousecortex.50k__metadata_bc.pkl',
 'TXG_10xv2_adultmousecortexchromiumx.50k': 'cistopic_qc_out/TXG_10xv2_adultmousecortexchromiumx.50k__metadata_bc.pkl'}

In [14]:
regions_sub_dict = {x: regions_sub_dict[x] for x in list(common)}
fragments_sub_dict = {x: fragments_sub_dict[x] for x in list(common)}

In [15]:
n_cores = 6
if regions_sub_dict != {}:
    samples_sub = list(regions_sub_dict.keys())
    blocks = [samples_sub[i : i + n_cores] for i in range(0, len(samples_sub), n_cores)]
    for samples_torun_in_block in blocks:
        fragments_sub_dict_block = {
            key: fragments_sub_dict[key] for key in samples_torun_in_block
        }
        regions_sub_dict_block = {
            key: regions_sub_dict[key] for key in samples_torun_in_block
        }

        metadata_bc_dict, profile_data_dict = compute_qc_stats(
            fragments_dict=fragments_sub_dict_block,
            tss_annotation=annotation,
            stats=[
                "barcode_rank_plot",
                "duplicate_rate",
                "insert_size_distribution",
                "profile_tss",
                "frip",
            ],
            label_list=None,
            path_to_regions=regions_sub_dict_block,
            n_cpu=n_cores,
            # valid_bc=selected_barcodes_dict,
            n_frag=5,
            # n_bc=None,
            tss_flank_window=2000,
            tss_window=50,
            tss_minimum_signal_window=100,
            tss_rolling_window=1,
            # min_norm=0.2,
            remove_duplicates=True,
        )

        ray.shutdown()
        print(f"Dumping files...")
        for sample in sorted(metadata_bc_dict.keys()):
            metadata_bc_dict[sample]["sample_id"] = sample
            metadata_bc_dict[sample].index = [
                x + "___" + sample for x in list(metadata_bc_dict[sample].index)
            ]
            with open(metadata_out_dict[sample], "wb") as f:
                pickle.dump(metadata_bc_dict[sample], f, protocol=4)

            with open(
                metadata_out_dict[sample].replace(
                    "__metadata_bc.pkl", "__profile_data.pkl"
                ),
                "wb",
            ) as f:
                pickle.dump(profile_data_dict[sample], f, protocol=4)
else:
    print("All samples already processed.")

2022-11-16 18:48:47,686 cisTopic     INFO     n_cpu is larger than the number of samples. Setting n_cpu to the number of samples


2022-11-16 18:48:53,702	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8266 [39m[22m


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:48:57,100 cisTopic     INFO     Reading TXG_10xv2_adultmousecortex.50k
[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:48:57,094 cisTopic     INFO     Reading TXG_10xv2_adultmousecortexchromiumx.50k
[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:48:57,140 cisTopic     INFO     Reading BIO_ddseq_m1c1.20k


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:50:51,737 cisTopic     INFO     Computing barcode rank plot for BIO_ddseq_m1c1.20k
[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:50:51,737 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:51:01,644 cisTopic     INFO     Marking barcodes with more than 5


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:51:01,868 cisTopic     INFO     Returning plot data
[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:51:01,920 cisTopic     INFO     Returning valid barcodes


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:51:05,383 cisTopic     INFO     Computing duplicate rate plot for BIO_ddseq_m1c1.20k


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:51:19,656 cisTopic     INFO     Return plot data
[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:51:19,768 cisTopic     INFO     Computing insert size distribution for BIO_ddseq_m1c1.20k
[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:51:19,769 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:51:20,926 cisTopic     INFO     Returning plot data


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:51:43,182 cisTopic     INFO     Computing TSS profile for BIO_ddseq_m1c1.20k


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:51:49,154 cisTopic     INFO     Formatting annnotation
[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:51:49,204 cisTopic     INFO     Creating coverage matrix


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:53:07,056 cisTopic     INFO     Computing barcode rank plot for TXG_10xv2_adultmousecortexchromiumx.50k
[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:53:07,056 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:53:07,319 cisTopic     INFO     Coverage matrix done


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:53:09,099 cisTopic     INFO     Computing barcode rank plot for TXG_10xv2_adultmousecortex.50k
[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:53:09,100 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:53:16,085 cisTopic     INFO     Marking barcodes with more than 5
[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:53:16,137 cisTopic     INFO     Returning plot data
[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:53:16,179 cisTopic     INFO     Returning valid barcodes


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:53:20,872 cisTopic     INFO     Marking barcodes with more than 5
[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:53:20,922 cisTopic     INFO     Returning plot data


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:53:21,029 cisTopic     INFO     Returning valid barcodes


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:53:29,127 cisTopic     INFO     Computing duplicate rate plot for TXG_10xv2_adultmousecortexchromiumx.50k


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:53:34,468 cisTopic     INFO     Computing duplicate rate plot for TXG_10xv2_adultmousecortex.50k


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:53:42,065 cisTopic     INFO     Return plot data


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:53:42,238 cisTopic     INFO     Computing insert size distribution for TXG_10xv2_adultmousecortexchromiumx.50k
[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:53:42,238 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:53:46,081 cisTopic     INFO     Returning plot data


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:53:48,126 cisTopic     INFO     Return plot data
[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:53:48,272 cisTopic     INFO     Computing insert size distribution for TXG_10xv2_adultmousecortex.50k
[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:53:48,272 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:53:52,220 cisTopic     INFO     Returning plot data


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:55:02,657 cisTopic     INFO     Returning normalized TSS coverage matrix per barcode


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:55:28,190 cisTopic     INFO     Computing TSS profile for TXG_10xv2_adultmousecortex.50k


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:55:36,084 cisTopic     INFO     Computing TSS profile for TXG_10xv2_adultmousecortexchromiumx.50k


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:55:42,990 cisTopic     INFO     Formatting annnotation
[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:55:43,040 cisTopic     INFO     Creating coverage matrix


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:55:53,908 cisTopic     INFO     Formatting annnotation
[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:55:53,975 cisTopic     INFO     Creating coverage matrix


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:56:05,388 cisTopic     INFO     Returning normalized sample TSS enrichment data


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:56:05,630 cisTopic     INFO     Computing FRIP profile for BIO_ddseq_m1c1.20k


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:56:06,386 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:56:14,494 cisTopic     INFO     Intersecting fragments with regions


[2m[36m(compute_qc_stats_ray pid=12888)[0m 2022-11-16 18:56:37,564 cisTopic     INFO     Sample BIO_ddseq_m1c1.20k done!


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 18:58:31,135 cisTopic     INFO     Coverage matrix done


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 18:58:55,299 cisTopic     INFO     Coverage matrix done


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 19:00:55,740 cisTopic     INFO     Returning normalized TSS coverage matrix per barcode


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 19:01:35,230 cisTopic     INFO     Returning normalized TSS coverage matrix per barcode


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 19:02:22,139 cisTopic     INFO     Returning normalized sample TSS enrichment data


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 19:02:22,450 cisTopic     INFO     Computing FRIP profile for TXG_10xv2_adultmousecortex.50k


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 19:02:23,999 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 19:02:53,985 cisTopic     INFO     Intersecting fragments with regions


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 19:02:57,305 cisTopic     INFO     Returning normalized sample TSS enrichment data


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 19:02:57,525 cisTopic     INFO     Computing FRIP profile for TXG_10xv2_adultmousecortexchromiumx.50k


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 19:02:58,822 cisTopic     INFO     Counting fragments


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 19:03:24,486 cisTopic     INFO     Intersecting fragments with regions


[2m[36m(compute_qc_stats_ray pid=12890)[0m 2022-11-16 19:03:59,334 cisTopic     INFO     Sample TXG_10xv2_adultmousecortex.50k done!


[2m[36m(compute_qc_stats_ray pid=12889)[0m 2022-11-16 19:04:25,847 cisTopic     INFO     Sample TXG_10xv2_adultmousecortexchromiumx.50k done!


Dumping files...
