In [31]:
import pycisTopic
import glob
import os
import pybiomart as pbm
import pandas as pd
import pickle
from pycisTopic.qc import *
from IPython.display import Image, display
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import multiprocess as mp  # for kde multithreading calculation
from multiprocess import Pool

%matplotlib inline
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


# Download annotation

In [32]:
# Shell escape: print the kernel's current working directory.
!pwd

/lustre1/project/stg_00090/scatac_benchmark/fixedcells_downsample_series


In [33]:
# Make the downsample-series project directory the working directory; the
# relative glob patterns and output paths used by later cells resolve against it.
wdir = "/lustre1/project/stg_00090/scatac_benchmark/fixedcells_downsample_series"
os.chdir(wdir)

In [34]:
# Genome assembly whose TSS annotation is loaded or downloaded below.
genome = "hg38"

# BioMart dataset name to query for each supported assembly key.
pbm_genome_name_dict = {
    "hg38": "hsapiens_gene_ensembl",
    "hg37": "hsapiens_gene_ensembl",
    "mm10": "mmusculus_gene_ensembl",
    "dm6": "dmelanogaster_gene_ensembl",
}

# Ensembl host to query for each supported assembly key.
pbm_host_dict = {
    "hg38": "http://www.ensembl.org",
    "hg37": "http://grch37.ensembl.org/",
    "mm10": "http://nov2020.archive.ensembl.org/",
    "dm6": "http://www.ensembl.org",
}

# One cache location for the existence check, the read AND the write.
# Previously the cache was looked up in "../annotation.tsv" but written to
# "annotation.tsv", so a freshly written cache was never found on the next run.
annotation_path = "../annotation.tsv"

if os.path.exists(annotation_path):
    print("Loading cached genome annotation...")
    annotation = pd.read_csv(annotation_path, sep="\t", header=0, index_col=0)
else:
    dataset = pbm.Dataset(name=pbm_genome_name_dict[genome], host=pbm_host_dict[genome])

    annotation = dataset.query(
        attributes=[
            "chromosome_name",
            "transcription_start_site",
            "strand",
            "external_gene_name",
            "transcript_biotype",
        ]
    )
    # Drop rows whose chromosome/scaffold name contains CHR, GL, JH or MT.
    # (Named `is_excluded` rather than `filter` so the builtin is not shadowed.)
    is_excluded = annotation["Chromosome/scaffold name"].str.contains("CHR|GL|JH|MT")
    # .copy() so the column assignment below works on an independent frame and
    # does not trigger a chained-assignment warning.
    annotation = annotation[~is_excluded].copy()
    # Prefix every name with "chr". A plain prefix replaces the former
    # str.replace(r"(\b\S)", r"chr\1") call, which relied on the pattern being
    # treated as a regex (no longer the default in pandas >= 2.0) and, being
    # unanchored, also inserted "chr" inside names containing a ".".
    annotation["Chromosome/scaffold name"] = "chr" + annotation[
        "Chromosome/scaffold name"
    ].astype(str)
    # Rename positionally; the order matches the attribute list queried above.
    annotation.columns = ["Chromosome", "Start", "Strand", "Gene", "Transcript_type"]
    annotation = annotation[annotation.Transcript_type == "protein_coding"]
    annotation.to_csv(annotation_path, sep="\t")

  annotation["Chromosome/scaffold name"] = annotation[


In [35]:
# Find every fragments file below the per-depth directories and key each path
# by its file name with the ".fragments.tsv.gz" suffix stripped.
fragments_list = sorted(
    glob.glob("*k/*_preprocessing_out/data/fragments/*fragments.tsv.gz")
)
fragments_dict = {
    path.split("/")[-1].split(".fragments.tsv.gz")[0]: path
    for path in fragments_list
}

fragments_dict

{'BIO_ddseq_1.10k': '10k/10k_preprocessing_out/data/fragments/BIO_ddseq_1.10k.fragments.tsv.gz',
 'BIO_ddseq_2.10k': '10k/10k_preprocessing_out/data/fragments/BIO_ddseq_2.10k.fragments.tsv.gz',
 'BIO_ddseq_3.10k': '10k/10k_preprocessing_out/data/fragments/BIO_ddseq_3.10k.fragments.tsv.gz',
 'BIO_ddseq_4.10k': '10k/10k_preprocessing_out/data/fragments/BIO_ddseq_4.10k.fragments.tsv.gz',
 'BRO_mtscatac_1.10k': '10k/10k_preprocessing_out/data/fragments/BRO_mtscatac_1.10k.fragments.tsv.gz',
 'BRO_mtscatac_2.10k': '10k/10k_preprocessing_out/data/fragments/BRO_mtscatac_2.10k.fragments.tsv.gz',
 'CNA_10xmultiome_1.10k': '10k/10k_preprocessing_out/data/fragments/CNA_10xmultiome_1.10k.fragments.tsv.gz',
 'CNA_10xmultiome_2.10k': '10k/10k_preprocessing_out/data/fragments/CNA_10xmultiome_2.10k.fragments.tsv.gz',
 'CNA_10xv11_1.10k': '10k/10k_preprocessing_out/data/fragments/CNA_10xv11_1.10k.fragments.tsv.gz',
 'CNA_10xv11_2.10k': '10k/10k_preprocessing_out/data/fragments/CNA_10xv11_2.10k.fragments

In [36]:
# Map each sample name to its consensus-peak BED file. The sample name is the
# part of the file name before "__", with the ".FIXEDCELLS" tag removed.
regions_paths_dict = {}
for bed_path in sorted(glob.glob("SCREEN_peaks/*consensus_peaks.bed")):
    bed_name = bed_path.split("/")[-1]
    sample_name = bed_name.split("__")[0].replace(".FIXEDCELLS", "")
    regions_paths_dict[sample_name] = bed_path
regions_paths_dict

{'BIO_ddseq_1': 'SCREEN_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_2': 'SCREEN_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_3': 'SCREEN_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_4': 'SCREEN_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_1': 'SCREEN_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_2': 'SCREEN_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_1': 'SCREEN_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_2': 'SCREEN_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_1': 'SCREEN_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_2': 'SCREEN_peaks/CNA_10xv11_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_3': 'SCREEN_peaks/CNA_10xv11_3.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_4': 'SCREEN_peaks/CNA_10xv11_4.F

In [37]:
# Register additional sample names that reuse the peak file of an existing
# sample: (new name, existing sample whose peak file it shares).
for alias, source in (
    ("VIB_hydrop_11", "VIB_hydrop_1"),
    ("VIB_hydrop_12", "VIB_hydrop_1"),
    ("VIB_hydrop_21", "VIB_hydrop_2"),
    ("VIB_hydrop_22", "VIB_hydrop_2"),
):
    regions_paths_dict[alias] = regions_paths_dict[source]

In [38]:
# Re-key the peak-file lookup by the keys of fragments_dict: each key is mapped
# to the peak file registered under the part of the key before the first ".".
# A key that is already present is reused as-is, so re-running this cell after
# the dictionary has been re-keyed no longer raises KeyError.
regions_paths_dict = {
    x: regions_paths_dict[x if x in regions_paths_dict else x.split(".")[0]]
    for x in fragments_dict
}

Now build sub-dictionaries containing only the samples in the fragments dict whose QC metadata has not been generated yet (useful for resuming an interrupted cisTopic run):

In [39]:
# Collect the samples whose barcode-metadata pickle does not exist yet.
#   fragments_sub_dict: sample -> fragments file still to be processed
#   metadata_out_dict:  sample -> metadata pickle path to be written for it
# regions_sub_dict is only reset here; it is filled in a later cell.
fragments_sub_dict = {}
regions_sub_dict = {}
metadata_out_dict = {}
for sample, fragments_path in fragments_dict.items():
    # The text after the last "." of the sample key names the depth directory.
    depth = sample.split(".")[-1]
    fragments_subdir = f"{depth}_preprocessing_out/data/fragments/"
    # Derive the expected pickle path from the fragments path: swap the
    # fragments sub-directory for the QC output directory, then swap the suffix.
    metadata_file = fragments_path.replace(
        fragments_subdir, "cistopic_qc_out_CONSENSUS/"
    ).replace(".fragments.tsv.gz", "__metadata_bc.pkl")
    print(f"Checking if {metadata_file} exist...")
    if not os.path.exists(metadata_file):
        fragments_sub_dict[sample] = fragments_path
        print("\tMetadata does not exist, adding to subdict to generate")
        metadata_out_dict[sample] = metadata_file
    else:
        print("\tMetadata exists! Skipping...")

Checking if 10k/cistopic_qc_out_CONSENSUS/BIO_ddseq_1.10k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if 10k/cistopic_qc_out_CONSENSUS/BIO_ddseq_2.10k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if 10k/cistopic_qc_out_CONSENSUS/BIO_ddseq_3.10k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if 10k/cistopic_qc_out_CONSENSUS/BIO_ddseq_4.10k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if 10k/cistopic_qc_out_CONSENSUS/BRO_mtscatac_1.10k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if 10k/cistopic_qc_out_CONSENSUS/BRO_mtscatac_2.10k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if 10k/cistopic_qc_out_CONSENSUS/CNA_10xmultiome_1.10k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if 10k/cistopic_qc_out_CONSENSUS/CNA_10xmultiome_2.10k__metadata_bc.pkl exist...
	Metadata exists! Skipping...
Checking if 10k/cistopic_qc_out_CONSENSUS/CNA_10xv11_1.10k__metadata_b

In [40]:
# Peak file for every pending sample, inserted in sorted sample-name order.
regions_sub_dict = {
    sample_key: regions_paths_dict[sample_key]
    for sample_key in sorted(fragments_sub_dict)
}

In [41]:
# Display the peak file selected for each pending sample.
regions_sub_dict

{'BIO_ddseq_1.30k': 'SCREEN_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_2.30k': 'SCREEN_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_3.30k': 'SCREEN_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_4.30k': 'SCREEN_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_1.30k': 'SCREEN_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_2.30k': 'SCREEN_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_1.30k': 'SCREEN_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_2.30k': 'SCREEN_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_1.30k': 'SCREEN_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_2.30k': 'SCREEN_peaks/CNA_10xv11_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_3.30k': 'SCREEN_peaks/CNA_10xv11_3.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 

In [42]:
# Sanity check: both sub-dictionaries must cover exactly the same samples.
# The symmetric difference lists every sample present in only one of them, so
# an empty set means the key sets are identical. (A bare `==` comparison on a
# non-final line is evaluated and discarded, and a one-sided difference misses
# samples that are only in fragments_sub_dict.)
set(regions_sub_dict) ^ set(fragments_sub_dict)

set()

In [43]:
# Samples that have both a fragments file and a peak file.
common = set(fragments_sub_dict) & set(regions_sub_dict)

In [44]:
# Display the metadata pickle path recorded for each pending sample.
metadata_out_dict

{'BIO_ddseq_1.30k': '30k/cistopic_qc_out_CONSENSUS/BIO_ddseq_1.30k__metadata_bc.pkl',
 'BIO_ddseq_2.30k': '30k/cistopic_qc_out_CONSENSUS/BIO_ddseq_2.30k__metadata_bc.pkl',
 'BIO_ddseq_3.30k': '30k/cistopic_qc_out_CONSENSUS/BIO_ddseq_3.30k__metadata_bc.pkl',
 'BIO_ddseq_4.30k': '30k/cistopic_qc_out_CONSENSUS/BIO_ddseq_4.30k__metadata_bc.pkl',
 'BRO_mtscatac_1.30k': '30k/cistopic_qc_out_CONSENSUS/BRO_mtscatac_1.30k__metadata_bc.pkl',
 'BRO_mtscatac_2.30k': '30k/cistopic_qc_out_CONSENSUS/BRO_mtscatac_2.30k__metadata_bc.pkl',
 'CNA_10xmultiome_1.30k': '30k/cistopic_qc_out_CONSENSUS/CNA_10xmultiome_1.30k__metadata_bc.pkl',
 'CNA_10xmultiome_2.30k': '30k/cistopic_qc_out_CONSENSUS/CNA_10xmultiome_2.30k__metadata_bc.pkl',
 'CNA_10xv11_1.30k': '30k/cistopic_qc_out_CONSENSUS/CNA_10xv11_1.30k__metadata_bc.pkl',
 'CNA_10xv11_2.30k': '30k/cistopic_qc_out_CONSENSUS/CNA_10xv11_2.30k__metadata_bc.pkl',
 'CNA_10xv11_3.30k': '30k/cistopic_qc_out_CONSENSUS/CNA_10xv11_3.30k__metadata_bc.pkl',
 'CNA_10xv11

In [45]:
# Keep only the samples present in both dictionaries; both are rebuilt in the
# iteration order of `common`.
regions_sub_dict = {
    sample_key: regions_sub_dict[sample_key] for sample_key in common
}
fragments_sub_dict = {
    sample_key: fragments_sub_dict[sample_key] for sample_key in common
}

In [46]:
# Shut down any Ray instance left over from an earlier run before starting a new one.
# NOTE(review): `ray` is never imported explicitly in this notebook; presumably it
# enters the namespace through `from pycisTopic.qc import *` - confirm, or import it.
ray.shutdown()

In [49]:
# Compute cisTopic QC statistics for every pending sample and pickle the results.
# Samples are processed in blocks of at most n_cores samples; after each block,
# the per-sample barcode metadata and profile data are written to the paths in
# metadata_out_dict.
n_cores = 64
if regions_sub_dict:
    samples_sub = list(regions_sub_dict.keys())
    # Consecutive slices of at most n_cores sample names each.
    blocks = [samples_sub[i : i + n_cores] for i in range(0, len(samples_sub), n_cores)]
    for samples_torun_in_block in blocks:
        fragments_sub_dict_block = {
            key: fragments_sub_dict[key] for key in samples_torun_in_block
        }
        regions_sub_dict_block = {
            key: regions_sub_dict[key] for key in samples_torun_in_block
        }

        metadata_bc_dict, profile_data_dict = compute_qc_stats(
            fragments_dict=fragments_sub_dict_block,
            tss_annotation=annotation,
            stats=[
                "barcode_rank_plot",
                "duplicate_rate",
                "insert_size_distribution",
                "profile_tss",
                "frip",
            ],
            label_list=None,
            path_to_regions=regions_sub_dict_block,
            n_cpu=n_cores,
            valid_bc=None,
            n_frag=10,
            n_bc=None,
            tss_flank_window=2000,
            tss_window=50,
            tss_minimum_signal_window=100,
            tss_rolling_window=10,
            # min_norm=0.2,
            remove_duplicates=True,
        )

        # Release the Ray instance before the next block.
        # NOTE(review): `ray` is not imported explicitly in this notebook;
        # presumably it comes from `from pycisTopic.qc import *` - confirm.
        ray.shutdown()
        print(f"Dumping files in...")
        for sample in sorted(metadata_bc_dict.keys()):
            metadata_bc_dict[sample]["sample_id"] = sample
            # Suffix every index entry with the sample name.
            metadata_bc_dict[sample].index = [
                x + "___" + sample for x in list(metadata_bc_dict[sample].index)
            ]

            metadata_path = metadata_out_dict[sample]
            profile_path = metadata_path.replace(
                "__metadata_bc.pkl", "__profile_data.pkl"
            )
            # Create the output directory if needed; open(..., "wb") does not
            # create missing parent directories, which would otherwise discard
            # the finished QC computation with a FileNotFoundError.
            os.makedirs(os.path.dirname(metadata_path) or ".", exist_ok=True)

            with open(metadata_path, "wb") as f:
                pickle.dump(metadata_bc_dict[sample], f, protocol=4)

            with open(profile_path, "wb") as f:
                pickle.dump(profile_data_dict[sample], f, protocol=4)
else:
    print("All samples already processed.")

2022-12-08 13:42:01,495 cisTopic     INFO     n_cpu is larger than the number of samples. Setting n_cpu to the number of samples


2022-12-08 13:42:05,026	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(compute_qc_stats_ray pid=44289)[0m 2022-12-08 13:42:10,030 cisTopic     INFO     Reading SAN_10xmultiome_1.30k
[2m[36m(compute_qc_stats_ray pid=44277)[0m 2022-12-08 13:42:10,079 cisTopic     INFO     Reading VIB_hydrop_21.30k
[2m[36m(compute_qc_stats_ray pid=44258)[0m 2022-12-08 13:42:10,079 cisTopic     INFO     Reading CNA_10xv11_4.30k
[2m[36m(compute_qc_stats_ray pid=44264)[0m 2022-12-08 13:42:10,145 cisTopic     INFO     Reading CNA_10xv11_2.30k
[2m[36m(compute_qc_stats_ray pid=44287)[0m 2022-12-08 13:42:10,161 cisTopic     INFO     Reading VIB_hydrop_12.30k
[2m[36m(compute_qc_stats_ray pid=44280)[0m 2022-12-08 13:42:10,213 cisTopic     INFO     Reading VIB_10xmultiome_1.30k
[2m[36m(compute_qc_stats_ray pid=44292)[0m 2022-12-08 13:42:10,207 cisTopic     INFO     Reading EPF_hydrop_1.30k
[2m[36m(compute_qc_stats_ray pid=44268)[0m 2022-12-08 13:42:10,209 cisTopic     INFO     Reading BIO_ddseq_3.30k
[2m[36m(compute_qc_stats_ray pid=44276)[0m 2022-12