In [1]:
# Parameters
SAMPLES = "Broad_mito_1,VIB_Hydrop_1,Broad_mito_2,VIB_Hydrop_2,Broad_1,Sanger_1,Broad_2,CNAG_1,Stanford_2,Stanford_1,CNAG_2,s3atac,VIB_2,VIB_1,Sanger_2,VIB_Hydrop_1,VIB_Hydrop_2"
METADATAPKL = "out_qc_all/data/pycistopic/qc/benchmark__metadata.pickle"
PROFDATAPKL = "out_qc_all/data/pycistopic/qc/benchmark__profile_data.pickle"
WORKFLOW_PARAMETERS = "{\"container\":\"/staging/leuven/stg_00002/lcb/cflerin/containers/aertslab-pycistopic-latest.sif\",\"biomart_annot\":{\"biomart_dataset_name\":\"hsapiens_gene_ensembl\",\"biomart_host\":\"http://www.ensembl.org\"},\"macs2_call_peaks\":{\"gsize\":\"hs\",\"qvalue\":0.01,\"extsize\":146,\"shift\":73,\"keepdup\":\"all\"},\"compute_qc_stats\":{\"n_frag\":100,\"tss_flank_window\":2000,\"tss_window\":50,\"tss_minimum_signal_window\":100,\"tss_rolling_window\":10,\"min_norm\":0.1},\"call_cells\":{\"report_ipynb\":\"/src/pycistopic/bin/pycisTopic_qc_report_template.ipynb\",\"use_density_coloring_on_scatterplot\":true,\"use_detailed_title_on_scatterplot\":true,\"filter_frags_lower\":{\"Broad_1\":3000,\"Broad_2\":3000,\"Broad_mito_1\":3500,\"Broad_mito_2\":3500,\"CNAG_1\":2000,\"CNAG_2\":2500,\"Sanger_1\":7000,\"Sanger_2\":7000,\"VIB_1\":3500,\"VIB_2\":3500,\"Stanford_1\":7000,\"Stanford_2\":4000,\"atac_pbmc_5k_v1\":3000,\"atac_pbmc_5k_nextgem\":3000,\"pbmc_unsorted_3k\":2000,\"s3atac\":10000,\"VIB_Hydrop_1\":1000,\"VIB_Hydrop_2\":1000},\"filter_tss_lower\":{\"Broad_1\":17,\"Broad_2\":17,\"Broad_mito_1\":12,\"Broad_mito_2\":12,\"CNAG_1\":13,\"CNAG_2\":13,\"Sanger_1\":10,\"Sanger_2\":10,\"VIB_1\":16,\"VIB_2\":13,\"Stanford_1\":10,\"Stanford_2\":10,\"atac_pbmc_5k_v1\":16,\"atac_pbmc_5k_nextgem\":16,\"pbmc_unsorted_3k\":15,\"s3atac\":4,\"VIB_Hydrop_1\":15,\"VIB_Hydrop_2\":15},\"filter_frags_upper\":\"\",\"filter_tss_upper\":\"\",\"filter_frip_lower\":\"\",\"filter_frip_upper\":\"\",\"filter_dup_rate_lower\":\"\",\"filter_dup_rate_upper\":\"\"}}"

# VSN Pipelines: pycisTopic QC report

scATAC-seq quality control and cell calling from pycisTopic (https://github.com/aertslab/pycisTopic)

In [2]:
import pycisTopic
pycisTopic.__version__

'0.1.dev300+g7494158'

In [3]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [4]:
import pybiomart as pbm
import pandas as pd
import pickle
import re
import os
import json

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

%matplotlib inline

In [5]:
params = json.loads(WORKFLOW_PARAMETERS)

sample_ids = SAMPLES.split(',')

print(f"SAMPLES: {sample_ids}")
print(f"pycisTopic parameters: {json.dumps(params, indent=4)}")

SAMPLES: ['Broad_mito_1', 'VIB_Hydrop_1', 'Broad_mito_2', 'VIB_Hydrop_2', 'Broad_1', 'Sanger_1', 'Broad_2', 'CNAG_1', 'Stanford_2', 'Stanford_1', 'CNAG_2', 's3atac', 'VIB_2', 'VIB_1', 'Sanger_2', 'VIB_Hydrop_1', 'VIB_Hydrop_2']
pycisTopic parameters: {
    "container": "/staging/leuven/stg_00002/lcb/cflerin/containers/aertslab-pycistopic-latest.sif",
    "biomart_annot": {
        "biomart_dataset_name": "hsapiens_gene_ensembl",
        "biomart_host": "http://www.ensembl.org"
    },
    "macs2_call_peaks": {
        "gsize": "hs",
        "qvalue": 0.01,
        "extsize": 146,
        "shift": 73,
        "keepdup": "all"
    },
    "compute_qc_stats": {
        "n_frag": 100,
        "tss_flank_window": 2000,
        "tss_window": 50,
        "tss_minimum_signal_window": 100,
        "tss_rolling_window": 10,
        "min_norm": 0.1
    },
    "call_cells": {
        "report_ipynb": "/src/pycistopic/bin/pycisTopic_qc_report_template.ipynb",
        "use_density_coloring_on_scatterpl

In [6]:
# Load metadata
infile = open(METADATAPKL, 'rb')
metadata_bc_dict = pickle.load(infile)
infile.close()

In [7]:
# Load profile data
infile = open(PROFDATAPKL, 'rb')
profile_data_dict = pickle.load(infile)
infile.close()

In [8]:
from collections import OrderedDict
metadata_bc_dict = OrderedDict(sorted(metadata_bc_dict.items()))
profile_data_dict = OrderedDict(sorted(profile_data_dict.items()))

## QC summary

In [9]:
from collections import OrderedDict

In [10]:
alias_dict={
    "Broad_1": "BioRad ATAC 1",
    "Broad_2": "BioRad ATAC 2",
    "Stanford_1": "10x ATAC A1",
    "Stanford_2": "10x ATAC A2",
    "VIB_1": "10x ATAC B1",
    "VIB_2": "10x ATAC B2",
    "CNAG_1": "10x ATAC C1",
    "CNAG_2": "10x ATAC C2",
    "Broad_mito_1": "10x mtATAC 1",
    "Broad_mito_2": "10x mtATAC 2",
    "Sanger_1": "10x Multiome 1",
    "Sanger_2": "10x Multiome 2",
    "VIB_Hydrop_1": "Hydrop ATAC 1",
    "VIB_Hydrop_2": "Hydrop ATAC 2",
    "s3atac": "s3 ATAC",
    #"merged": "Merged"
}

In [11]:
tech_dict = OrderedDict({
    "Broad_1": "BioRad ATAC",
    "Broad_2": "BioRad ATAC",
    "Stanford_1": "10x ATAC A",
    "Stanford_2": "10x ATAC A",
    "VIB_1": "10x ATAC B",
    "VIB_2": "10x ATAC B",
    "CNAG_1": "10x ATAC C",
    "CNAG_2": "10x ATAC C",
    "Broad_mito_1": "10x mtATAC",
    "Broad_mito_2": "10x mtATAC",
    "Sanger_1": "10x Multiome",
    "Sanger_2": "10x Multiome",
    "VIB_Hydrop_1": "Hydrop ATAC",
    "VIB_Hydrop_2": "Hydrop ATAC",
    "s3atac": "s3 ATAC",
    # "merged": "Merged"
})

In [12]:
color_dict = OrderedDict({
    "Broad_mito_1": "#9467bd",
    "Broad_mito_2": "#c5b0d5",
    "CNAG_1": "#d62728",
    "CNAG_2": "#ff9896",
    "Broad_1": "#1f77b4",
    "Broad_2": "#aec7e8",
    "Sanger_1": "#8c564b",
    "Sanger_2": "#c49c94",
    "Stanford_1": "#ff7f0e",
    "Stanford_2": "#ffbb78",
    "VIB_1": "#2ca02c",
    "VIB_2": "#98df8a",
    "VIB_Hydrop_1": "#e377c2",
    "VIB_Hydrop_2": "#f7b6d2",
    "s3atac": "#7f7f7f",
    #"merged": "#111111"
    # "none": 'k'
})

In [13]:
for key in metadata_bc_dict.keys():
    print(key)
    metadata_bc_dict[key]['sample_id'] = key
    metadata_bc_dict[key]['alias'] = alias_dict[key]
    metadata_bc_dict[key]['tech'] = tech_dict[key]
    suffix = '-' + key
    # metadata_bc_dict[key] = metadata_bc_dict[key].set_index(metadata_bc_dict[key].index.astype(str) + suffix)
    metadata_bc_dict[key]['is_cell'] = alias_dict[key]
    metadata_bc_dict[key].is_cell[metadata_bc_dict[key].TSS_enrichment <= params['call_cells']['filter_tss_lower'][key]] = 'none'
    metadata_bc_dict[key].is_cell[metadata_bc_dict[key].Unique_nr_frag <= params['call_cells']['filter_frags_lower'][key]] = 'none'
    n_cells = len(metadata_bc_dict[key].loc[metadata_bc_dict[key]['is_cell'] != 'none'])
    print(f'\t {n_cells}')

Broad_1
	 4204
Broad_2
	 4053
Broad_mito_1
	 3466
Broad_mito_2
	 3302
CNAG_1
	 2696
CNAG_2
	 2712
Sanger_1
	 3142
Sanger_2
	 3814
Stanford_1
	 768
Stanford_2
	 1403
VIB_1
	 2813
VIB_2
	 7336
VIB_Hydrop_1
	 2410
VIB_Hydrop_2
	 2744
s3atac
	 3122


In [14]:
metadata_bc_df = pd.DataFrame()
for key in metadata_bc_dict.keys():
    print(key)
    metadata_bc_df = pd.concat([metadata_bc_df, metadata_bc_dict[key]])

Broad_1
Broad_2
Broad_mito_1
Broad_mito_2
CNAG_1
CNAG_2
Sanger_1
Sanger_2
Stanford_1
Stanford_2
VIB_1
VIB_2
VIB_Hydrop_1
VIB_Hydrop_2
s3atac


In [15]:
color_alias_dict = {alias_dict[key]:color_dict[key] for key in color_dict.keys()}

In [16]:
order = [tech_dict[x] for x in alias_dict.keys()]

In [17]:
order =['BioRad ATAC',
 '10x ATAC A',
 '10x ATAC B',
 '10x ATAC C',
 '10x mtATAC',
 '10x Multiome',
 'Hydrop ATAC',
 's3 ATAC']

# now remake this but in a more controllable fashion

In [18]:
ylim_dict = {
    "TSS_enrichment": [0, 60],
    "FRIP": [0, 1],
    "Dupl_rate": [0, 1]
}

In [19]:
ylabel_dict = {
    "TSS_enrichment": "TSS Enrichment",
    "FRIP": "FRIP",
    "Dupl_rate": "Duplication rate"
}

In [20]:
cutoff_dict = {
    "TSS_enrichment":params['call_cells']['filter_tss_lower'],
    "Unique_nr_frag":params['call_cells']['filter_frags_lower']
}

In [21]:
color_alias_dict = {alias_dict[key]:color_dict[key] for key in color_dict.keys()}
color_alias_dict['none'] = 'k'

In [22]:
sns.set_style('white')

In [23]:
df_sub = pd.DataFrame()
for x in metadata_bc_df['sample_id'].unique():
    print(x)
    df_sub = pd.concat([df_sub, metadata_bc_df[metadata_bc_df['sample_id'] == x].iloc[0]], axis=1)
    # print(f'{df_sub.iloc[0]}')
    # print('\n')
df_sub = df_sub.T

Broad_1
Broad_2
Broad_mito_1
Broad_mito_2
CNAG_1
CNAG_2
Sanger_1
Sanger_2
Stanford_1
Stanford_2
VIB_1
VIB_2
VIB_Hydrop_1
VIB_Hydrop_2
s3atac


In [27]:
x_max = 5000000 # fragments

In [None]:
var_to_plot = ["TSS_enrichment", "FRIP", "Dupl_rate"]
order =['BioRad ATAC',
 '10x ATAC A',
 '10x ATAC B',
 '10x ATAC C',
 '10x mtATAC',
 '10x Multiome',
 'Hydrop ATAC',
 's3 ATAC']

f, axes = plt.subplots(len(var_to_plot), len(order), figsize=(len(order)*3, len(var_to_plot)*3), sharex="col", sharey="row", dpi=300)
for tech in order:
    tech_index = order.index(tech)
    print(f'{tech}, {tech_index}')
    for var in var_to_plot:
        var_index = var_to_plot.index(var)
        print(f'\t{var}, {var_index}')
        
        # plot scatter
        hue_order = reversed(sorted(list(metadata_bc_df[metadata_bc_df['tech'] == tech]["is_cell"].unique())))
        plot = sns.scatterplot(data=df_sub[df_sub['tech'] == tech].sort_values('is_cell', ascending=False), x="Unique_nr_frag", y=var, hue="is_cell", hue_order=hue_order, palette=color_alias_dict, linewidth=0, s=0, edgecolor=None, ax=axes[var_index, tech_index])
        
        # draw cutoff lines, but only when the cutoff is defined
        # if var in cutoff_dict.keys():
            # for sample in metadata_bc_df[metadata_bc_df['tech'] == tech]['sample_id'].unique():
                # axes[var_index, tech_index].axvline(cutoff_dict["Unique_nr_frag"][sample], color=color_dict[sample])
                # print(f'\t\tadding vline at {cutoff_dict["Unique_nr_frag"][sample]}')
                
                # axes[var_index, tech_index].axhline(cutoff_dict[var][sample], color=color_dict[sample])
                # print(f'\t\tadding hline at {cutoff_dict[var][sample]}')
        
        # plot settings
        plot.set(xscale="log")
        axes[var_index, tech_index].set_xlim([100, x_max])
        axes[var_index, tech_index].set_ylim(ylim_dict[var])
        axes[var_index, tech_index].get_legend().remove()
        sns.despine(top=True, right=True, left=False, bottom=False, ax=axes[var_index, tech_index])

        if var_index == 0:
            plot.set_title(tech)
        
        if tech_index == 0:
            plot.set_ylabel(ylabel_dict[var])
        
        # add subtitle with cell counts
        counts = metadata_bc_df[metadata_bc_df['tech'] == tech]['is_cell'].value_counts()
        idx = sorted(counts.index)
        if var == var_to_plot[-1]:
            if tech == 's3 ATAC':
                plot.set_xlabel(f'log(unique fragments)\n{idx[1]}: {counts[idx[1]]} cells', y=-0.01)
            else:
                plot.set_xlabel(f'log(unique fragments)\n{idx[0]}: {counts[idx[0]]} cells\n{idx[1]}: {counts[idx[1]]} cells', y=-0.01)
            
    print('\n')

f.tight_layout()
# f.savefig('plts_pub/scatterplots_compound.png', dpi=300, facecolor='white')
# f.savefig('plts_pub/scatterplots_compound_AGG.png', dpi=300, facecolor='white', backend='AGG')
# f.savefig('plts_pub/scatterplots_compound.tiff', dpi=300, facecolor='white')
# f.savefig('plts_pub/scatterplots_compound_axes_cairo.svg', dpi=300, facecolor='white', backend='Cairo')
f.savefig('plts_pub/scatterplots_compound_axes_SVG.svg')


BioRad ATAC, 0
	TSS_enrichment, 0
	FRIP, 1
	Dupl_rate, 2


10x ATAC A, 1
	TSS_enrichment, 0
	FRIP, 1
	Dupl_rate, 2


10x ATAC B, 2
	TSS_enrichment, 0
	FRIP, 1
	Dupl_rate, 2


10x ATAC C, 3
	TSS_enrichment, 0
	FRIP, 1
	Dupl_rate, 2


10x mtATAC, 4
	TSS_enrichment, 0
	FRIP, 1


In [None]:
var_to_plot = ["TSS_enrichment", "FRIP", "Dupl_rate"]
order =['BioRad ATAC',
 '10x ATAC A',
 '10x ATAC B',
 '10x ATAC C',
 '10x mtATAC',
 '10x Multiome',
 'Hydrop ATAC',
 's3 ATAC']

f, axes = plt.subplots(len(var_to_plot), len(order), figsize=(len(order)*3, len(var_to_plot)*3), sharex="col", sharey="row", dpi=300)
for tech in order:
    tech_index = order.index(tech)
    print(f'{tech}, {tech_index}')
    for var in var_to_plot:
        var_index = var_to_plot.index(var)
        print(f'\t{var}, {var_index}')
        
        # plot scatter
        hue_order = reversed(sorted(list(metadata_bc_df[metadata_bc_df['tech'] == tech]["is_cell"].unique())))
        plot = sns.scatterplot(data=metadata_bc_df[metadata_bc_df['tech'] == tech].sort_values('is_cell', ascending=False), x="Unique_nr_frag", y=var, hue="is_cell", hue_order=hue_order, palette=color_alias_dict, s=1.5, linewidth=0, edgecolor=None, ax=axes[var_index, tech_index])
        
        # draw cutoff lines, but only when the cutoff is defined
        if var in cutoff_dict.keys():
            for sample in metadata_bc_df[metadata_bc_df['tech'] == tech]['sample_id'].unique():
                axes[var_index, tech_index].axvline(cutoff_dict["Unique_nr_frag"][sample], color=color_dict[sample])
                print(f'\t\tadding vline at {cutoff_dict["Unique_nr_frag"][sample]}')
                
                axes[var_index, tech_index].axhline(cutoff_dict[var][sample], color=color_dict[sample])
                print(f'\t\tadding hline at {cutoff_dict[var][sample]}')
        
        # plot settings
        plot.set(xscale="log")
        axes[var_index, tech_index].set_xlim([100, x_max])
        axes[var_index, tech_index].set_ylim(ylim_dict[var])
        axes[var_index, tech_index].get_legend().remove()
        sns.despine(top=True, right=True, left=True, bottom=True, ax=axes[var_index, tech_index])

        if var_index == 0:
            plot.set_title(tech)
        
        if tech_index == 0:
            plot.set_ylabel(ylabel_dict[var])
        
        # add subtitle with cell counts
        counts = metadata_bc_df[metadata_bc_df['tech'] == tech]['is_cell'].value_counts()
        idx = sorted(counts.index)
        if var == var_to_plot[-1]:
            if tech == 's3 ATAC':
                plot.set_xlabel(f'log(unique fragments)\n{idx[1]}: {counts[idx[1]]} cells', y=-0.01)
            else:
                plot.set_xlabel(f'log(unique fragments)\n{idx[0]}: {counts[idx[0]]} cells\n{idx[1]}: {counts[idx[1]]} cells', y=-0.01)
            
    print('\n')

f.tight_layout()
f.savefig('plts_pub/scatterplots_compound_noaxes.png', dpi=300, transparent=True)
# f.savefig('plts_pub/scatterplots_compound_AGG.png', dpi=300, facecolor='white', backend='AGG')
# f.savefig('plts_pub/scatterplots_compound.tiff', dpi=300, facecolor='white')

# full figure

In [None]:
var_to_plot = ["TSS_enrichment", "FRIP", "Dupl_rate"]
order =['BioRad ATAC',
 '10x ATAC A',
 '10x ATAC B',
 '10x ATAC C',
 '10x mtATAC',
 '10x Multiome',
 'Hydrop ATAC',
 's3 ATAC']

f, axes = plt.subplots(len(var_to_plot), len(order), figsize=(len(order)*3, len(var_to_plot)*3), sharex="col", sharey="row", dpi=300)
for tech in order:
    tech_index = order.index(tech)
    print(f'{tech}, {tech_index}')
    for var in var_to_plot:
        var_index = var_to_plot.index(var)
        print(f'\t{var}, {var_index}')
        
        # plot scatter
        hue_order = reversed(sorted(list(metadata_bc_df[metadata_bc_df['tech'] == tech]["is_cell"].unique())))
        plot = sns.scatterplot(data=metadata_bc_df[metadata_bc_df['tech'] == tech].sort_values('is_cell', ascending=False), x="Unique_nr_frag", y=var, hue="is_cell", hue_order=hue_order, palette=color_alias_dict, s=1.5, linewidth=0, edgecolor=None, ax=axes[var_index, tech_index])
        
        # draw cutoff lines, but only when the cutoff is defined
        if var in cutoff_dict.keys():
            for sample in metadata_bc_df[metadata_bc_df['tech'] == tech]['sample_id'].unique():
                axes[var_index, tech_index].axvline(cutoff_dict["Unique_nr_frag"][sample], color=color_dict[sample])
                print(f'\t\tadding vline at {cutoff_dict["Unique_nr_frag"][sample]}')
                
                axes[var_index, tech_index].axhline(cutoff_dict[var][sample], color=color_dict[sample])
                print(f'\t\tadding hline at {cutoff_dict[var][sample]}')
        
        # plot settings
        plot.set(xscale="log")
        axes[var_index, tech_index].set_xlim([100, x_max])
        axes[var_index, tech_index].set_ylim(ylim_dict[var])
        axes[var_index, tech_index].get_legend().remove()
        sns.despine(top=True, right=True, left=False, bottom=False, ax=axes[var_index, tech_index])

        if var_index == 0:
            plot.set_title(tech)
        
        if tech_index == 0:
            plot.set_ylabel(ylabel_dict[var])
        
        # add subtitle with cell counts
        counts = metadata_bc_df[metadata_bc_df['tech'] == tech]['is_cell'].value_counts()
        idx = sorted(counts.index)
        if var == var_to_plot[-1]:
            if tech == 's3 ATAC':
                plot.set_xlabel(f'log(unique fragments)\n{idx[1]}: {counts[idx[1]]} cells', y=-0.01)
            else:
                plot.set_xlabel(f'log(unique fragments)\n{idx[0]}: {counts[idx[0]]} cells\n{idx[1]}: {counts[idx[1]]} cells', y=-0.01)
            
    print('\n')

f.tight_layout()
f.savefig('plts_pub/scatterplots_compound.png', dpi=300)