# pycisTopic analysis

Full dataset, using CONSENSUS peaks.

In [7]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import pickle
import os
import glob
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [8]:
!pwd

/lustre1/project/stg_00090/scatac_benchmark/public_3_cistopic_qc


<IPython.core.display.Javascript object>

In [9]:
# Absolute project directory on the cluster file system. Changing into it makes
# the relative paths used in the following cells (e.g. 'cistopic_objects',
# '../1_data_repository/...') resolve against this directory.
wdir = '/lustre1/project/stg_00090/scatac_benchmark/public_3_cistopic_qc'
os.chdir(wdir)

<IPython.core.display.Javascript object>

## Create the cisTopic objects for each sample

Make a new output dir

In [10]:
# Directory (relative to wdir) in which the pickled cisTopic objects are stored.
cistopic_objects_out = 'cistopic_objects'
# exist_ok=True makes the cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(os.path.join(wdir, cistopic_objects_out), exist_ok=True)

<IPython.core.display.Javascript object>

Create a dictionary with fragments files for each sample

In [11]:
# Map each sample name to its fragments file. The sample name is the file's
# base name up to the '.fragments.tsv.gz' suffix.
filenames = sorted(glob.glob('../1_data_repository/publicdata_full_fragments_vsn/*.fragments.tsv.gz'))
fragments_dict = {
    fragments_path.rsplit('/', 1)[-1].partition('.fragments.tsv.gz')[0]: fragments_path
    for fragments_path in filenames
}
fragments_dict

{'BIO_ddseq_m1c1.FULL': '../1_data_repository/publicdata_full_fragments_vsn/BIO_ddseq_m1c1.FULL.fragments.tsv.gz',
 'BIO_ddseq_m1c2.FULL': '../1_data_repository/publicdata_full_fragments_vsn/BIO_ddseq_m1c2.FULL.fragments.tsv.gz',
 'BIO_ddseq_m1c3.FULL': '../1_data_repository/publicdata_full_fragments_vsn/BIO_ddseq_m1c3.FULL.fragments.tsv.gz',
 'BIO_ddseq_m1c4.FULL': '../1_data_repository/publicdata_full_fragments_vsn/BIO_ddseq_m1c4.FULL.fragments.tsv.gz',
 'BIO_ddseq_m1c5.FULL': '../1_data_repository/publicdata_full_fragments_vsn/BIO_ddseq_m1c5.FULL.fragments.tsv.gz',
 'BIO_ddseq_m1c6.FULL': '../1_data_repository/publicdata_full_fragments_vsn/BIO_ddseq_m1c6.FULL.fragments.tsv.gz',
 'BIO_ddseq_m1c7.FULL': '../1_data_repository/publicdata_full_fragments_vsn/BIO_ddseq_m1c7.FULL.fragments.tsv.gz',
 'BIO_ddseq_m1c8.FULL': '../1_data_repository/publicdata_full_fragments_vsn/BIO_ddseq_m1c8.FULL.fragments.tsv.gz',
 'BIO_ddseq_m2c1.FULL': '../1_data_repository/publicdata_full_fragments_vsn/BIO_

<IPython.core.display.Javascript object>

In [12]:
# Index the per-sample barcode metadata pickles by sample name, i.e. by the
# part of the file name that precedes the first '__'.
metadata_bc_sub_dict = {}
for metadata_path in sorted(glob.glob("cistopic_qc_out/*metadata_bc.pkl")):
    metadata_file = metadata_path.rsplit('/', 1)[-1]
    metadata_bc_sub_dict[metadata_file.partition('__')[0]] = metadata_path
metadata_bc_sub_dict

{'BIO_ddseq_m1c1.FULL': 'cistopic_qc_out/BIO_ddseq_m1c1.FULL__metadata_bc.pkl',
 'BIO_ddseq_m1c2.FULL': 'cistopic_qc_out/BIO_ddseq_m1c2.FULL__metadata_bc.pkl',
 'BIO_ddseq_m1c3.FULL': 'cistopic_qc_out/BIO_ddseq_m1c3.FULL__metadata_bc.pkl',
 'BIO_ddseq_m1c4.FULL': 'cistopic_qc_out/BIO_ddseq_m1c4.FULL__metadata_bc.pkl',
 'BIO_ddseq_m1c5.FULL': 'cistopic_qc_out/BIO_ddseq_m1c5.FULL__metadata_bc.pkl',
 'BIO_ddseq_m1c6.FULL': 'cistopic_qc_out/BIO_ddseq_m1c6.FULL__metadata_bc.pkl',
 'BIO_ddseq_m1c7.FULL': 'cistopic_qc_out/BIO_ddseq_m1c7.FULL__metadata_bc.pkl',
 'BIO_ddseq_m1c8.FULL': 'cistopic_qc_out/BIO_ddseq_m1c8.FULL__metadata_bc.pkl',
 'BIO_ddseq_m2c1.FULL': 'cistopic_qc_out/BIO_ddseq_m2c1.FULL__metadata_bc.pkl',
 'BIO_ddseq_m2c2.FULL': 'cistopic_qc_out/BIO_ddseq_m2c2.FULL__metadata_bc.pkl',
 'BIO_ddseq_m2c3.FULL': 'cistopic_qc_out/BIO_ddseq_m2c3.FULL__metadata_bc.pkl',
 'BIO_ddseq_m2c4.FULL': 'cistopic_qc_out/BIO_ddseq_m2c4.FULL__metadata_bc.pkl',
 'OHS_s3atac_mouse.FULL': 'cistopic_qc_o

<IPython.core.display.Javascript object>

In [13]:
# Index the pickled lists of barcodes passing the filters by sample name, i.e.
# by the file name with its '_bc_passing_filters_otsu.pkl' suffix removed.
bc_passing_filters_sub_dict = {}
for barcodes_path in sorted(glob.glob("selected_barcodes/*otsu.pkl")):
    barcodes_file = barcodes_path.rsplit('/', 1)[-1]
    bc_passing_filters_sub_dict[barcodes_file.partition('_bc_passing_filters_otsu.pkl')[0]] = barcodes_path
bc_passing_filters_sub_dict

{'BIO_ddseq_m1c1.FULL': 'selected_barcodes/BIO_ddseq_m1c1.FULL_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_m1c2.FULL': 'selected_barcodes/BIO_ddseq_m1c2.FULL_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_m1c3.FULL': 'selected_barcodes/BIO_ddseq_m1c3.FULL_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_m1c4.FULL': 'selected_barcodes/BIO_ddseq_m1c4.FULL_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_m1c5.FULL': 'selected_barcodes/BIO_ddseq_m1c5.FULL_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_m1c6.FULL': 'selected_barcodes/BIO_ddseq_m1c6.FULL_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_m1c7.FULL': 'selected_barcodes/BIO_ddseq_m1c7.FULL_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_m1c8.FULL': 'selected_barcodes/BIO_ddseq_m1c8.FULL_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_m2c1.FULL': 'selected_barcodes/BIO_ddseq_m2c1.FULL_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_m2c2.FULL': 'selected_barcodes/BIO_ddseq_m2c2.FULL_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_m2c3.FULL': 'selected_barcodes/BIO_ddseq_m2c3.FULL_bc_pass

<IPython.core.display.Javascript object>

Set the paths to the BED files with the SCREEN regions and the genome blacklist (regions of the genome to which an excessive number of reads map).

In [14]:
# BED file with the regions that are passed as `path_to_regions` when the
# cisTopic objects are created (mm10 rDHS set, per the file name).
regions_path = '/lustre1/project/stg_00090/scatac_benchmark/0_resources/regions/V2.mm10-rDHS-Unfiltered.blacklisted.bed'

<IPython.core.display.Javascript object>

In [15]:
# Blacklist BED file (mm10, per the file name), passed as `path_to_blacklist`
# when the cisTopic objects are created.
# hg38 regions file, kept for reference only (not used in this notebook):
#   path_to_regions = '../0_resources/regions/V2.hg38-rDHS-Unfiltered.blacklisted.bed'
path_to_blacklist = '../0_resources/regions/mm10-blacklist.v2.bed'

<IPython.core.display.Javascript object>

Create a cisTopic object for each sample. If pandas crashes, increase the number of partitions (the `partition` argument); this is necessary for the largest fragments files.

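In the following command, it is important that the barcode syntax is identical in the fragments file, the metadata and the list of barcodes passing the filters. That is why the sample name (everything from `___` onwards) is stripped from the barcodes in the metadata and in the list of barcodes passing the filters.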

In [16]:
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments

<IPython.core.display.Javascript object>

In [17]:
import ray
# Shut down any Ray runtime still attached to this kernel.
# NOTE(review): presumably done so that the cisTopic object creation below can
# start its own Ray workers from a clean state -- confirm.
ray.shutdown()

<IPython.core.display.Javascript object>

In [18]:
# Create one cisTopic object per sample and cache it as a pickle in
# cistopic_objects_out. Samples whose pickle already exists are skipped, so the
# cell can be re-run cheaply.
n_cores = 70  # passed as n_cpu to create_cistopic_object_from_fragments
n_cto_generated = 0
for sample in metadata_bc_sub_dict:
    cto_path = os.path.join(cistopic_objects_out, f"{sample}__cto.pkl")
    if os.path.exists(cto_path):
        print(f"{cto_path} already exists, skipping...")
        continue

    print(f"{cto_path} does not exist, generating...")

    # Per-barcode QC metadata. Only the part of each barcode before "___" is
    # kept so that the index uses the same barcode syntax as the fragments file.
    with open(metadata_bc_sub_dict[sample], 'rb') as infile:
        metadata_bc = pickle.load(infile)
    metadata_bc.index = [bc.split("___")[0] for bc in metadata_bc.index]

    # Barcodes that passed the QC filters, normalised in the same way.
    with open(bc_passing_filters_sub_dict[sample], 'rb') as infile:
        bc_passing_filters = pickle.load(infile)
    bc_passing_filters_fixed = [bc.split("___")[0] for bc in bc_passing_filters]

    cto = create_cistopic_object_from_fragments(path_to_fragments=fragments_dict[sample],
                                                path_to_regions=regions_path,
                                                path_to_blacklist=path_to_blacklist,
                                                metrics=metadata_bc,
                                                valid_bc=bc_passing_filters_fixed,
                                                n_cpu=n_cores,
                                                partition=10,  # increase if pandas crashes on large files
                                                project=sample)

    print(f"Writing {sample} cto in {cto_path}...")
    with open(cto_path, "wb") as f:
        pickle.dump(cto, f, protocol=4)
    n_cto_generated += 1

# A `for ... else` clause runs whenever the loop finishes without `break`, so it
# would print this message even right after generating objects. Report it only
# when nothing had to be generated.
if n_cto_generated == 0:
    print("All samples already processed.")

cistopic_objects/BIO_ddseq_m1c1.FULL__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_m1c2.FULL__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_m1c3.FULL__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_m1c4.FULL__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_m1c5.FULL__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_m1c6.FULL__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_m1c7.FULL__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_m1c8.FULL__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_m2c1.FULL__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_m2c2.FULL__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_m2c3.FULL__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_m2c4.FULL__cto.pkl already exists, skipping...
cistopic_objects/OHS_s3atac_mouse.FULL__cto.pkl already exists, skipping...
cistopic_objects/TXG_10xmultiome_e18

<IPython.core.display.Javascript object>

# Run Scrublet

In [19]:
import scrublet as scr
import pandas as pd
import matplotlib.pyplot as plt

<IPython.core.display.Javascript object>

In [20]:
# Collect the cached cisTopic objects, keyed by sample name (the file name with
# its '__cto.pkl' suffix removed).
cto_dict = {}
for cto_pickle_path in sorted(glob.glob('cistopic_objects/*__cto.pkl')):
    cto_pickle_file = cto_pickle_path.rsplit('/', 1)[-1]
    cto_dict[cto_pickle_file.partition('__cto.pkl')[0]] = cto_pickle_path
cto_dict

{'BIO_ddseq_m1c1.FULL': 'cistopic_objects/BIO_ddseq_m1c1.FULL__cto.pkl',
 'BIO_ddseq_m1c2.FULL': 'cistopic_objects/BIO_ddseq_m1c2.FULL__cto.pkl',
 'BIO_ddseq_m1c3.FULL': 'cistopic_objects/BIO_ddseq_m1c3.FULL__cto.pkl',
 'BIO_ddseq_m1c4.FULL': 'cistopic_objects/BIO_ddseq_m1c4.FULL__cto.pkl',
 'BIO_ddseq_m1c5.FULL': 'cistopic_objects/BIO_ddseq_m1c5.FULL__cto.pkl',
 'BIO_ddseq_m1c6.FULL': 'cistopic_objects/BIO_ddseq_m1c6.FULL__cto.pkl',
 'BIO_ddseq_m1c7.FULL': 'cistopic_objects/BIO_ddseq_m1c7.FULL__cto.pkl',
 'BIO_ddseq_m1c8.FULL': 'cistopic_objects/BIO_ddseq_m1c8.FULL__cto.pkl',
 'BIO_ddseq_m2c1.FULL': 'cistopic_objects/BIO_ddseq_m2c1.FULL__cto.pkl',
 'BIO_ddseq_m2c2.FULL': 'cistopic_objects/BIO_ddseq_m2c2.FULL__cto.pkl',
 'BIO_ddseq_m2c3.FULL': 'cistopic_objects/BIO_ddseq_m2c3.FULL__cto.pkl',
 'BIO_ddseq_m2c4.FULL': 'cistopic_objects/BIO_ddseq_m2c4.FULL__cto.pkl',
 'OHS_s3atac_mouse.FULL': 'cistopic_objects/OHS_s3atac_mouse.FULL__cto.pkl',
 'TXG_10xmultiome_e18mousebrainfresh.FULL': 'ci

<IPython.core.display.Javascript object>

In [21]:
# Scrublet settings and the to-do list of samples that still need a
# Scrublet-annotated cisTopic object.
fragments_sub_dict = {}
scrubcto_dict = {}          # sample -> path of the Scrublet-annotated pickle to create
scrub_threshold = 0.4       # doublet-score threshold used when calling doublets
scrub_name_suffix = "0-4"   # tag used in the output file names
for sample in cto_dict:
    scrub_cto_path = os.path.join('cistopic_objects', sample + f"__cto.scrublet{scrub_name_suffix}.pkl")
    print(f"Checking if {scrub_cto_path} exist...")
    if not os.path.exists(scrub_cto_path):
        print(f"\t{scrub_cto_path} does not exist, adding to subdict to generate")
        scrubcto_dict[sample] = scrub_cto_path
        continue
    print(f"\t{scrub_cto_path} exists! Skipping...")

Checking if cistopic_objects/BIO_ddseq_m1c1.FULL__cto.scrublet0-4.pkl exist...
	cistopic_objects/BIO_ddseq_m1c1.FULL__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BIO_ddseq_m1c2.FULL__cto.scrublet0-4.pkl exist...
	cistopic_objects/BIO_ddseq_m1c2.FULL__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BIO_ddseq_m1c3.FULL__cto.scrublet0-4.pkl exist...
	cistopic_objects/BIO_ddseq_m1c3.FULL__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BIO_ddseq_m1c4.FULL__cto.scrublet0-4.pkl exist...
	cistopic_objects/BIO_ddseq_m1c4.FULL__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BIO_ddseq_m1c5.FULL__cto.scrublet0-4.pkl exist...
	cistopic_objects/BIO_ddseq_m1c5.FULL__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BIO_ddseq_m1c6.FULL__cto.scrublet0-4.pkl exist...
	cistopic_objects/BIO_ddseq_m1c6.FULL__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BIO_ddseq_m1c7.FULL__ct

<IPython.core.display.Javascript object>

In [22]:
# Run Scrublet on every sample that still lacks a Scrublet-annotated cisTopic
# object (the samples collected in scrubcto_dict), store the scores and doublet
# calls in the object's cell data and pickle the annotated object.
#
# The guard must test scrubcto_dict (the to-do list), not cto_dict: cto_dict
# holds every sample, so testing it never reaches the "already processed"
# message when there is nothing left to do.
if scrubcto_dict:
    # plt.savefig does not create missing directories.
    os.makedirs('plots_qc', exist_ok=True)

    for sample in scrubcto_dict:
        with open(cto_dict[sample], 'rb') as f:
            cto = pickle.load(f)
        print(f"Loaded {cto_dict[sample]}")

        # The fragment matrix is transposed before being handed to Scrublet.
        # NOTE(review): assumes fragment_matrix is regions x cells and Scrublet
        # expects cells x features -- confirm.
        scrub = scr.Scrublet(cto.fragment_matrix.T, expected_doublet_rate=0.1)
        # Compute the doublet scores, then call doublets with the fixed
        # threshold from the configuration instead of the automatic one.
        scrub.scrub_doublets()
        scrub.call_doublets(threshold=scrub_threshold)

        hist_fig, _ = scrub.plot_histogram()
        hist_fig.suptitle(sample)
        plt.savefig(f'plots_qc/{sample}.scrublet_histogram.png', dpi=150, facecolor='white')
        plt.show()
        plt.close()

        print('Running UMAP...')
        scrub.set_embedding('UMAP', scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
        scrub.plot_embedding('UMAP', order_points=True)
        plt.savefig(f'plots_qc/{sample}.scrublet_umap.png', dpi=150, facecolor='white')
        plt.show()
        plt.close()

        # One row per cell, indexed by cell name, with the Scrublet score and call.
        scrublet = pd.DataFrame([scrub.doublet_scores_obs_, scrub.predicted_doublets_],
            columns=cto.cell_names,
            index=['Doublet_scores_fragments', 'Predicted_doublets_fragments']).T
        cto.add_cell_data(scrublet)

        # Count the cells flagged as doublets. len() of the boolean mask would
        # return the total number of cells, so sum the mask instead.
        n_scrublet_doublets = int((cto.cell_data["Predicted_doublets_fragments"] == True).sum())
        print(f"{sample}: found doublets: \n\t({n_scrublet_doublets}: Scrublet)"
             )

        cto_path_new = scrubcto_dict[sample]

        with open(cto_path_new, "wb") as f:
            pickle.dump(cto, f, protocol=4)

        print('\n')

else:
    print("All samples already processed.")

<IPython.core.display.Javascript object>

# Read scrublet ctos

In [23]:
# Rebuild scrubcto_dict from disk so that it lists every Scrublet-annotated
# cisTopic object that exists, keyed by sample name.
scrubcto_dict = {}
for scrub_pickle_path in sorted(glob.glob(f'cistopic_objects/*__cto.scrublet{scrub_name_suffix}.pkl')):
    scrub_pickle_file = scrub_pickle_path.rsplit('/', 1)[-1]
    scrubcto_dict[scrub_pickle_file.partition(f'__cto.scrublet{scrub_name_suffix}.pkl')[0]] = scrub_pickle_path
scrubcto_dict

{'BIO_ddseq_m1c1.FULL': 'cistopic_objects/BIO_ddseq_m1c1.FULL__cto.scrublet0-4.pkl',
 'BIO_ddseq_m1c2.FULL': 'cistopic_objects/BIO_ddseq_m1c2.FULL__cto.scrublet0-4.pkl',
 'BIO_ddseq_m1c3.FULL': 'cistopic_objects/BIO_ddseq_m1c3.FULL__cto.scrublet0-4.pkl',
 'BIO_ddseq_m1c4.FULL': 'cistopic_objects/BIO_ddseq_m1c4.FULL__cto.scrublet0-4.pkl',
 'BIO_ddseq_m1c5.FULL': 'cistopic_objects/BIO_ddseq_m1c5.FULL__cto.scrublet0-4.pkl',
 'BIO_ddseq_m1c6.FULL': 'cistopic_objects/BIO_ddseq_m1c6.FULL__cto.scrublet0-4.pkl',
 'BIO_ddseq_m1c7.FULL': 'cistopic_objects/BIO_ddseq_m1c7.FULL__cto.scrublet0-4.pkl',
 'BIO_ddseq_m1c8.FULL': 'cistopic_objects/BIO_ddseq_m1c8.FULL__cto.scrublet0-4.pkl',
 'BIO_ddseq_m2c1.FULL': 'cistopic_objects/BIO_ddseq_m2c1.FULL__cto.scrublet0-4.pkl',
 'BIO_ddseq_m2c2.FULL': 'cistopic_objects/BIO_ddseq_m2c2.FULL__cto.scrublet0-4.pkl',
 'BIO_ddseq_m2c3.FULL': 'cistopic_objects/BIO_ddseq_m2c3.FULL__cto.scrublet0-4.pkl',
 'BIO_ddseq_m2c4.FULL': 'cistopic_objects/BIO_ddseq_m2c4.FULL__ct

<IPython.core.display.Javascript object>

In [24]:
# sample -> {'dbl_scrublet': [barcodes of the cells removed as Scrublet doublets]};
# filled by the doublet-removal loop in the next cell.
removed_bcs_dict = {}

<IPython.core.display.Javascript object>

In [25]:
# For every Scrublet-annotated cisTopic object, drop the cells called as
# doublets and write:
#   *.fmx.singlets.pkl                  the subsetted cisTopic object
#   *.fmx.scr_doublets_unique.txt       one removed cell name per line
#   *.fmx.singlets.removed_bcs.pkl      the removed barcodes of this sample
#   *.fmx.singlets.cell_data.tsv        cell data of the remaining cells
# Samples whose singlets pickle already exists are skipped.
for sample in scrubcto_dict:
    print(f"{sample}")
    cto_path = scrubcto_dict[sample]
    newcto_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.singlets.pkl')
    if os.path.exists(newcto_path):
        print(f"\t{newcto_path} exists! Skipping...")
        continue

    # Close the input file as soon as the object is loaded, so the handle is
    # not held (and `f` not re-bound) while the outputs are written.
    with open(cto_path, 'rb') as f:
        cto = pickle.load(f)

    print(f"\tLoaded {cto_path}, removing scr doublets")
    removed_bcs_dict[sample] = {
        'dbl_scrublet': cto.cell_data.barcode[cto.cell_data.Predicted_doublets_fragments == True].tolist()
    }

    # Cells to keep: those explicitly called as non-doublets.
    singlets = cto.cell_data[
        (cto.cell_data.Predicted_doublets_fragments == False)
    ].index.tolist()
    print(f"\t{sample}: Removing {len(cto.cell_names)-len(singlets)} cells")

    scr_doublets = set(cto.cell_data[cto.cell_data['Predicted_doublets_fragments'] == True].index)
    print(f"\t\t{len(scr_doublets)} scr doublets")

    # Written in sorted order so the file is reproducible between runs
    # (iteration order of a set is arbitrary).
    scr_doublets_unique_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.scr_doublets_unique.txt')
    with open(scr_doublets_unique_path, 'w') as f:
        for cell_name in sorted(scr_doublets):
            f.write(f"{cell_name}\n")

    # Subset cisTopic object
    cto.subset(singlets)

    # save
    with open(newcto_path, "wb") as f:
        pickle.dump(cto, f, protocol=4)

    # Store the removed barcodes themselves; dumping removed_bcs_path would
    # only pickle the file name.
    removed_bcs_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.singlets.removed_bcs.pkl')
    with open(removed_bcs_path, "wb") as f:
        pickle.dump(removed_bcs_dict[sample], f, protocol=4)

    # NOTE(review): the file is named .tsv but to_csv writes comma-separated
    # values by default; left unchanged because readers of this file may rely
    # on it -- consider sep='\t' once they are checked.
    cell_data_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.singlets.cell_data.tsv')
    cto.cell_data.to_csv(cell_data_path)

    print(f"\tWrote {newcto_path}, doublet lists and {cell_data_path}\n")

BIO_ddseq_m1c1.FULL
	Loaded cistopic_objects/BIO_ddseq_m1c1.FULL__cto.scrublet0-4.pkl, removing scr doublets
	BIO_ddseq_m1c1.FULL: Removing 150 cells
		150 scr doublets
	Wrote cistopic_objects/BIO_ddseq_m1c1.FULL__cto.scrublet0-4.fmx.singlets.pkl, doublet lists and cistopic_objects/BIO_ddseq_m1c1.FULL__cto.scrublet0-4.fmx.singlets.cell_data.tsv

BIO_ddseq_m1c2.FULL
	Loaded cistopic_objects/BIO_ddseq_m1c2.FULL__cto.scrublet0-4.pkl, removing scr doublets
	BIO_ddseq_m1c2.FULL: Removing 178 cells
		178 scr doublets
	Wrote cistopic_objects/BIO_ddseq_m1c2.FULL__cto.scrublet0-4.fmx.singlets.pkl, doublet lists and cistopic_objects/BIO_ddseq_m1c2.FULL__cto.scrublet0-4.fmx.singlets.cell_data.tsv

BIO_ddseq_m1c3.FULL
	Loaded cistopic_objects/BIO_ddseq_m1c3.FULL__cto.scrublet0-4.pkl, removing scr doublets
	BIO_ddseq_m1c3.FULL: Removing 164 cells
		164 scr doublets
	Wrote cistopic_objects/BIO_ddseq_m1c3.FULL__cto.scrublet0-4.fmx.singlets.pkl, doublet lists and cistopic_objects/BIO_ddseq_m1c3.FULL__

<IPython.core.display.Javascript object>