# pycisTopic analysis

Full dataset, using CONSENSUS peaks.

In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import pickle
import os
import glob
%load_ext nb_black

<IPython.core.display.Javascript object>

# Set downsampling rate

In [2]:
!pwd

/lustre1/project/stg_00090/scatac_benchmark/full_4_merged


<IPython.core.display.Javascript object>

In [3]:
wdir = f'/lustre1/project/stg_00090/scatac_benchmark/full_4_merged'
os.chdir(wdir)

<IPython.core.display.Javascript object>

## Create the cisTopic objects for each sample

Make a new output dir

In [4]:
cistopic_objects_out = 'cistopic_objects'
if not os.path.exists(os.path.join(wdir, cistopic_objects_out)):
    os.makedirs(os.path.join(wdir, cistopic_objects_out))

<IPython.core.display.Javascript object>

Create a dictionary with fragments files for each sample

In [5]:
filenames = sorted(glob.glob('../1_data_repository/full_fragments/*.fragments.tsv.gz'))
fragments_dict = {}
for filename in filenames:
    sample = filename.split('/')[-1].split('.fragments.tsv.gz')[0]
    fragments_dict[sample] = filename
fragments_dict

{'BIO_ddseq_1.FULL': '../1_data_repository/full_fragments/BIO_ddseq_1.FULL.fragments.tsv.gz',
 'BIO_ddseq_2.FULL': '../1_data_repository/full_fragments/BIO_ddseq_2.FULL.fragments.tsv.gz',
 'BIO_ddseq_3.FULL': '../1_data_repository/full_fragments/BIO_ddseq_3.FULL.fragments.tsv.gz',
 'BIO_ddseq_4.FULL': '../1_data_repository/full_fragments/BIO_ddseq_4.FULL.fragments.tsv.gz',
 'BRO_mtscatac_1.FULL': '../1_data_repository/full_fragments/BRO_mtscatac_1.FULL.fragments.tsv.gz',
 'BRO_mtscatac_2.FULL': '../1_data_repository/full_fragments/BRO_mtscatac_2.FULL.fragments.tsv.gz',
 'CNA_10xmultiome_1.FULL': '../1_data_repository/full_fragments/CNA_10xmultiome_1.FULL.fragments.tsv.gz',
 'CNA_10xmultiome_2.FULL': '../1_data_repository/full_fragments/CNA_10xmultiome_2.FULL.fragments.tsv.gz',
 'CNA_10xv11_1.FULL': '../1_data_repository/full_fragments/CNA_10xv11_1.FULL.fragments.tsv.gz',
 'CNA_10xv11_2.FULL': '../1_data_repository/full_fragments/CNA_10xv11_2.FULL.fragments.tsv.gz',
 'CNA_10xv11_3.FULL'

<IPython.core.display.Javascript object>

In [6]:
metadata_bc_sub_dict = {x.split('/')[-1].split(f'__')[0]: x for x in sorted(glob.glob("cistopic_qc_out_MASTER/*metadata_bc.pkl"))}
metadata_bc_sub_dict

{'BIO_ddseq_1.FULL': 'cistopic_qc_out_MASTER/BIO_ddseq_1.FULL__metadata_bc.pkl',
 'BIO_ddseq_2.FULL': 'cistopic_qc_out_MASTER/BIO_ddseq_2.FULL__metadata_bc.pkl',
 'BIO_ddseq_3.FULL': 'cistopic_qc_out_MASTER/BIO_ddseq_3.FULL__metadata_bc.pkl',
 'BIO_ddseq_4.FULL': 'cistopic_qc_out_MASTER/BIO_ddseq_4.FULL__metadata_bc.pkl',
 'BRO_mtscatac_1.FULL': 'cistopic_qc_out_MASTER/BRO_mtscatac_1.FULL__metadata_bc.pkl',
 'BRO_mtscatac_2.FULL': 'cistopic_qc_out_MASTER/BRO_mtscatac_2.FULL__metadata_bc.pkl',
 'CNA_10xmultiome_1.FULL': 'cistopic_qc_out_MASTER/CNA_10xmultiome_1.FULL__metadata_bc.pkl',
 'CNA_10xmultiome_2.FULL': 'cistopic_qc_out_MASTER/CNA_10xmultiome_2.FULL__metadata_bc.pkl',
 'CNA_10xv11_1.FULL': 'cistopic_qc_out_MASTER/CNA_10xv11_1.FULL__metadata_bc.pkl',
 'CNA_10xv11_2.FULL': 'cistopic_qc_out_MASTER/CNA_10xv11_2.FULL__metadata_bc.pkl',
 'CNA_10xv11_3.FULL': 'cistopic_qc_out_MASTER/CNA_10xv11_3.FULL__metadata_bc.pkl',
 'CNA_10xv11_4.FULL': 'cistopic_qc_out_MASTER/CNA_10xv11_4.FULL__me

<IPython.core.display.Javascript object>

metadata_bc_sub_dict.pop('VIB_hydrop_1.FULL')
metadata_bc_sub_dict.pop('VIB_hydrop_2.FULL')

In [7]:
bc_passing_filters_sub_dict = {x.split('/')[-1].split(f'_bc_passing_filters_otsu.pkl')[0]: x for x in sorted(glob.glob("selected_barcodes/*otsu.pkl"))}
bc_passing_filters_sub_dict

{'BIO_ddseq_1.FULL': 'selected_barcodes/BIO_ddseq_1.FULL_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_2.FULL': 'selected_barcodes/BIO_ddseq_2.FULL_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_3.FULL': 'selected_barcodes/BIO_ddseq_3.FULL_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_4.FULL': 'selected_barcodes/BIO_ddseq_4.FULL_bc_passing_filters_otsu.pkl',
 'BRO_mtscatac_1.FULL': 'selected_barcodes/BRO_mtscatac_1.FULL_bc_passing_filters_otsu.pkl',
 'BRO_mtscatac_2.FULL': 'selected_barcodes/BRO_mtscatac_2.FULL_bc_passing_filters_otsu.pkl',
 'CNA_10xmultiome_1.FULL': 'selected_barcodes/CNA_10xmultiome_1.FULL_bc_passing_filters_otsu.pkl',
 'CNA_10xmultiome_2.FULL': 'selected_barcodes/CNA_10xmultiome_2.FULL_bc_passing_filters_otsu.pkl',
 'CNA_10xv11_1.FULL': 'selected_barcodes/CNA_10xv11_1.FULL_bc_passing_filters_otsu.pkl',
 'CNA_10xv11_2.FULL': 'selected_barcodes/CNA_10xv11_2.FULL_bc_passing_filters_otsu.pkl',
 'CNA_10xv11_3.FULL': 'selected_barcodes/CNA_10xv11_3.FULL_bc_passing_filters_otsu.pkl',
 

<IPython.core.display.Javascript object>

Read bed files for SCREEN regions and blacklist (blacklisted regions in genome where many reads can map)

In [8]:
regions = "../full_3_cistopic_consensus/master_peaks/all.FULL.master_peaks.occurrence_filtered7.bed"

<IPython.core.display.Javascript object>

In [9]:
os.path.exists(regions)

True

<IPython.core.display.Javascript object>

In [10]:
# path_to_regions = '../0_resources/regions/V2.hg38-rDHS-Unfiltered.blacklisted.bed'
path_to_blacklist = '../0_resources/regions/hg38-blacklist.v2.bed'

<IPython.core.display.Javascript object>

Create cistopic objects for each sample. If pandas crashes, increase the number of partitions. This is necessary for the largest files.

In the following command, it is important that the barcode syntax matches the fragments, the metadata and the bc_passing filters. That is why the sample name is removed from metadata and bc passing filters.

In [11]:
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments

<IPython.core.display.Javascript object>

In [12]:
# end test

<IPython.core.display.Javascript object>

In [13]:
import ray
ray.shutdown()

<IPython.core.display.Javascript object>

In [14]:
n_cores = 10
for sample in metadata_bc_sub_dict.keys():
    cto_path = os.path.join(cistopic_objects_out, f"{sample}__cto.pkl")
    if not os.path.exists(cto_path):
        print(f"{cto_path} does not exist, generating...")

        infile = open(metadata_bc_sub_dict[sample], 'rb')
        metadata_bc = pickle.load(infile)
        metadata_bc.index = [bc.split("___")[0] for bc in metadata_bc.index]
        infile.close()
        infile = open(bc_passing_filters_sub_dict[sample], 'rb')
        bc_passing_filters = pickle.load(infile)
        infile.close()
        bc_passing_filters_fixed = [bc.split("___")[0] for bc in bc_passing_filters]

        cto = create_cistopic_object_from_fragments(path_to_fragments=fragments_dict[sample],
                                                        path_to_regions=regions,
                                                        path_to_blacklist=path_to_blacklist,
                                                        metrics=metadata_bc,
                                                        valid_bc=bc_passing_filters_fixed,
                                                        n_cpu=n_cores,
                                                        partition=10,
                                                        project=sample)

        cto_path = os.path.join(cistopic_objects_out, f"{sample}__cto.pkl")
        print(f"Writing {sample} cto in {cto_path}...")

        with open(
            cto_path, "wb"
        ) as f:
            pickle.dump(cto, f, protocol=4)
    else:
        print(f"{cto_path} already exists, skipping...")

cistopic_objects/BIO_ddseq_1.FULL__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_2.FULL__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_3.FULL__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_4.FULL__cto.pkl already exists, skipping...
cistopic_objects/BRO_mtscatac_1.FULL__cto.pkl already exists, skipping...
cistopic_objects/BRO_mtscatac_2.FULL__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xmultiome_1.FULL__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xmultiome_2.FULL__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_1.FULL__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_2.FULL__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_3.FULL__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_4.FULL__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_5.FULL__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv2_1.FULL__cto.pkl already exi

<IPython.core.display.Javascript object>

# Merge

In [15]:
from pycisTopic.cistopic_class import *

<IPython.core.display.Javascript object>

In [16]:
cto_paths_list = glob.glob('cistopic_objects/*_cto.pkl')
cto_paths_list

['cistopic_objects/VIB_10xv2_1.FULL__cto.pkl',
 'cistopic_objects/CNA_mtscatac_1.FULL__cto.pkl',
 'cistopic_objects/EPF_hydrop_4.FULL__cto.pkl',
 'cistopic_objects/STA_10xv11_1.FULL__cto.pkl',
 'cistopic_objects/CNA_10xmultiome_1.FULL__cto.pkl',
 'cistopic_objects/TXG_10xv11_1.FULL__cto.pkl',
 'cistopic_objects/CNA_10xv11_1.FULL__cto.pkl',
 'cistopic_objects/MDC_mtscatac_1.FULL__cto.pkl',
 'cistopic_objects/CNA_10xmultiome_2.FULL__cto.pkl',
 'cistopic_objects/SAN_10xmultiome_1.FULL__cto.pkl',
 'cistopic_objects/OHS_s3atac_1.FULL__cto.pkl',
 'cistopic_objects/EPF_hydrop_1.FULL__cto.pkl',
 'cistopic_objects/HAR_ddseq_1.FULL__cto.pkl',
 'cistopic_objects/UCS_ddseq_1.FULL__cto.pkl',
 'cistopic_objects/VIB_10xmultiome_2.FULL__cto.pkl',
 'cistopic_objects/BRO_mtscatac_2.FULL__cto.pkl',
 'cistopic_objects/EPF_hydrop_2.FULL__cto.pkl',
 'cistopic_objects/CNA_10xv11_3.FULL__cto.pkl',
 'cistopic_objects/UCS_ddseq_2.FULL__cto.pkl',
 'cistopic_objects/BIO_ddseq_3.FULL__cto.pkl',
 'cistopic_objects/

<IPython.core.display.Javascript object>

In [17]:
cto_path_new = 'cistopic_objects_master/master_all_1.FULL__cto.pkl'
if not os.path.exists(cto_path_new):
    cto_list = []
    for file in cto_paths_list:
        print(file)
        with open(file, 'rb') as f:
            cto = pickle.load(f)

        cto_list.append(cto)

    cto_merged = merge(cto_list)
    with open(
        cto_path_new, "wb"
    ) as f:
        pickle.dump(cto_merged, f, protocol=4)
else:
    print(cto_path + ' exists')

cistopic_objects/VIB_hydrop_2.FULL__cto.pkl exists


<IPython.core.display.Javascript object>

# Run Scrublet

In [18]:
import scrublet as scr
import pandas as pd
import matplotlib.pyplot as plt

<IPython.core.display.Javascript object>

In [19]:
cto_dict = {x.split('/')[-1].split('__cto.pkl')[0]:x for x in sorted(glob.glob('cistopic_objects_master/*cto.pkl'))}
cto_dict

{'master_all_1.FULL': 'cistopic_objects_master/master_all_1.FULL__cto.pkl'}

<IPython.core.display.Javascript object>

In [20]:
fragments_sub_dict = {}
scrubcto_dict = {}
scrub_threshold = 0.4
scrub_name_suffix = "0-4"
#regions_sub_dict = {}
for sample in cto_dict:
    cto = os.path.join('cistopic_objects_master', sample + f"__cto.scrublet{scrub_name_suffix}.pkl")
    print(f"Checking if {cto} exist...")
    if os.path.exists(cto):
        print(f"\t{cto} exists! Skipping...")
    else:
        print(f"\t{cto} does not exist, adding to subdict to generate")
        scrubcto_dict[sample] = cto

Checking if cistopic_objects_master/master_all_1.FULL__cto.scrublet0-4.pkl exist...
	cistopic_objects_master/master_all_1.FULL__cto.scrublet0-4.pkl exists! Skipping...


<IPython.core.display.Javascript object>

In [21]:
if cto_dict != {}:
    for sample in scrubcto_dict.keys():
        with open(cto_dict[sample], 'rb') as f:
            cto = pickle.load(f)
        print(f"Loaded {cto_dict[sample]}")
        scrub = scr.Scrublet(cto.fragment_matrix.T, expected_doublet_rate=0.1)
        doublet_scores, predicted_doublets = scrub.scrub_doublets()
        # scrub.plot_histogram()
        scrub.call_doublets(threshold=scrub_threshold)
        a,b = scrub.plot_histogram()
        a.suptitle(sample)
        plt.savefig(f'plots_qc/{sample}.scrublet_histogram.png', dpi=150, facecolor='white')
        plt.show()
        plt.close()

        print('Running UMAP...')
        scrub.set_embedding('UMAP', scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
        scrub.plot_embedding('UMAP', order_points=True);
        plt.savefig(f'plots_qc/{sample}.scrublet_umap.png', dpi=150, facecolor='white')
        plt.show()
        plt.close()

        scrublet = pd.DataFrame([scrub.doublet_scores_obs_, scrub.predicted_doublets_], 
            columns=cto.cell_names,
            index=['Doublet_scores_fragments', 'Predicted_doublets_fragments']).T
        cto.add_cell_data(scrublet)
        n_scrublet_doublets = len(cto.cell_data["Predicted_doublets_fragments"] == True)
        print(f"{sample}: found doublets: \n\t({n_scrublet_doublets}: Scrublet)"
             )

        cto_path_new = scrubcto_dict[sample]

        with open(
            cto_path_new, "wb"
        ) as f:
            pickle.dump(cto, f, protocol=4)

        print('\n')
        
else:
    print("All samples already processed.")

<IPython.core.display.Javascript object>

## Cell filtering

### Read in Freemuxlet doublet predictions

In [22]:
f_fmx = 'out_fmx/genotype_concordance_unified.txt'
fmx = pd.read_csv(f_fmx, sep='\t')
fmx

Unnamed: 0,INT_ID,BARCODE,NUM.SNPS,NUM.READS,DROPLET.TYPE,BEST.GUESS,BEST.LLK,NEXT.GUESS,NEXT.LLK,DIFF.LLK.BEST.NEXT,...,SNG.BEST.LLK,SNG.NEXT.GUESS,SNG.NEXT.LLK,SNG.ONLY.POSTERIOR,DBL.BEST.GUESS,DBL.BEST.LLK,DIFF.LLK.SNG.DBL,ubarcode,replicate,sample
CNA_mtscatac_1.FULL.1,0,GTCCCAGGAAAGGCCA,2339,2339,SNG,11,-4102.33,10,-4403.45,301.12,...,-4102.33,0,-5006.85,1.00000,10,-4403.45,301.12,CNA_mtscatac_1.FULL#GTCCCAGGAAAGGCCA,CNA_mtscatac_1.FULL,sampleA
CNA_mtscatac_1.FULL.2,1,TGCGAGGCTCTGCCAG,1738,1738,SNG,00,-3068.55,10,-3307.30,238.75,...,-3068.55,1,-3808.83,1.00000,10,-3307.30,238.75,CNA_mtscatac_1.FULL#TGCGAGGCTCTGCCAG,CNA_mtscatac_1.FULL,sampleB
CNA_mtscatac_1.FULL.3,2,AGCGTATACCAATAGA,5432,5432,SNG,11,-9844.04,10,-10194.35,350.30,...,-9844.04,0,-11116.48,1.00000,10,-10194.35,350.30,CNA_mtscatac_1.FULL#AGCGTATACCAATAGA,CNA_mtscatac_1.FULL,sampleA
CNA_mtscatac_1.FULL.4,3,TGTCGAAACGACCGAA,1196,1196,SNG,11,-2089.42,10,-2240.71,151.28,...,-2089.42,0,-2537.00,1.00000,10,-2240.71,151.28,CNA_mtscatac_1.FULL#TGTCGAAACGACCGAA,CNA_mtscatac_1.FULL,sampleA
CNA_mtscatac_1.FULL.5,4,AGGTCAGGACGGTTGC,6424,6424,SNG,11,-11307.49,10,-12145.77,838.28,...,-11307.49,0,-13886.90,1.00000,10,-12145.77,838.28,CNA_mtscatac_1.FULL#AGGTCAGGACGGTTGC,CNA_mtscatac_1.FULL,sampleA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VIB_hydrop_22.FULL.965,964,TAACCGAATCTCTCGCACGA,227,227,SNG,00,-404.03,10,-432.16,28.13,...,-404.03,1,-495.29,1.00000,10,-432.16,28.13,VIB_hydrop_22.FULL#TAACCGAATCTCTCGCACGA,VIB_hydrop_22.FULL,sampleB
VIB_hydrop_22.FULL.966,965,AACGATACCATGCTTGCCGT,175,175,SNG,00,-315.26,10,-335.18,19.92,...,-315.26,1,-385.62,1.00000,10,-335.18,19.92,VIB_hydrop_22.FULL#AACGATACCATGCTTGCCGT,VIB_hydrop_22.FULL,sampleB
VIB_hydrop_22.FULL.967,966,GATTGGCGGTGTCCAGGCTT,274,274,SNG,00,-481.27,10,-513.34,32.07,...,-481.27,1,-582.89,1.00000,10,-513.34,32.07,VIB_hydrop_22.FULL#GATTGGCGGTGTCCAGGCTT,VIB_hydrop_22.FULL,sampleB
VIB_hydrop_22.FULL.968,967,CGTTGACCACGGTAGGTAGG,172,172,SNG,11,-312.80,10,-321.39,8.59,...,-312.80,0,-342.76,1.00000,10,-321.39,8.59,VIB_hydrop_22.FULL#CGTTGACCACGGTAGGTAGG,VIB_hydrop_22.FULL,sampleA


<IPython.core.display.Javascript object>

In [23]:
fmx['DROPLET.TYPE'].unique()

array(['SNG', 'DBL'], dtype=object)

<IPython.core.display.Javascript object>

In [24]:
# create an annotation df:
fmx['cell_names'] = fmx['BARCODE'] + "___" + fmx['replicate']

fmx_annot = fmx[['DROPLET.TYPE','sample','cell_names']].copy().set_index('cell_names')
fmx_annot.columns = ['fmx_droplet_type','fmx_sample']
fmx_annot

Unnamed: 0_level_0,fmx_droplet_type,fmx_sample
cell_names,Unnamed: 1_level_1,Unnamed: 2_level_1
GTCCCAGGAAAGGCCA___CNA_mtscatac_1.FULL,SNG,sampleA
TGCGAGGCTCTGCCAG___CNA_mtscatac_1.FULL,SNG,sampleB
AGCGTATACCAATAGA___CNA_mtscatac_1.FULL,SNG,sampleA
TGTCGAAACGACCGAA___CNA_mtscatac_1.FULL,SNG,sampleA
AGGTCAGGACGGTTGC___CNA_mtscatac_1.FULL,SNG,sampleA
...,...,...
TAACCGAATCTCTCGCACGA___VIB_hydrop_22.FULL,SNG,sampleB
AACGATACCATGCTTGCCGT___VIB_hydrop_22.FULL,SNG,sampleB
GATTGGCGGTGTCCAGGCTT___VIB_hydrop_22.FULL,SNG,sampleB
CGTTGACCACGGTAGGTAGG___VIB_hydrop_22.FULL,SNG,sampleA


<IPython.core.display.Javascript object>

# Read scrublet ctos

In [25]:
scrubcto_dict = {x.split('/')[-1].split(f'__cto.scrublet{scrub_name_suffix}.pkl')[0]:x for x in sorted(glob.glob(f'cistopic_objects_master/*__cto.scrublet{scrub_name_suffix}.pkl'))}
scrubcto_dict

{'master_all_1.FULL': 'cistopic_objects_master/master_all_1.FULL__cto.scrublet0-4.pkl'}

<IPython.core.display.Javascript object>

In [26]:
len(scrubcto_dict)

1

<IPython.core.display.Javascript object>

In [27]:
non_fmx_samples = ['BIO_ddseq_1.FULL',
'BIO_ddseq_2.FULL',
'BIO_ddseq_3.FULL',
'BIO_ddseq_4.FULL',
'OHS_s3atac_2.FULL',
'TXG_10xv11_1.FULL',
'TXG_10xv2_1.FULL',
'TXG_10xv2_2.FULL',
'UCS_ddseq_1.FULL',
'UCS_ddseq_2.FULL']

<IPython.core.display.Javascript object>

In [28]:
fmx_samples_run = set([x.split('___')[-1] for x in fmx_annot.index])
fmx_samples_run

{'BRO_mtscatac_1.FULL',
 'BRO_mtscatac_2.FULL',
 'CNA_10xmultiome_1.FULL',
 'CNA_10xmultiome_2.FULL',
 'CNA_10xv11_1.FULL',
 'CNA_10xv11_2.FULL',
 'CNA_10xv11_3.FULL',
 'CNA_10xv11_4.FULL',
 'CNA_10xv11_5.FULL',
 'CNA_10xv2_1.FULL',
 'CNA_10xv2_2.FULL',
 'CNA_hydrop_1.FULL',
 'CNA_hydrop_2.FULL',
 'CNA_hydrop_3.FULL',
 'CNA_mtscatac_1.FULL',
 'CNA_mtscatac_2.FULL',
 'EPF_hydrop_1.FULL',
 'EPF_hydrop_2.FULL',
 'EPF_hydrop_3.FULL',
 'EPF_hydrop_4.FULL',
 'HAR_ddseq_1.FULL',
 'HAR_ddseq_2.FULL',
 'MDC_mtscatac_1.FULL',
 'MDC_mtscatac_2.FULL',
 'SAN_10xmultiome_1.FULL',
 'SAN_10xmultiome_2.FULL',
 'STA_10xv11_1.FULL',
 'STA_10xv11_2.FULL',
 'VIB_10xmultiome_1.FULL',
 'VIB_10xmultiome_2.FULL',
 'VIB_10xv1_1.FULL',
 'VIB_10xv1_2.FULL',
 'VIB_10xv2_1.FULL',
 'VIB_10xv2_2.FULL',
 'VIB_hydrop_11.FULL',
 'VIB_hydrop_12.FULL',
 'VIB_hydrop_21.FULL',
 'VIB_hydrop_22.FULL'}

<IPython.core.display.Javascript object>

In [29]:
fmx_annot

Unnamed: 0_level_0,fmx_droplet_type,fmx_sample
cell_names,Unnamed: 1_level_1,Unnamed: 2_level_1
GTCCCAGGAAAGGCCA___CNA_mtscatac_1.FULL,SNG,sampleA
TGCGAGGCTCTGCCAG___CNA_mtscatac_1.FULL,SNG,sampleB
AGCGTATACCAATAGA___CNA_mtscatac_1.FULL,SNG,sampleA
TGTCGAAACGACCGAA___CNA_mtscatac_1.FULL,SNG,sampleA
AGGTCAGGACGGTTGC___CNA_mtscatac_1.FULL,SNG,sampleA
...,...,...
TAACCGAATCTCTCGCACGA___VIB_hydrop_22.FULL,SNG,sampleB
AACGATACCATGCTTGCCGT___VIB_hydrop_22.FULL,SNG,sampleB
GATTGGCGGTGTCCAGGCTT___VIB_hydrop_22.FULL,SNG,sampleB
CGTTGACCACGGTAGGTAGG___VIB_hydrop_22.FULL,SNG,sampleA


<IPython.core.display.Javascript object>

In [30]:
fmx_samples = sorted(list(set([x.split('.')[0] + '.' + x.split('.')[1] for x in fmx.index])))
removed_bcs_dict = {}
# for sample in ["CNA_10xv11_1.FULL"]:

for sample in scrubcto_dict.keys():
    print(f"{sample}")
    cto_path = scrubcto_dict[sample]
    newcto_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.singlets.pkl')
    if os.path.exists(newcto_path):
        print(f"\t{newcto_path} exists! Skipping...")
        
    else:
        with open(cto_path, 'rb') as f:
            cto = pickle.load(f)

            print(f"\tLoaded {cto_path}, adding fmx data and removing fmx + scr doublets")
            cto.cell_data['fmx_droplet_type'] = fmx_annot['fmx_droplet_type']
            cto.cell_data['fmx_sample'] = fmx_annot['fmx_sample']
            
            removed_bcs_dict[sample] = {
                'dbl_scrublet': cto.cell_data.barcode[cto.cell_data.Predicted_doublets_fragments == True].tolist(),
                'dbl_fmx': cto.cell_data.barcode[cto.cell_data.fmx_droplet_type == 'DBL'].tolist(),    
            }

            # Remove doublets 
            singlets = cto.cell_data[
                (cto.cell_data.Predicted_doublets_fragments == False) & 
                (cto.cell_data.fmx_droplet_type != 'DBL')
            ].index.tolist()
            print(f"\t{sample}: Removing {len(cto.cell_names)-len(singlets)} cells")

            fmx_doublets = set(cto.cell_data[cto.cell_data['fmx_droplet_type'] == 'DBL'].index)
            scr_doublets = set(cto.cell_data[cto.cell_data['Predicted_doublets_fragments'] == True].index)

            fmx_doublets_unique = fmx_doublets - scr_doublets
            scr_doublets_unique = scr_doublets - fmx_doublets
            common_doublets = fmx_doublets.intersection(scr_doublets)

            print(f"\t\t{len(fmx_doublets_unique)} unique fmx doublets")
            print(f"\t\t{len(scr_doublets_unique)} unique scr doublets")
            print(f"\t\t{len(common_doublets)} common doublets")

            fmx_doublets_unique_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.fmx_doublets_unique.txt')
            with open(fmx_doublets_unique_path, 'w') as f:
                for x in fmx_doublets_unique:
                    f.write(f"{x}\n")

            scr_doublets_unique_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.scr_doublets_unique.txt')
            with open(scr_doublets_unique_path, 'w') as f:
                for x in scr_doublets_unique:
                    f.write(f"{x}\n")

            common_doublets_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.common_doublets.txt')
            with open(common_doublets_path, 'w') as f:
                for x in common_doublets:
                    f.write(f"{x}\n")

            # Subset cisTopic object
            cto.subset(singlets)

            # save
            with open(newcto_path, "wb") as f:
                pickle.dump(cto, f, protocol=4)

            removed_bcs_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.singlets.removed_bcs.pkl')
            with open(removed_bcs_path, "wb") as f:
                pickle.dump(removed_bcs_path, f, protocol=4)

            cell_data_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.singlets.cell_data.tsv')
            cto.cell_data.to_csv(cell_data_path)

            print(f"\tWrote {newcto_path}, doublet lists and {cell_data_path}\n")

master_all_1.FULL
	cistopic_objects_master/master_all_1.FULL__cto.scrublet0-4.fmx.singlets.pkl exists! Skipping...


<IPython.core.display.Javascript object>

# Write looms from these ctos

In [31]:
import loompy as lp

<IPython.core.display.Javascript object>

In [32]:
scrub_name_suffix = "0-4"
scrubcto_dict = {x.split('/')[-1].split(f'__cto.scrublet{scrub_name_suffix}.pkl')[0]:x for x in sorted(glob.glob(f'cistopic_objects_master/*__cto.scrublet{scrub_name_suffix}.*singlets.pkl'))}
scrubcto_dict

{'master_all_1.FULL__cto.scrublet0-4.fmx.singlets.pkl': 'cistopic_objects_master/master_all_1.FULL__cto.scrublet0-4.fmx.singlets.pkl'}

<IPython.core.display.Javascript object>

In [33]:
len(scrubcto_dict)

1

<IPython.core.display.Javascript object>

In [34]:
loom_out = 'cell_region_looms'
if not os.path.exists(os.path.join(loom_out)):
    os.makedirs(os.path.join(loom_out))

<IPython.core.display.Javascript object>

In [35]:
loom_path = 'cell_region_looms/master_all_1.FULL.scrublet0-4.singlets.ID.loom'
if not os.path.exists(loom_path):
    cto_path = 'cistopic_objects_master/master_all_1.FULL__cto.scrublet0-4.fmx.singlets.pkl'
    with open(cto_path, 'rb') as f:
        cto = pickle.load(f)

    lp.create(
        filename = loom_path,
        layers=cto.fragment_matrix,
        row_attrs={ 'Gene': cto.region_names }, 
        col_attrs={ 'CellID': [x.replace('___', '__').split('.')[0] for x in cto.cell_names] }
    )
    print(f"Finished {loom_path} loom writing")

Finished cell_region_looms/master_all_1.FULL.scrublet0-4.singlets.ID.loom loom writing


<IPython.core.display.Javascript object>

In [36]:
loom_path = 'cell_region_looms/master_all_1.FULL_cto.scrublet0-4.singlets.fixedcb.loom'
if not os.path.exists(loom_path):
    cto_path = 'cistopic_objects_master/master_all_1.FULL__cto.scrublet0-4.fmx.singlets.pkl'
    with open(cto_path, 'rb') as f:
        cto = pickle.load(f)

    print(f"Loaded filtered cistopic object {sample}")
    lp.create(
        filename = loom_path,
        layers=cto.fragment_matrix,
        row_attrs={ 'Gene': cto.region_names }, 
        col_attrs={ 'CellID': [x for x in cto.cell_names ] }
    )
    print(f"Finished {loom_path} loom writing")

Loaded filtered cistopic object master_all_1.FULL
Finished cell_region_looms/master_all_1.FULL_cto.scrublet0-4.singlets.fixedcb.loom loom writing


<IPython.core.display.Javascript object>