# pycisTopic analysis

Downsampled dataset, using MASTER peaks.

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import pickle
import os
import glob
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
!pwd

/dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/fixedcells_4_merged


<IPython.core.display.Javascript object>

In [3]:
wdir = f'/dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/fixedcells_4_merged'
os.chdir(wdir)

<IPython.core.display.Javascript object>

## Create the cisTopic objects for each sample

Make a new output dir

In [4]:
cistopic_objects_out = 'cistopic_objects'
if not os.path.exists(os.path.join(wdir, cistopic_objects_out)):
    os.makedirs(os.path.join(wdir, cistopic_objects_out))

<IPython.core.display.Javascript object>

Create a dictionary with fragments files for each sample

In [5]:
filenames = sorted(glob.glob('../1_data_repository/fixedcells_fragments/*.fragments.tsv.gz'))
fragments_dict = {}
for filename in filenames:
    sample = filename.split('/')[-1].split('.fragments.tsv.gz')[0]
    fragments_dict[sample] = filename
fragments_dict

{'BIO_ddseq_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_1.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_2.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_3.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_3.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_4.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_4.FIXEDCELLS.fragments.tsv.gz',
 'BRO_mtscatac_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BRO_mtscatac_1.FIXEDCELLS.fragments.tsv.gz',
 'BRO_mtscatac_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BRO_mtscatac_2.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xmultiome_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xmultiome_1.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xmultiome_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xmultiome_2.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xv11_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragmen

<IPython.core.display.Javascript object>

In [6]:
metadata_bc_sub_dict = {x.split('/')[-1].split(f'__')[0]: x for x in sorted(glob.glob("cistopic_qc_out_MASTER/*metadata_bc.pkl"))}
metadata_bc_sub_dict

{'BIO_ddseq_1.FIXEDCELLS': 'cistopic_qc_out_MASTER/BIO_ddseq_1.FIXEDCELLS__metadata_bc.pkl',
 'BIO_ddseq_2.FIXEDCELLS': 'cistopic_qc_out_MASTER/BIO_ddseq_2.FIXEDCELLS__metadata_bc.pkl',
 'BIO_ddseq_3.FIXEDCELLS': 'cistopic_qc_out_MASTER/BIO_ddseq_3.FIXEDCELLS__metadata_bc.pkl',
 'BIO_ddseq_4.FIXEDCELLS': 'cistopic_qc_out_MASTER/BIO_ddseq_4.FIXEDCELLS__metadata_bc.pkl',
 'BRO_mtscatac_1.FIXEDCELLS': 'cistopic_qc_out_MASTER/BRO_mtscatac_1.FIXEDCELLS__metadata_bc.pkl',
 'BRO_mtscatac_2.FIXEDCELLS': 'cistopic_qc_out_MASTER/BRO_mtscatac_2.FIXEDCELLS__metadata_bc.pkl',
 'CNA_10xmultiome_1.FIXEDCELLS': 'cistopic_qc_out_MASTER/CNA_10xmultiome_1.FIXEDCELLS__metadata_bc.pkl',
 'CNA_10xmultiome_2.FIXEDCELLS': 'cistopic_qc_out_MASTER/CNA_10xmultiome_2.FIXEDCELLS__metadata_bc.pkl',
 'CNA_10xv11_1.FIXEDCELLS': 'cistopic_qc_out_MASTER/CNA_10xv11_1.FIXEDCELLS__metadata_bc.pkl',
 'CNA_10xv11_2.FIXEDCELLS': 'cistopic_qc_out_MASTER/CNA_10xv11_2.FIXEDCELLS__metadata_bc.pkl',
 'CNA_10xv11_3.FIXEDCELLS': 'c

<IPython.core.display.Javascript object>

metadata_bc_sub_dict.pop('VIB_hydrop_1.FIXEDCELLS')
metadata_bc_sub_dict.pop('VIB_hydrop_2.FIXEDCELLS')

In [7]:
bc_passing_filters_sub_dict = {x.split('/')[-1].split(f'_bc_passing_filters_otsu.pkl')[0]: x for x in sorted(glob.glob("selected_barcodes/*otsu.pkl"))}
bc_passing_filters_sub_dict

{'BIO_ddseq_1.FIXEDCELLS': 'selected_barcodes/BIO_ddseq_1.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_2.FIXEDCELLS': 'selected_barcodes/BIO_ddseq_2.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_3.FIXEDCELLS': 'selected_barcodes/BIO_ddseq_3.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_4.FIXEDCELLS': 'selected_barcodes/BIO_ddseq_4.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'BRO_mtscatac_1.FIXEDCELLS': 'selected_barcodes/BRO_mtscatac_1.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'BRO_mtscatac_2.FIXEDCELLS': 'selected_barcodes/BRO_mtscatac_2.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'CNA_10xmultiome_1.FIXEDCELLS': 'selected_barcodes/CNA_10xmultiome_1.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'CNA_10xmultiome_2.FIXEDCELLS': 'selected_barcodes/CNA_10xmultiome_2.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'CNA_10xv11_1.FIXEDCELLS': 'selected_barcodes/CNA_10xv11_1.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'CNA_10xv11_2.FIXEDCELLS': 'selected_barcodes/CNA_10xv11_2.FIXEDCELLS_

<IPython.core.display.Javascript object>

Read bed files for SCREEN regions and blacklist (blacklisted regions in genome where many reads can map)

In [8]:
regions = "../fixedcells_3_cistopic_consensus/master_peaks/all.FIXEDCELLS.master_peaks.occurrence_filtered9.bed"

<IPython.core.display.Javascript object>

In [9]:
# path_to_regions = '../0_resources/regions/V2.hg38-rDHS-Unfiltered.blacklisted.bed'
path_to_blacklist = '../0_resources/regions/hg38-blacklist.v2.bed'

<IPython.core.display.Javascript object>

Create cistopic objects for each sample. If pandas crashes, increase the number of partitions. This is necessary for the largest files.

In the following command, it is important that the barcode syntax matches the fragments, the metadata and the bc_passing filters. That is why the sample name is removed from metadata and bc passing filters.

In [10]:
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments

<IPython.core.display.Javascript object>

In [11]:
import ray
ray.shutdown()

<IPython.core.display.Javascript object>

In [12]:
n_cores = 1
for sample in metadata_bc_sub_dict.keys():
    cto_path = os.path.join(cistopic_objects_out, f"{sample}__cto.pkl")
    if not os.path.exists(cto_path):
        print(f"{cto_path} does not exist, generating...")

        infile = open(metadata_bc_sub_dict[sample], 'rb')
        metadata_bc = pickle.load(infile)
        metadata_bc.index = [bc.split("___")[0] for bc in metadata_bc.index]
        infile.close()
        infile = open(bc_passing_filters_sub_dict[sample], 'rb')
        bc_passing_filters = pickle.load(infile)
        infile.close()
        bc_passing_filters_fixed = [bc.split("___")[0] for bc in bc_passing_filters]

        cto = create_cistopic_object_from_fragments(path_to_fragments=fragments_dict[sample],
                                                        path_to_regions=regions,
                                                        path_to_blacklist=path_to_blacklist,
                                                        metrics=metadata_bc,
                                                        valid_bc=bc_passing_filters_fixed,
                                                        n_cpu=n_cores,
                                                        partition=10,
                                                        project=sample)

        cto_path = os.path.join(cistopic_objects_out, f"{sample}__cto.pkl")
        print(f"Writing {sample} cto in {cto_path}...")

        with open(
            cto_path, "wb"
        ) as f:
            pickle.dump(cto, f, protocol=4)
    else:
        print(f"{cto_path} already exists, skipping...")

cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xmultiome_1.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xmultiome_2.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_1.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_2.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_3.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_4.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_5.FIXEDCELLS__cto.pkl alre

<IPython.core.display.Javascript object>

# Merge

In [13]:
from pycisTopic.cistopic_class import *

<IPython.core.display.Javascript object>

In [14]:
cto_paths_list = glob.glob('cistopic_objects/*_cto.pkl')

<IPython.core.display.Javascript object>

In [15]:
cto_path_new = 'master/master_all_1.FIXEDCELLS.cto.pkl'
if not os.path.exists(cto_path_new):
    cto_list = []
    for file in cto_paths_list:
        print(file)
        with open(file, 'rb') as f:
            cto = pickle.load(f)

        cto_list.append(cto)

    cto_merged = merge(cto_list)
    with open(
        cto_path_new, "wb"
    ) as f:
        pickle.dump(cto_merged, f, protocol=4)
else:
    print(cto_path + ' exists')

cistopic_objects/VIB_hydrop_2.FIXEDCELLS__cto.pkl exists


<IPython.core.display.Javascript object>

# Run Scrublet

In [16]:
import scrublet as scr
import pandas as pd
import matplotlib.pyplot as plt

<IPython.core.display.Javascript object>

In [17]:
cto_dict = {x.split('/')[-1].split('__cto.pkl')[0]:x for x in sorted(glob.glob('cistopic_objects_master/*cto.pkl'))}
cto_dict

{'master_all_1.FIXEDCELLS': 'cistopic_objects_master/master_all_1.FIXEDCELLS__cto.pkl'}

<IPython.core.display.Javascript object>

In [18]:
fragments_sub_dict = {}
scrubcto_dict = {}
scrub_threshold = 0.4
scrub_name_suffix = "0-4"
#regions_sub_dict = {}
for sample in cto_dict:
    cto = os.path.join('cistopic_objects_master', sample + f"__cto.scrublet{scrub_name_suffix}.pkl")
    print(f"Checking if {cto} exist...")
    if os.path.exists(cto):
        print(f"\t{cto} exists! Skipping...")
    else:
        print(f"\t{cto} does not exist, adding to subdict to generate")
        scrubcto_dict[sample] = cto

Checking if cistopic_objects_master/master_all_1.FIXEDCELLS__cto.scrublet0-4.pkl exist...
	cistopic_objects_master/master_all_1.FIXEDCELLS__cto.scrublet0-4.pkl exists! Skipping...


<IPython.core.display.Javascript object>

In [19]:
if cto_dict != {}:
    for sample in scrubcto_dict.keys():
        with open(cto_dict[sample], 'rb') as f:
            cto = pickle.load(f)
        print(f"Loaded {cto_dict[sample]}")
        scrub = scr.Scrublet(cto.fragment_matrix.T, expected_doublet_rate=0.1)
        doublet_scores, predicted_doublets = scrub.scrub_doublets()
        # scrub.plot_histogram()
        scrub.call_doublets(threshold=scrub_threshold)
        a,b = scrub.plot_histogram()
        a.suptitle(sample)
        plt.savefig(f'plots_qc/{sample}.scrublet_histogram.png', dpi=150, facecolor='white')
        plt.show()
        plt.close()

        print('Running UMAP...')
        scrub.set_embedding('UMAP', scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
        scrub.plot_embedding('UMAP', order_points=True);
        plt.savefig(f'plots_qc/{sample}.scrublet_umap.png', dpi=150, facecolor='white')
        plt.show()
        plt.close()

        scrublet = pd.DataFrame([scrub.doublet_scores_obs_, scrub.predicted_doublets_], 
            columns=cto.cell_names,
            index=['Doublet_scores_fragments', 'Predicted_doublets_fragments']).T
        cto.add_cell_data(scrublet)
        n_scrublet_doublets = len(cto.cell_data["Predicted_doublets_fragments"] == True)
        print(f"{sample}: found doublets: \n\t({n_scrublet_doublets}: Scrublet)"
             )

        cto_path_new = scrubcto_dict[sample]

        with open(
            cto_path_new, "wb"
        ) as f:
            pickle.dump(cto, f, protocol=4)

        print('\n')
        
else:
    print("All samples already processed.")

<IPython.core.display.Javascript object>

## Cell filtering

### Read in Freemuxlet doublet predictions

In [20]:
f_fmx = 'out_fmx/genotype_concordance_unified.txt'
fmx = pd.read_csv(f_fmx, sep='\t')
fmx

Unnamed: 0,INT_ID,BARCODE,NUM.SNPS,NUM.READS,DROPLET.TYPE,BEST.GUESS,BEST.LLK,NEXT.GUESS,NEXT.LLK,DIFF.LLK.BEST.NEXT,...,SNG.BEST.LLK,SNG.NEXT.GUESS,SNG.NEXT.LLK,SNG.ONLY.POSTERIOR,DBL.BEST.GUESS,DBL.BEST.LLK,DIFF.LLK.SNG.DBL,ubarcode,replicate,sample
CNA_hydrop_2.FIXEDCELLS.1,0,CGACATTACATAGGAGTCAA,182,182,SNG,11,-324.11,10,-343.05,18.94,...,-324.11,0,-373.28,1.0,10,-343.05,18.94,CNA_hydrop_2.FIXEDCELLS#CGACATTACATAGGAGTCAA,CNA_hydrop_2.FIXEDCELLS,sampleB
CNA_hydrop_2.FIXEDCELLS.2,1,GGCAACCTCTGAGCTAGTAA,209,209,SNG,00,-369.10,10,-390.29,21.19,...,-369.10,1,-426.11,1.0,10,-390.29,21.19,CNA_hydrop_2.FIXEDCELLS#GGCAACCTCTGAGCTAGTAA,CNA_hydrop_2.FIXEDCELLS,sampleA
CNA_hydrop_2.FIXEDCELLS.3,2,CAACACCATTTCTCGCACGA,273,273,SNG,00,-492.20,10,-520.48,28.28,...,-492.20,1,-576.63,1.0,10,-520.48,28.28,CNA_hydrop_2.FIXEDCELLS#CAACACCATTTCTCGCACGA,CNA_hydrop_2.FIXEDCELLS,sampleA
CNA_hydrop_2.FIXEDCELLS.4,3,TGCATGAGGTTACGGACGGT,354,354,SNG,11,-636.69,10,-669.83,33.14,...,-636.69,0,-734.25,1.0,10,-669.83,33.14,CNA_hydrop_2.FIXEDCELLS#TGCATGAGGTTACGGACGGT,CNA_hydrop_2.FIXEDCELLS,sampleB
CNA_hydrop_2.FIXEDCELLS.5,4,ACAGTGAAGATCCAGTGTTC,577,577,SNG,00,-1069.99,10,-1093.26,23.27,...,-1069.99,1,-1173.29,1.0,10,-1093.26,23.27,CNA_hydrop_2.FIXEDCELLS#ACAGTGAAGATCCAGTGTTC,CNA_hydrop_2.FIXEDCELLS,sampleA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VIB_hydrop_12.FIXEDCELLS.1417,1416,GTCGTTGAGAGTGACCAGTA,126,126,SNG,11,-221.86,10,-240.94,19.08,...,-221.86,0,-283.50,1.0,10,-240.94,19.08,VIB_hydrop_12.FIXEDCELLS#GTCGTTGAGAGTGACCAGTA,VIB_hydrop_12.FIXEDCELLS,sampleB
VIB_hydrop_12.FIXEDCELLS.1418,1417,ACCGAAGGCTTTGCAGTTCT,103,103,SNG,11,-185.75,10,-198.81,13.06,...,-185.75,0,-226.20,1.0,10,-198.81,13.06,VIB_hydrop_12.FIXEDCELLS#ACCGAAGGCTTTGCAGTTCT,VIB_hydrop_12.FIXEDCELLS,sampleB
VIB_hydrop_12.FIXEDCELLS.1419,1418,TAGAGCCTGATTGTGTAGGA,115,115,SNG,11,-207.21,10,-219.27,12.06,...,-207.21,0,-244.26,1.0,10,-219.27,12.06,VIB_hydrop_12.FIXEDCELLS#TAGAGCCTGATTGTGTAGGA,VIB_hydrop_12.FIXEDCELLS,sampleB
VIB_hydrop_12.FIXEDCELLS.1420,1419,CAATTGGAGACCACACGGAT,101,101,SNG,11,-177.98,10,-188.29,10.31,...,-177.98,0,-214.34,1.0,10,-188.29,10.31,VIB_hydrop_12.FIXEDCELLS#CAATTGGAGACCACACGGAT,VIB_hydrop_12.FIXEDCELLS,sampleB


<IPython.core.display.Javascript object>

In [21]:
fmx['DROPLET.TYPE'].unique()

array(['SNG', 'DBL', 'AMB'], dtype=object)

<IPython.core.display.Javascript object>

In [22]:
# create an annotation df:
fmx['cell_names'] = fmx['BARCODE'] + "___" + fmx['replicate']

fmx_annot = fmx[['DROPLET.TYPE','sample','cell_names']].copy().set_index('cell_names')
fmx_annot.columns = ['fmx_droplet_type','fmx_sample']
fmx_annot

Unnamed: 0_level_0,fmx_droplet_type,fmx_sample
cell_names,Unnamed: 1_level_1,Unnamed: 2_level_1
CGACATTACATAGGAGTCAA___CNA_hydrop_2.FIXEDCELLS,SNG,sampleB
GGCAACCTCTGAGCTAGTAA___CNA_hydrop_2.FIXEDCELLS,SNG,sampleA
CAACACCATTTCTCGCACGA___CNA_hydrop_2.FIXEDCELLS,SNG,sampleA
TGCATGAGGTTACGGACGGT___CNA_hydrop_2.FIXEDCELLS,SNG,sampleB
ACAGTGAAGATCCAGTGTTC___CNA_hydrop_2.FIXEDCELLS,SNG,sampleA
...,...,...
GTCGTTGAGAGTGACCAGTA___VIB_hydrop_12.FIXEDCELLS,SNG,sampleB
ACCGAAGGCTTTGCAGTTCT___VIB_hydrop_12.FIXEDCELLS,SNG,sampleB
TAGAGCCTGATTGTGTAGGA___VIB_hydrop_12.FIXEDCELLS,SNG,sampleB
CAATTGGAGACCACACGGAT___VIB_hydrop_12.FIXEDCELLS,SNG,sampleB


<IPython.core.display.Javascript object>

# Read scrublet ctos

In [23]:
scrubcto_dict = {x.split('/')[-1].split(f'__cto.scrublet{scrub_name_suffix}.pkl')[0]:x for x in sorted(glob.glob(f'cistopic_objects_master/*__cto.scrublet{scrub_name_suffix}.pkl'))}
scrubcto_dict

{'master_all_1.FIXEDCELLS': 'cistopic_objects_master/master_all_1.FIXEDCELLS__cto.scrublet0-4.pkl'}

<IPython.core.display.Javascript object>

In [24]:
len(scrubcto_dict)

1

<IPython.core.display.Javascript object>

In [25]:
non_fmx_samples = ['BIO_ddseq_1.FIXEDCELLS',
'BIO_ddseq_2.FIXEDCELLS',
'BIO_ddseq_3.FIXEDCELLS',
'BIO_ddseq_4.FIXEDCELLS',
'OHS_s3atac_2.FIXEDCELLS',
'TXG_10xv11_1.FIXEDCELLS',
'TXG_10xv2_1.FIXEDCELLS',
'TXG_10xv2_2.FIXEDCELLS',
'UCS_ddseq_1.FIXEDCELLS',
'UCS_ddseq_2.FIXEDCELLS']

<IPython.core.display.Javascript object>

In [26]:
fmx_samples_run = set([x.split('___')[-1] for x in fmx_annot.index])
fmx_samples_run

{'BRO_mtscatac_1.FIXEDCELLS',
 'BRO_mtscatac_2.FIXEDCELLS',
 'CNA_10xmultiome_1.FIXEDCELLS',
 'CNA_10xmultiome_2.FIXEDCELLS',
 'CNA_10xv11_1.FIXEDCELLS',
 'CNA_10xv11_2.FIXEDCELLS',
 'CNA_10xv11_3.FIXEDCELLS',
 'CNA_10xv11_4.FIXEDCELLS',
 'CNA_10xv11_5.FIXEDCELLS',
 'CNA_10xv2_1.FIXEDCELLS',
 'CNA_10xv2_2.FIXEDCELLS',
 'CNA_hydrop_1.FIXEDCELLS',
 'CNA_hydrop_2.FIXEDCELLS',
 'CNA_hydrop_3.FIXEDCELLS',
 'CNA_mtscatac_1.FIXEDCELLS',
 'CNA_mtscatac_2.FIXEDCELLS',
 'EPF_hydrop_1.FIXEDCELLS',
 'EPF_hydrop_2.FIXEDCELLS',
 'EPF_hydrop_3.FIXEDCELLS',
 'EPF_hydrop_4.FIXEDCELLS',
 'HAR_ddseq_1.FIXEDCELLS',
 'HAR_ddseq_2.FIXEDCELLS',
 'MDC_mtscatac_1.FIXEDCELLS',
 'MDC_mtscatac_2.FIXEDCELLS',
 'OHS_s3atac_1.FIXEDCELLS',
 'SAN_10xmultiome_1.FIXEDCELLS',
 'SAN_10xmultiome_2.FIXEDCELLS',
 'STA_10xv11_1.FIXEDCELLS',
 'STA_10xv11_2.FIXEDCELLS',
 'VIB_10xmultiome_1.FIXEDCELLS',
 'VIB_10xmultiome_2.FIXEDCELLS',
 'VIB_10xv1_1.FIXEDCELLS',
 'VIB_10xv1_2.FIXEDCELLS',
 'VIB_10xv2_1.FIXEDCELLS',
 'VIB_10xv2_2

<IPython.core.display.Javascript object>

In [27]:
fmx_annot

Unnamed: 0_level_0,fmx_droplet_type,fmx_sample
cell_names,Unnamed: 1_level_1,Unnamed: 2_level_1
CGACATTACATAGGAGTCAA___CNA_hydrop_2.FIXEDCELLS,SNG,sampleB
GGCAACCTCTGAGCTAGTAA___CNA_hydrop_2.FIXEDCELLS,SNG,sampleA
CAACACCATTTCTCGCACGA___CNA_hydrop_2.FIXEDCELLS,SNG,sampleA
TGCATGAGGTTACGGACGGT___CNA_hydrop_2.FIXEDCELLS,SNG,sampleB
ACAGTGAAGATCCAGTGTTC___CNA_hydrop_2.FIXEDCELLS,SNG,sampleA
...,...,...
GTCGTTGAGAGTGACCAGTA___VIB_hydrop_12.FIXEDCELLS,SNG,sampleB
ACCGAAGGCTTTGCAGTTCT___VIB_hydrop_12.FIXEDCELLS,SNG,sampleB
TAGAGCCTGATTGTGTAGGA___VIB_hydrop_12.FIXEDCELLS,SNG,sampleB
CAATTGGAGACCACACGGAT___VIB_hydrop_12.FIXEDCELLS,SNG,sampleB


<IPython.core.display.Javascript object>

In [28]:
fmx_samples = sorted(list(set([x.split('.')[0] + '.' + x.split('.')[1] for x in fmx.index])))
removed_bcs_dict = {}
# for sample in ["CNA_10xv11_1.FULL"]:

for sample in scrubcto_dict.keys():
    print(f"{sample}")
    cto_path = scrubcto_dict[sample]
    newcto_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.singlets.pkl')
    if os.path.exists(newcto_path):
        print(f"\t{newcto_path} exists! Skipping...")
        
    else:
        with open(cto_path, 'rb') as f:
            cto = pickle.load(f)

            print(f"\tLoaded {cto_path}, adding fmx data and removing fmx + scr doublets")
            cto.cell_data['fmx_droplet_type'] = fmx_annot['fmx_droplet_type']
            cto.cell_data['fmx_sample'] = fmx_annot['fmx_sample']
            
            removed_bcs_dict[sample] = {
                'dbl_scrublet': cto.cell_data.barcode[cto.cell_data.Predicted_doublets_fragments == True].tolist(),
                'dbl_fmx': cto.cell_data.barcode[cto.cell_data.fmx_droplet_type == 'DBL'].tolist(),    
            }

            # Remove doublets 
            singlets = cto.cell_data[
                (cto.cell_data.Predicted_doublets_fragments == False) & 
                (cto.cell_data.fmx_droplet_type != 'DBL')
            ].index.tolist()
            print(f"\t{sample}: Removing {len(cto.cell_names)-len(singlets)} cells")

            fmx_doublets = set(cto.cell_data[cto.cell_data['fmx_droplet_type'] == 'DBL'].index)
            scr_doublets = set(cto.cell_data[cto.cell_data['Predicted_doublets_fragments'] == True].index)

            fmx_doublets_unique = fmx_doublets - scr_doublets
            scr_doublets_unique = scr_doublets - fmx_doublets
            common_doublets = fmx_doublets.intersection(scr_doublets)

            print(f"\t\t{len(fmx_doublets_unique)} unique fmx doublets")
            print(f"\t\t{len(scr_doublets_unique)} unique scr doublets")
            print(f"\t\t{len(common_doublets)} common doublets")

            fmx_doublets_unique_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.fmx_doublets_unique.txt')
            with open(fmx_doublets_unique_path, 'w') as f:
                for x in fmx_doublets_unique:
                    f.write(f"{x}\n")

            scr_doublets_unique_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.scr_doublets_unique.txt')
            with open(scr_doublets_unique_path, 'w') as f:
                for x in scr_doublets_unique:
                    f.write(f"{x}\n")

            common_doublets_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.common_doublets.txt')
            with open(common_doublets_path, 'w') as f:
                for x in common_doublets:
                    f.write(f"{x}\n")

            # Subset cisTopic object
            cto.subset(singlets)

            # save
            with open(newcto_path, "wb") as f:
                pickle.dump(cto, f, protocol=4)

            removed_bcs_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.singlets.removed_bcs.pkl')
            with open(removed_bcs_path, "wb") as f:
                pickle.dump(removed_bcs_path, f, protocol=4)

            cell_data_path = cto_path.replace(f'__cto.scrublet{scrub_name_suffix}.pkl',f'__cto.scrublet{scrub_name_suffix}.fmx.singlets.cell_data.tsv')
            cto.cell_data.to_csv(cell_data_path)

            print(f"\tWrote {newcto_path}, doublet lists and {cell_data_path}\n")

master_all_1.FIXEDCELLS
	cistopic_objects_master/master_all_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl exists! Skipping...


<IPython.core.display.Javascript object>

# Write looms from these ctos

In [3]:
import loompy as lp
import glob
import os
import pickle

<IPython.core.display.Javascript object>

In [4]:
scrub_name_suffix = "0-4"
scrubcto_dict = {x.split('/')[-1].split(f'__cto.scrublet{scrub_name_suffix}.pkl')[0]:x for x in sorted(glob.glob(f'cistopic_objects_master/*__cto.scrublet{scrub_name_suffix}.*singlets.pkl'))}
scrubcto_dict

{'master_all_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl': 'cistopic_objects_master/master_all_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl'}

<IPython.core.display.Javascript object>

In [5]:
len(scrubcto_dict)

1

<IPython.core.display.Javascript object>

In [6]:
loom_out = 'cell_region_looms'
if not os.path.exists(os.path.join(loom_out)):
    os.makedirs(os.path.join(loom_out))

<IPython.core.display.Javascript object>

In [16]:
loom_path = 'cell_region_looms/master_all_1.FIXEDCELLS_cto.scrublet0-4.singlets.ID.loom'
if not os.path.exists(loom_path):
    cto_path = 'cistopic_objects_master/master_all_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl'
    with open(cto_path, 'rb') as f:
        cto = pickle.load(f)

    lp.create(
        filename = loom_path,
        layers=cto.fragment_matrix,
        row_attrs={ 'Gene': cto.region_names }, 
        col_attrs={ 'CellID': [x.replace('___', '__').split('.')[0] for x in cto.cell_names] }
    )
    print(f"Finished {loom_path} loom writing")

Finished cell_region_looms/master_all_1.FIXEDCELLS_cto.scrublet0-4.singlets.ID.loom2 loom writing


<IPython.core.display.Javascript object>