# pycisTopic analysis

Full dataset, using SCREEN regions.

In [54]:
import pycisTopic
pycisTopic.__version__

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import pickle
import pandas as pd

import os


import glob
from collections import OrderedDict
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [59]:
# Print the kernel's current working directory via a shell escape.
!pwd

/dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/fixedcells_2_cistopic


<IPython.core.display.Javascript object>

In [60]:
# Absolute project directory; every relative path used in the rest of the
# notebook is resolved against it after the chdir below.
# NOTE(review): hard-coded cluster path - must be edited to run elsewhere.
wdir = '/dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/fixedcells_2_cistopic'
os.chdir(wdir)

<IPython.core.display.Javascript object>

## Create the cisTopic objects for each sample

Make a new output dir

In [None]:
# Directory name (relative to wdir) where the pickled per-sample cisTopic
# objects are written.
cistopic_objects_out = 'cistopic_objects'
# exist_ok=True makes the cell idempotent without a separate exists() check,
# which would be racy if several jobs create the directory at the same time.
os.makedirs(os.path.join(wdir, cistopic_objects_out), exist_ok=True)

Create a dictionary with fragments files for each sample

In [None]:
# Map every sample name to its fragments file. The sample name is the file's
# base name with the '.fragments.tsv.gz' suffix stripped.
filenames = sorted(glob.glob('../1_data_repository/fixedcells_fragments/*.fragments.tsv.gz'))
fragments_dict = {
    fragments_path.rsplit('/', 1)[-1].split('.fragments.tsv.gz')[0]: fragments_path
    for fragments_path in filenames
}
fragments_dict

In [None]:
# Number of samples for which a fragments file was found.
len(fragments_dict)

In [None]:
# Select the samples whose cisTopic object pickle does not exist yet; only
# those are (re)generated by the creation cell further down.
fragments_sub_dict = {}
#regions_sub_dict = {}
for sample in fragments_dict:
    cto = os.path.join(cistopic_objects_out, sample + "__cto.pkl")
    print(f"Checking if {cto} exist...")
    if os.path.exists(cto):
        print(f"\t{cto} exists! Skipping...")
    else:
        print(f"\t{cto} does not exist, adding to subdict to generate")
        fragments_sub_dict[sample] = fragments_dict[sample]

# Per-sample input pickles for the samples selected above. These are built
# once after the scan: the previous version rebuilt both dicts from scratch
# for every missing sample and reused the outer loop variable `sample` in the
# inner loop. The resulting dicts are the same; they are now also defined
# (empty) when nothing is missing.
metadata_bc_sub_dict = {
    sample: f"cistopic_qc_out/{sample}__metadata_bc.pkl"
    for sample in fragments_sub_dict
}
bc_passing_filters_sub_dict = {
    sample: f"selected_barcodes/{sample}_bc_passing_filters_otsu.pkl"
    for sample in fragments_sub_dict
}

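Set the paths to the BED files with the SCREEN regions and the blacklist (blacklisted regions are places in the genome where many reads can map). The files themselves are read when the cisTopic objects are created below.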

In [9]:
# BED file paths, relative to the working directory; both are passed to
# create_cistopic_object_from_fragments in the creation cell.
path_to_regions = '../0_resources/regions/V2.hg38-rDHS-Unfiltered.blacklisted.bed'
path_to_blacklist = '../0_resources/regions/hg38-blacklist.v2.bed'

<IPython.core.display.Javascript object>

Create cistopic objects for each sample. If pandas crashes, increase the number of partitions. This is necessary for the largest files.

In the following command, it is important that the barcode syntax is the same in the fragments file, the metadata and the list of barcodes passing filters. That is why the sample-name suffix (everything from `___` onward) is stripped from the barcodes in the metadata and in the passing-filters list.

In [10]:
# Build and pickle a cisTopic object for every sample selected in
# fragments_sub_dict; print a notice when there is nothing left to do.
if fragments_sub_dict != {}:
    from pycisTopic.cistopic_class import create_cistopic_object_from_fragments
    n_cores = 20
    for sample in fragments_sub_dict.keys():
        # Context managers close the pickle files even if loading fails.
        # NOTE: pickle.load can execute arbitrary code - only load trusted files.
        with open(metadata_bc_sub_dict[sample], 'rb') as infile:
            metadata_bc = pickle.load(infile)
        # Keep only the part of each barcode before "___" so that it matches
        # the barcode syntax used in the fragments file.
        metadata_bc.index = [bc.split("___")[0] for bc in metadata_bc.index]

        with open(bc_passing_filters_sub_dict[sample], 'rb') as infile:
            bc_passing_filters = pickle.load(infile)
        bc_passing_filters_fixed = [bc.split("___")[0] for bc in bc_passing_filters]

        cto = create_cistopic_object_from_fragments(
            path_to_fragments=fragments_sub_dict[sample],
            path_to_regions=path_to_regions,
            path_to_blacklist=path_to_blacklist,
            metrics=metadata_bc,
            valid_bc=bc_passing_filters_fixed,
            n_cpu=n_cores,
            partition=10,  # raise this if pandas crashes on the largest files
            project=sample,
        )

        cto_path = os.path.join(cistopic_objects_out, f"{sample}__cto.pkl")
        print(f"Writing {sample} cto in {cto_path}...")

        with open(cto_path, "wb") as f:
            pickle.dump(cto, f, protocol=4)

else:
    print("All samples already processed.")

All samples already processed.


<IPython.core.display.Javascript object>

# Run Scrublet

In [61]:
import scrublet as scr
import pandas as pd
import matplotlib.pyplot as plt

<IPython.core.display.Javascript object>

In [62]:
# Index the existing cisTopic object pickles by sample name (the file's base
# name with the '__cto.pkl' suffix removed).
cto_dict = {}
for cto_file in sorted(glob.glob('cistopic_objects/*__cto.pkl')):
    sample_name = cto_file.split('/')[-1].split('__cto.pkl')[0]
    cto_dict[sample_name] = cto_file
cto_dict

{'BIO_ddseq_1.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.pkl',
 'BIO_ddseq_2.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.pkl',
 'BIO_ddseq_3.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.pkl',
 'BIO_ddseq_4.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.pkl',
 'BRO_mtscatac_1.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.pkl',
 'BRO_mtscatac_2.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.pkl',
 'CNA_10xmultiome_1.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_1.FIXEDCELLS__cto.pkl',
 'CNA_10xmultiome_2.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_2.FIXEDCELLS__cto.pkl',
 'CNA_10xv11_1.FIXEDCELLS': 'cistopic_objects/CNA_10xv11_1.FIXEDCELLS__cto.pkl',
 'CNA_10xv11_2.FIXEDCELLS': 'cistopic_objects/CNA_10xv11_2.FIXEDCELLS__cto.pkl',
 'CNA_10xv11_3.FIXEDCELLS': 'cistopic_objects/CNA_10xv11_3.FIXEDCELLS__cto.pkl',
 'CNA_10xv11_4.FIXEDCELLS': 'cistopic_objects/CNA_10xv11_4.FIXEDCELLS__cto.pkl',
 'CNA_10

<IPython.core.display.Javascript object>

In [63]:
# Scrublet settings plus a scan for samples that still lack a
# scrublet-annotated cisTopic object; those end up in scrubcto_dict as
# {sample: output pickle path}.
fragments_sub_dict = {}
scrubcto_dict = {}
scrub_threshold = 0.4
scrub_name_suffix = "0-4"
#regions_sub_dict = {}
for sample in cto_dict:
    # `cto` holds the target path (a string) here, not a cisTopic object.
    cto = os.path.join('cistopic_objects', f"{sample}__cto.scrublet{scrub_name_suffix}.pkl")
    print(f"Checking if {cto} exist...")
    if not os.path.exists(cto):
        print(f"\t{cto} does not exist, adding to subdict to generate")
        scrubcto_dict[sample] = cto
        continue
    print(f"\t{cto} exists! Skipping...")

Checking if cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.pkl exist...
	cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.pkl exist...
	cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.pkl exist...
	cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.pkl exist...
	cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.pkl exist...
	cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.pkl exist...
	cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.pkl exists! Skipping...
Chec

<IPython.core.display.Javascript object>

In [64]:
# Run Scrublet on every sample that still lacks a scrublet-annotated cisTopic
# object, store the scores in the object's cell data and pickle the result.
# The guard tests scrubcto_dict (the samples still to do). The previous test
# on cto_dict was true whenever any cisTopic object existed, so the
# "already processed" message could never be printed.
if scrubcto_dict:
    # savefig does not create missing directories.
    os.makedirs('plots_qc', exist_ok=True)
    for sample in scrubcto_dict.keys():
        # NOTE: pickle.load can execute arbitrary code - only load trusted files.
        with open(cto_dict[sample], 'rb') as f:
            cto = pickle.load(f)
        print(f"Loaded {cto_dict[sample]}")
        # The fragment matrix is transposed before it is handed to Scrublet
        # (presumably regions x cells -> cells x regions; confirm).
        scrub = scr.Scrublet(cto.fragment_matrix.T, expected_doublet_rate=0.1)
        doublet_scores, predicted_doublets = scrub.scrub_doublets()
        # Re-call doublets with the fixed notebook-wide threshold.
        scrub.call_doublets(threshold=scrub_threshold)
        fig, axes = scrub.plot_histogram()
        fig.suptitle(sample)
        plt.savefig(f'plots_qc/{sample}.scrublet_histogram.png', dpi=150, facecolor='white')
        plt.show()
        plt.close()

        # One row per cell: Scrublet score and doublet call, indexed by cell name.
        scrublet = pd.DataFrame([scrub.doublet_scores_obs_, scrub.predicted_doublets_],
            columns=cto.cell_names,
            index=['Doublet_scores_fragments', 'Predicted_doublets_fragments']).T
        cto.add_cell_data(scrublet)
        n_scrublet_doublets = len(cto.cell_data[cto.cell_data["Predicted_doublets_fragments"] == True])
        print(f"{sample}: found doublets: \n\t({n_scrublet_doublets}: Scrublet)"
             )

        cto_path_new = scrubcto_dict[sample]

        with open(cto_path_new, "wb") as f:
            pickle.dump(cto, f, protocol=4)

        # UMAP of the Scrublet manifold is disabled; the old 'Running UMAP...'
        # message was removed because nothing is actually run here.
        # scrub.set_embedding('UMAP', scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
        # scrub.plot_embedding('UMAP', order_points=True);
        # plt.savefig(f'plots_qc/{sample}.scrublet_umap.png', dpi=150, facecolor='white')
        # plt.show()
        # plt.close()

        print('\n')

else:
    print("All samples already processed.")

<IPython.core.display.Javascript object>

## Cell filtering

### Read in Freemuxlet doublet predictions

In [65]:
# Load the unified Freemuxlet table (tab-separated).
# NOTE(review): later cells split fmx.index on '.', so the index is presumably
# made of '<sample>.FIXEDCELLS.<n>' strings inferred by read_csv - confirm.
f_fmx = 'out_fmx/genotype_concordance_unified.txt'
fmx = pd.read_csv(f_fmx, sep='\t')
fmx

Unnamed: 0,INT_ID,BARCODE,NUM.SNPS,NUM.READS,DROPLET.TYPE,BEST.GUESS,BEST.LLK,NEXT.GUESS,NEXT.LLK,DIFF.LLK.BEST.NEXT,...,SNG.BEST.LLK,SNG.NEXT.GUESS,SNG.NEXT.LLK,SNG.ONLY.POSTERIOR,DBL.BEST.GUESS,DBL.BEST.LLK,DIFF.LLK.SNG.DBL,ubarcode,replicate,sample
CNA_hydrop_2.FIXEDCELLS.1,0,CGACATTACATAGGAGTCAA,182,182,SNG,11,-324.11,10,-343.05,18.94,...,-324.11,0,-373.28,1.0,10,-343.05,18.94,CNA_hydrop_2.FIXEDCELLS#CGACATTACATAGGAGTCAA,CNA_hydrop_2.FIXEDCELLS,sampleB
CNA_hydrop_2.FIXEDCELLS.2,1,GGCAACCTCTGAGCTAGTAA,209,209,SNG,00,-369.10,10,-390.29,21.19,...,-369.10,1,-426.11,1.0,10,-390.29,21.19,CNA_hydrop_2.FIXEDCELLS#GGCAACCTCTGAGCTAGTAA,CNA_hydrop_2.FIXEDCELLS,sampleA
CNA_hydrop_2.FIXEDCELLS.3,2,CAACACCATTTCTCGCACGA,273,273,SNG,00,-492.20,10,-520.48,28.28,...,-492.20,1,-576.63,1.0,10,-520.48,28.28,CNA_hydrop_2.FIXEDCELLS#CAACACCATTTCTCGCACGA,CNA_hydrop_2.FIXEDCELLS,sampleA
CNA_hydrop_2.FIXEDCELLS.4,3,TGCATGAGGTTACGGACGGT,354,354,SNG,11,-636.69,10,-669.83,33.14,...,-636.69,0,-734.25,1.0,10,-669.83,33.14,CNA_hydrop_2.FIXEDCELLS#TGCATGAGGTTACGGACGGT,CNA_hydrop_2.FIXEDCELLS,sampleB
CNA_hydrop_2.FIXEDCELLS.5,4,ACAGTGAAGATCCAGTGTTC,577,577,SNG,00,-1069.99,10,-1093.26,23.27,...,-1069.99,1,-1173.29,1.0,10,-1093.26,23.27,CNA_hydrop_2.FIXEDCELLS#ACAGTGAAGATCCAGTGTTC,CNA_hydrop_2.FIXEDCELLS,sampleA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VIB_hydrop_12.FIXEDCELLS.1417,1416,GTCGTTGAGAGTGACCAGTA,126,126,SNG,11,-221.86,10,-240.94,19.08,...,-221.86,0,-283.50,1.0,10,-240.94,19.08,VIB_hydrop_12.FIXEDCELLS#GTCGTTGAGAGTGACCAGTA,VIB_hydrop_12.FIXEDCELLS,sampleB
VIB_hydrop_12.FIXEDCELLS.1418,1417,ACCGAAGGCTTTGCAGTTCT,103,103,SNG,11,-185.75,10,-198.81,13.06,...,-185.75,0,-226.20,1.0,10,-198.81,13.06,VIB_hydrop_12.FIXEDCELLS#ACCGAAGGCTTTGCAGTTCT,VIB_hydrop_12.FIXEDCELLS,sampleB
VIB_hydrop_12.FIXEDCELLS.1419,1418,TAGAGCCTGATTGTGTAGGA,115,115,SNG,11,-207.21,10,-219.27,12.06,...,-207.21,0,-244.26,1.0,10,-219.27,12.06,VIB_hydrop_12.FIXEDCELLS#TAGAGCCTGATTGTGTAGGA,VIB_hydrop_12.FIXEDCELLS,sampleB
VIB_hydrop_12.FIXEDCELLS.1420,1419,CAATTGGAGACCACACGGAT,101,101,SNG,11,-177.98,10,-188.29,10.31,...,-177.98,0,-214.34,1.0,10,-188.29,10.31,VIB_hydrop_12.FIXEDCELLS#CAATTGGAGACCACACGGAT,VIB_hydrop_12.FIXEDCELLS,sampleB


<IPython.core.display.Javascript object>

In [66]:
# Distinct Freemuxlet droplet classifications present in the table.
fmx['DROPLET.TYPE'].unique()

array(['SNG', 'DBL', 'AMB'], dtype=object)

<IPython.core.display.Javascript object>

In [67]:
# create an annotation df:
fmx['cell_names'] = fmx['BARCODE'] + "___" + fmx['replicate']

fmx_annot = fmx[['DROPLET.TYPE','sample','cell_names']].copy().set_index('cell_names')
fmx_annot.columns = ['fmx_droplet_type','fmx_sample']
fmx_annot

Unnamed: 0_level_0,fmx_droplet_type,fmx_sample
cell_names,Unnamed: 1_level_1,Unnamed: 2_level_1
CGACATTACATAGGAGTCAA___CNA_hydrop_2.FIXEDCELLS,SNG,sampleB
GGCAACCTCTGAGCTAGTAA___CNA_hydrop_2.FIXEDCELLS,SNG,sampleA
CAACACCATTTCTCGCACGA___CNA_hydrop_2.FIXEDCELLS,SNG,sampleA
TGCATGAGGTTACGGACGGT___CNA_hydrop_2.FIXEDCELLS,SNG,sampleB
ACAGTGAAGATCCAGTGTTC___CNA_hydrop_2.FIXEDCELLS,SNG,sampleA
...,...,...
GTCGTTGAGAGTGACCAGTA___VIB_hydrop_12.FIXEDCELLS,SNG,sampleB
ACCGAAGGCTTTGCAGTTCT___VIB_hydrop_12.FIXEDCELLS,SNG,sampleB
TAGAGCCTGATTGTGTAGGA___VIB_hydrop_12.FIXEDCELLS,SNG,sampleB
CAATTGGAGACCACACGGAT___VIB_hydrop_12.FIXEDCELLS,SNG,sampleB


<IPython.core.display.Javascript object>

In [68]:
# Samples covered by Freemuxlet: the first two dot-separated fields of every
# row label in fmx.index, de-duplicated and sorted.
fmx_sample_set = set()
for row_label in fmx.index:
    label_parts = row_label.split('.')
    fmx_sample_set.add(label_parts[0] + '.' + label_parts[1])
fmx_samples = sorted(fmx_sample_set)
len(fmx_samples)

39

<IPython.core.display.Javascript object>

In [69]:
# Show the samples that have Freemuxlet predictions.
fmx_samples

['BRO_mtscatac_1.FIXEDCELLS',
 'BRO_mtscatac_2.FIXEDCELLS',
 'CNA_10xmultiome_1.FIXEDCELLS',
 'CNA_10xmultiome_2.FIXEDCELLS',
 'CNA_10xv11_1.FIXEDCELLS',
 'CNA_10xv11_2.FIXEDCELLS',
 'CNA_10xv11_3.FIXEDCELLS',
 'CNA_10xv11_4.FIXEDCELLS',
 'CNA_10xv11_5.FIXEDCELLS',
 'CNA_10xv2_1.FIXEDCELLS',
 'CNA_10xv2_2.FIXEDCELLS',
 'CNA_hydrop_1.FIXEDCELLS',
 'CNA_hydrop_2.FIXEDCELLS',
 'CNA_hydrop_3.FIXEDCELLS',
 'CNA_mtscatac_1.FIXEDCELLS',
 'CNA_mtscatac_2.FIXEDCELLS',
 'EPF_hydrop_1.FIXEDCELLS',
 'EPF_hydrop_2.FIXEDCELLS',
 'EPF_hydrop_3.FIXEDCELLS',
 'EPF_hydrop_4.FIXEDCELLS',
 'HAR_ddseq_1.FIXEDCELLS',
 'HAR_ddseq_2.FIXEDCELLS',
 'MDC_mtscatac_1.FIXEDCELLS',
 'MDC_mtscatac_2.FIXEDCELLS',
 'OHS_s3atac_1.FIXEDCELLS',
 'SAN_10xmultiome_1.FIXEDCELLS',
 'SAN_10xmultiome_2.FIXEDCELLS',
 'STA_10xv11_1.FIXEDCELLS',
 'STA_10xv11_2.FIXEDCELLS',
 'VIB_10xmultiome_1.FIXEDCELLS',
 'VIB_10xmultiome_2.FIXEDCELLS',
 'VIB_10xv1_1.FIXEDCELLS',
 'VIB_10xv1_2.FIXEDCELLS',
 'VIB_10xv2_1.FIXEDCELLS',
 'VIB_10xv2_2

<IPython.core.display.Javascript object>

# Read scrublet ctos

In [70]:
# Re-index all scrublet-annotated cisTopic objects on disk by sample name.
# This replaces the earlier "still to generate" scrubcto_dict.
scrub_pkl_suffix = f'__cto.scrublet{scrub_name_suffix}.pkl'
scrubcto_dict = {}
for scrub_pkl in sorted(glob.glob(f'cistopic_objects/*__cto.scrublet{scrub_name_suffix}.pkl')):
    scrubcto_dict[scrub_pkl.split('/')[-1].split(scrub_pkl_suffix)[0]] = scrub_pkl
scrubcto_dict

{'BIO_ddseq_1.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.pkl',
 'BIO_ddseq_2.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.pkl',
 'BIO_ddseq_3.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.pkl',
 'BIO_ddseq_4.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.pkl',
 'BRO_mtscatac_1.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.pkl',
 'BRO_mtscatac_2.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.pkl',
 'CNA_10xmultiome_1.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_1.FIXEDCELLS__cto.scrublet0-4.pkl',
 'CNA_10xmultiome_2.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_2.FIXEDCELLS__cto.scrublet0-4.pkl',
 'CNA_10xv11_1.FIXEDCELLS': 'cistopic_objects/CNA_10xv11_1.FIXEDCELLS__cto.scrublet0-4.pkl',
 'CNA_10xv11_2.FIXEDCELLS': 'cistopic_objects/CNA_10xv11_2.FIXEDCELLS__cto.scrublet0-4.pkl',
 'CNA_10xv11_3.FIXEDCELLS': 'cistopic_objects/CNA_

<IPython.core.display.Javascript object>

In [None]:
def _scrublet_sibling_path(cto_path, new_suffix):
    """Return `cto_path` with its '.pkl' scrublet suffix swapped for `new_suffix`."""
    return cto_path.replace(
        f'__cto.scrublet{scrub_name_suffix}.pkl',
        f'__cto.scrublet{scrub_name_suffix}{new_suffix}',
    )


def _write_barcode_list(path, barcodes):
    """Write every item of `barcodes` to `path`, one per line."""
    with open(path, 'w') as f:
        for x in barcodes:
            f.write(f"{x}\n")


# For every scrublet-annotated cisTopic object: add the Freemuxlet annotation
# when the sample has one, drop predicted doublets, and write the singlet
# object together with the doublet lists and the cell metadata.
fmx_samples = sorted(list(set([x.split('.')[0] + '.' + x.split('.')[1] for x in fmx.index])))
removed_bcs_dict = {}

for sample in scrubcto_dict.keys():
    print(f"{sample}")
    cto_path = scrubcto_dict[sample]
    newcto_path = _scrublet_sibling_path(cto_path, '.fmx.singlets.pkl')
    if os.path.exists(newcto_path):
        print(f"\t{newcto_path} exists! Skipping...")
        continue

    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(cto_path, 'rb') as f:
        cto = pickle.load(f)

    if sample in fmx_samples:
        print(f"\tLoaded {cto_path}, adding fmx data and removing fmx + scr doublets")
        # NOTE(review): .loc raises KeyError if any cell of the object is
        # missing from fmx_annot.
        cto.add_cell_data(fmx_annot.loc[cto.cell_data.index])

        removed_bcs_dict[sample] = {
            'dbl_scrublet': cto.cell_data.barcode[cto.cell_data.Predicted_doublets_fragments == True].tolist(),
            'dbl_fmx': cto.cell_data.barcode[cto.cell_data.fmx_droplet_type == 'DBL'].tolist(),
        }

        # Keep cells that neither Scrublet nor Freemuxlet calls a doublet.
        singlets = cto.cell_data[
            (cto.cell_data.Predicted_doublets_fragments == False) &
            (cto.cell_data.fmx_droplet_type != 'DBL')
        ].index.tolist()
        print(f"\t{sample}: Removing {len(cto.cell_names)-len(singlets)} cells")

        fmx_doublets = set(cto.cell_data[cto.cell_data['fmx_droplet_type'] == 'DBL'].index)
        scr_doublets = set(cto.cell_data[cto.cell_data['Predicted_doublets_fragments'] == True].index)

        fmx_doublets_unique = fmx_doublets - scr_doublets
        scr_doublets_unique = scr_doublets - fmx_doublets
        common_doublets = fmx_doublets.intersection(scr_doublets)

        print(f"\t\t{len(fmx_doublets_unique)} unique fmx doublets")
        print(f"\t\t{len(scr_doublets_unique)} unique scr doublets")
        print(f"\t\t{len(common_doublets)} common doublets")

        fmx_doublets_unique_path = _scrublet_sibling_path(cto_path, '.fmx.fmx_doublets_unique.txt')
        _write_barcode_list(fmx_doublets_unique_path, fmx_doublets_unique)

        scr_doublets_unique_path = _scrublet_sibling_path(cto_path, '.fmx.scr_doublets_unique.txt')
        _write_barcode_list(scr_doublets_unique_path, scr_doublets_unique)

        common_doublets_path = _scrublet_sibling_path(cto_path, '.fmx.common_doublets.txt')
        _write_barcode_list(common_doublets_path, common_doublets)

    else:
        print(f"\tLoaded {cto_path}, removing scr doublets")
        removed_bcs_dict[sample] = {
            'dbl_scrublet': cto.cell_data.barcode[cto.cell_data.Predicted_doublets_fragments == True].tolist()
        }

        # Without Freemuxlet data only the Scrublet calls are used.
        singlets = cto.cell_data[
            (cto.cell_data.Predicted_doublets_fragments == False)
        ].index.tolist()
        print(f"\t{sample}: Removing {len(cto.cell_names)-len(singlets)} cells")

        scr_doublets = set(cto.cell_data[cto.cell_data['Predicted_doublets_fragments'] == True].index)
        print(f"\t\t{len(scr_doublets)} scr doublets")

        scr_doublets_unique_path = _scrublet_sibling_path(cto_path, '.fmx.scr_doublets_unique.txt')
        _write_barcode_list(scr_doublets_unique_path, scr_doublets)

    # Subset cisTopic object
    cto.subset(singlets)

    # save
    with open(newcto_path, "wb") as f:
        pickle.dump(cto, f, protocol=4)

    removed_bcs_path = _scrublet_sibling_path(cto_path, '.fmx.singlets.removed_bcs.pkl')
    with open(removed_bcs_path, "wb") as f:
        pickle.dump(removed_bcs_dict[sample], f, protocol=4)

    # NOTE(review): to_csv defaults to a comma separator, so this '.tsv' file
    # is actually comma-separated. Left unchanged because downstream readers
    # may rely on it.
    cell_data_path = _scrublet_sibling_path(cto_path, '.fmx.singlets.cell_data.tsv')
    cto.cell_data.to_csv(cell_data_path)

    print(f"\tWrote {newcto_path}, doublet lists and {cell_data_path}\n")

BIO_ddseq_1.FIXEDCELLS
	cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl exists! Skipping...
BIO_ddseq_2.FIXEDCELLS
	cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl exists! Skipping...
BIO_ddseq_3.FIXEDCELLS
	cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl exists! Skipping...
BIO_ddseq_4.FIXEDCELLS
	cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl exists! Skipping...
BRO_mtscatac_1.FIXEDCELLS
	cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl exists! Skipping...
BRO_mtscatac_2.FIXEDCELLS
	cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl exists! Skipping...
CNA_10xmultiome_1.FIXEDCELLS
	cistopic_objects/CNA_10xmultiome_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl exists! Skipping...
CNA_10xmultiome_2.FIXEDCELLS
	cistopic_objects/CNA_10xmultiome_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl exists! Skipping...
CNA_10xv11_1.FIXEDCE

<IPython.core.display.Javascript object>

# Merge VIB_hydrop_samples

In [None]:
from pycisTopic.cistopic_class import *

<IPython.core.display.Javascript object>

In [23]:
# Merge the two VIB_hydrop_2x singlet objects into a single VIB_hydrop_2
# object. `merge` comes from the pycisTopic.cistopic_class star import.
paths_list = [
    'cistopic_objects/VIB_hydrop_21.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
    'cistopic_objects/VIB_hydrop_22.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
]
cto_path_new = 'cistopic_objects/VIB_hydrop_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl'

cto_list = []
for pkl_path in paths_list:
    print(pkl_path)
    with open(pkl_path, 'rb') as handle:
        cto = pickle.load(handle)
    cto_list += [cto]

cto_merged = merge(cto_list)
with open(cto_path_new, "wb") as handle:
    pickle.dump(cto_merged, handle, protocol=4)

cistopic_objects/VIB_hydrop_21.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl
cistopic_objects/VIB_hydrop_22.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl
2022-09-13 17:56:36,546 cisTopic     INFO     cisTopic object 1 merged


<IPython.core.display.Javascript object>

In [24]:
# Merge the two VIB_hydrop_1x singlet objects into a single VIB_hydrop_1
# object. `merge` comes from the pycisTopic.cistopic_class star import.
paths_list = [
    'cistopic_objects/VIB_hydrop_11.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
    'cistopic_objects/VIB_hydrop_12.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
]
cto_path_new = 'cistopic_objects/VIB_hydrop_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl'

cto_list = []
for pkl_path in paths_list:
    print(pkl_path)
    with open(pkl_path, 'rb') as handle:
        cto = pickle.load(handle)
    cto_list += [cto]

cto_merged = merge(cto_list)
with open(cto_path_new, "wb") as handle:
    pickle.dump(cto_merged, handle, protocol=4)

cistopic_objects/VIB_hydrop_11.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl
cistopic_objects/VIB_hydrop_12.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl
2022-09-13 17:57:19,272 cisTopic     INFO     cisTopic object 1 merged


<IPython.core.display.Javascript object>

In [25]:
# Rewrite the cell names of the merged VIB_hydrop_1 object in place on disk.
# New name = <text before the first '__'> + '-' + <second character after the
# last 'VIB_hydrop_'> + '___' + <second '__'-separated field of the old name>.
# NOTE(review): not idempotent - running this cell twice renames the cells
# again. Only cto.cell_names is assigned here; confirm whether cell_data needs
# the same renaming.
cto_path = 'cistopic_objects/VIB_hydrop_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl'
with open(cto_path, 'rb') as f:
    cto = pickle.load(f)

renamed_cells = []
for old_name in cto.cell_names:
    name_fields = old_name.split('__')
    run_digit = old_name.split('VIB_hydrop_')[-1][1]
    renamed_cells.append(name_fields[0] + '-' + run_digit + "___" + name_fields[1])
cto.cell_names = renamed_cells

with open(cto_path, "wb") as f:
    pickle.dump(cto, f, protocol=4)

<IPython.core.display.Javascript object>

In [None]:
# Rewrite the cell names of the merged VIB_hydrop_2 object in place on disk.
# New name = <text before the first '__'> + '-' + <second character after the
# last 'VIB_hydrop_'> + '___' + <second '__'-separated field of the old name>.
# NOTE(review): not idempotent - running this cell twice renames the cells
# again. Only cto.cell_names is assigned here; confirm whether cell_data needs
# the same renaming.
cto_path = 'cistopic_objects/VIB_hydrop_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl'
with open(cto_path, 'rb') as f:
    cto = pickle.load(f)

renamed_cells = []
for old_name in cto.cell_names:
    name_fields = old_name.split('__')
    run_digit = old_name.split('VIB_hydrop_')[-1][1]
    renamed_cells.append(name_fields[0] + '-' + run_digit + "___" + name_fields[1])
cto.cell_names = renamed_cells

with open(cto_path, "wb") as f:
    pickle.dump(cto, f, protocol=4)

# Write looms from these ctos

In [None]:
import loompy as lp

<IPython.core.display.Javascript object>

In [None]:
# Index every singlet-filtered cisTopic object by sample name (the part of the
# file's base name before the first '__').
scrub_name_suffix = "0-4"
singlet_cto_dict = {}
for singlet_pkl in sorted(glob.glob(f'cistopic_objects/*__cto.scrublet{scrub_name_suffix}*singlets.pkl')):
    singlet_cto_dict[singlet_pkl.split('/')[-1].split('__')[0]] = singlet_pkl
singlet_cto_dict

{'BIO_ddseq_1.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BIO_ddseq_2.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BIO_ddseq_3.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BIO_ddseq_4.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BRO_mtscatac_1.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BRO_mtscatac_2.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'CNA_10xmultiome_1.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'CNA_10xmultiome_2.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'CNA_10xv11_1.FIXEDCELLS': 'cistopic_objects/CNA_10xv11_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'CNA_10xv11_2.FIXEDCELLS'

<IPython.core.display.Javascript object>

In [None]:
# Number of singlet-filtered cisTopic objects found on disk.
len(singlet_cto_dict)

51

<IPython.core.display.Javascript object>

In [None]:
# Output directory (relative to wdir) for the cell-by-region loom files.
loom_out = 'cell_region_looms'
# exist_ok=True makes the cell idempotent without a separate, racy
# exists() check.
os.makedirs(os.path.join(wdir, loom_out), exist_ok=True)

<IPython.core.display.Javascript object>

In [None]:
# Write one shell command per missing loom file to `parallel_file`; each
# command runs create_loom.py inside the singularity image.
mounts = "/lustre1,/staging,/data,/vsc-hard-mounts,/scratch"
# mounts = "/dodrio,/readonly/dodrio,/tmp"
sif = "../0_resources/cistopic_image/20220815_pycistopic.sif"
script = "../0_resources/scripts/create_loom.py"
loom_dir = "cell_region_looms"
parallel_file = 'write_looms.parallel'
with open(parallel_file, 'w') as file:

    for sample, infile in singlet_cto_dict.items():
        # Same file name, '.loom' extension, under cell_region_looms/.
        outfile = infile.replace('.pkl', '.loom').replace('cistopic_objects','cell_region_looms')

        if os.path.exists(outfile):
            print(f'{outfile} already exists!')
            continue

        cmd = f"echo {sample} && cd {wdir} && singularity exec -B {mounts} {sif} python {script} -i {infile} -o {outfile}"
        print(cmd)
        file.write(cmd + '\n')

cell_region_looms/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists!
cell_region_looms/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists!
cell_region_looms/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists!
cell_region_looms/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists!
cell_region_looms/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists!
cell_region_looms/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists!
cell_region_looms/CNA_10xmultiome_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists!
cell_region_looms/CNA_10xmultiome_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists!
cell_region_looms/CNA_10xv11_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists!
cell_region_looms/CNA_10xv11_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists!
cell_region_looms/CNA_10xv11_3.FIXEDCELLS__cto.scrublet0-4.f

<IPython.core.display.Javascript object>

In [None]:
# Output directory for the loom files, relative to the current working
# directory. exist_ok=True replaces the exists() check, and the no-op
# single-argument os.path.join was dropped.
loom_out = 'cell_region_looms'
os.makedirs(loom_out, exist_ok=True)

<IPython.core.display.Javascript object>

In [79]:
import loompy as lp
# Write a loom file per singlet cisTopic object (skipping ones that already
# exist): fragment matrix as the layer, region names as row attribute 'Gene',
# and the part of each cell name before the first '__' as 'CellID'.
cto_path_sub_dict = {}
for sample, cto_path in singlet_cto_dict.items():
    loom_name = cto_path.split('/')[-1].replace('.pkl', '.loom')
    loom_path = os.path.join(loom_out, loom_name)
    if os.path.exists(loom_path):
        print(f'{loom_path} already exists, skipping...')
        continue

    with open(cto_path, 'rb') as f:
        cto = pickle.load(f)

    print(f"Loaded filtered cistopic object {cto_path}")
    cell_ids = [x.split('__')[0] for x in cto.cell_names]
    lp.create(
        filename=loom_path,
        layers=cto.fragment_matrix,
        row_attrs={'Gene': cto.region_names},
        col_attrs={'CellID': cell_ids},
    )

    print(f"\tFinished {loom_path} loom writing")

cell_region_looms/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists, skipping...
cell_region_looms/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists, skipping...
cell_region_looms/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists, skipping...
cell_region_looms/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists, skipping...
cell_region_looms/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists, skipping...
cell_region_looms/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists, skipping...
cell_region_looms/CNA_10xmultiome_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists, skipping...
cell_region_looms/CNA_10xmultiome_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists, skipping...
cell_region_looms/CNA_10xv11_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.loom already exists, skipping...
cell_region_looms/CNA_10xv11_2.FIXEDCELLS__ct

<IPython.core.display.Javascript object>

In [80]:
# Show the source of the helper script that write_looms.parallel invokes.
!cat ../0_resources/scripts/create_loom.py

import loompy as lp
import pickle
import os
import argparse

def make_argument_parser():
    """
    Creates an ArgumentParser to read the options for this script from
    sys.argv
    """
    parser = argparse.ArgumentParser(
        description="Write loom",)
    parser.add_argument('--input_cto', '-i', type=str, required=True,
                        help='Path to cisTopic object pickle file.')

    parser.add_argument('--output_loom', '-o', type=str, required=True,
                    help='Path to out loom.')

    return parser


def main():
    """
    The main executable function
    """

    parser = make_argument_parser()
    args = parser.parse_args()


    cto_path = args.input_cto
    print('input file:', cto_path)

    loom_path = args.output_loom
    print('output file:', loom_path)
    
    if not os.path.exists(loom_path):
        with open(cto_path, 'rb') as f:
            cto = pickle.load(f)

        print(f"Loaded filtered cistopic object {cto_path}")
        lp.cre

<IPython.core.display.Javascript object>