# pycisTopic analysis

Downsampled dataset, using CONSENSUS peaks.

In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import pickle
import os
import glob
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Shell escape: print the directory the kernel is currently running in.
!pwd

/lustre1/project/stg_00090/scatac_benchmark/fixedcells_3_cistopic_consensus


<IPython.core.display.Javascript object>

In [3]:
# Move the kernel into the analysis directory; later cells use paths relative to it.
wdir = "/lustre1/project/stg_00090/scatac_benchmark/fixedcells_3_cistopic_consensus"
os.chdir(wdir)

<IPython.core.display.Javascript object>

## Create the cisTopic objects for each sample

Make a new output dir

In [4]:
# Directory (inside wdir) that receives the pickled cisTopic objects.
cistopic_objects_out = 'cistopic_objects'
# exist_ok=True keeps the cell re-runnable and avoids the separate exists() check.
os.makedirs(os.path.join(wdir, cistopic_objects_out), exist_ok=True)

<IPython.core.display.Javascript object>

Create a dictionary with fragments files for each sample

In [5]:
# Map each sample name (file name without the '.fragments.tsv.gz' suffix) to its fragments file.
filenames = sorted(glob.glob('../1_data_repository/fixedcells_fragments/*.fragments.tsv.gz'))
fragments_dict = {
    path.split('/')[-1].split('.fragments.tsv.gz')[0]: path
    for path in filenames
}
fragments_dict

{'BIO_ddseq_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_1.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_2.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_3.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_3.FIXEDCELLS.fragments.tsv.gz',
 'BIO_ddseq_4.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BIO_ddseq_4.FIXEDCELLS.fragments.tsv.gz',
 'BRO_mtscatac_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BRO_mtscatac_1.FIXEDCELLS.fragments.tsv.gz',
 'BRO_mtscatac_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/BRO_mtscatac_2.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xmultiome_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xmultiome_1.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xmultiome_2.FIXEDCELLS': '../1_data_repository/fixedcells_fragments/CNA_10xmultiome_2.FIXEDCELLS.fragments.tsv.gz',
 'CNA_10xv11_1.FIXEDCELLS': '../1_data_repository/fixedcells_fragmen

<IPython.core.display.Javascript object>

In [6]:
# Per-sample barcode-metadata pickles, keyed by the file-name part before the first '__'.
metadata_bc_sub_dict = {}
for metadata_path in sorted(glob.glob("cistopic_qc_out_CONSENSUS/*metadata_bc.pkl")):
    metadata_bc_sub_dict[metadata_path.split('/')[-1].split('__')[0]] = metadata_path
metadata_bc_sub_dict

{'BIO_ddseq_1.FIXEDCELLS': 'cistopic_qc_out_CONSENSUS/BIO_ddseq_1.FIXEDCELLS__metadata_bc.pkl',
 'BIO_ddseq_2.FIXEDCELLS': 'cistopic_qc_out_CONSENSUS/BIO_ddseq_2.FIXEDCELLS__metadata_bc.pkl',
 'BIO_ddseq_3.FIXEDCELLS': 'cistopic_qc_out_CONSENSUS/BIO_ddseq_3.FIXEDCELLS__metadata_bc.pkl',
 'BIO_ddseq_4.FIXEDCELLS': 'cistopic_qc_out_CONSENSUS/BIO_ddseq_4.FIXEDCELLS__metadata_bc.pkl',
 'BRO_mtscatac_1.FIXEDCELLS': 'cistopic_qc_out_CONSENSUS/BRO_mtscatac_1.FIXEDCELLS__metadata_bc.pkl',
 'BRO_mtscatac_2.FIXEDCELLS': 'cistopic_qc_out_CONSENSUS/BRO_mtscatac_2.FIXEDCELLS__metadata_bc.pkl',
 'CNA_10xmultiome_1.FIXEDCELLS': 'cistopic_qc_out_CONSENSUS/CNA_10xmultiome_1.FIXEDCELLS__metadata_bc.pkl',
 'CNA_10xmultiome_2.FIXEDCELLS': 'cistopic_qc_out_CONSENSUS/CNA_10xmultiome_2.FIXEDCELLS__metadata_bc.pkl',
 'CNA_10xv11_1.FIXEDCELLS': 'cistopic_qc_out_CONSENSUS/CNA_10xv11_1.FIXEDCELLS__metadata_bc.pkl',
 'CNA_10xv11_2.FIXEDCELLS': 'cistopic_qc_out_CONSENSUS/CNA_10xv11_2.FIXEDCELLS__metadata_bc.pkl',


<IPython.core.display.Javascript object>

In [7]:
# Drop the VIB_hydrop_1 / VIB_hydrop_2 entries so they are not processed below.
# NOTE(review): presumably because these runs are handled as the sub-samples
# VIB_hydrop_11/12/21/22 (see the region paths added further down) — confirm.
# pop() is called without a default, so re-running this cell without rebuilding
# the dict raises KeyError.
metadata_bc_sub_dict.pop('VIB_hydrop_1.FIXEDCELLS')
metadata_bc_sub_dict.pop('VIB_hydrop_2.FIXEDCELLS')

'cistopic_qc_out_CONSENSUS/VIB_hydrop_2.FIXEDCELLS__metadata_bc.pkl'

<IPython.core.display.Javascript object>

In [8]:
# Per-sample pickles of barcodes passing the Otsu filters, keyed by sample name.
bc_passing_filters_sub_dict = {}
for barcodes_path in sorted(glob.glob("selected_barcodes/*otsu.pkl")):
    sample_name = barcodes_path.split('/')[-1].split('_bc_passing_filters_otsu.pkl')[0]
    bc_passing_filters_sub_dict[sample_name] = barcodes_path
bc_passing_filters_sub_dict

{'BIO_ddseq_1.FIXEDCELLS': 'selected_barcodes/BIO_ddseq_1.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_2.FIXEDCELLS': 'selected_barcodes/BIO_ddseq_2.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_3.FIXEDCELLS': 'selected_barcodes/BIO_ddseq_3.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'BIO_ddseq_4.FIXEDCELLS': 'selected_barcodes/BIO_ddseq_4.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'BRO_mtscatac_1.FIXEDCELLS': 'selected_barcodes/BRO_mtscatac_1.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'BRO_mtscatac_2.FIXEDCELLS': 'selected_barcodes/BRO_mtscatac_2.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'CNA_10xmultiome_1.FIXEDCELLS': 'selected_barcodes/CNA_10xmultiome_1.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'CNA_10xmultiome_2.FIXEDCELLS': 'selected_barcodes/CNA_10xmultiome_2.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'CNA_10xv11_1.FIXEDCELLS': 'selected_barcodes/CNA_10xv11_1.FIXEDCELLS_bc_passing_filters_otsu.pkl',
 'CNA_10xv11_2.FIXEDCELLS': 'selected_barcodes/CNA_10xv11_2.FIXEDCELLS_

<IPython.core.display.Javascript object>

Collect the paths of the per-sample SCREEN consensus-peak BED files and of the blacklist (regions of the genome where many reads can map).

In [9]:
# Per-sample consensus-peak BED files, keyed by the file-name part before the first '__'.
regions_paths_dict = {}
for bed_path in sorted(glob.glob("SCREEN_peaks/*consensus_peaks.bed")):
    regions_paths_dict[bed_path.split("/")[-1].split("__")[0]] = bed_path
regions_paths_dict

{'BIO_ddseq_1.FIXEDCELLS': 'SCREEN_peaks/BIO_ddseq_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_2.FIXEDCELLS': 'SCREEN_peaks/BIO_ddseq_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_3.FIXEDCELLS': 'SCREEN_peaks/BIO_ddseq_3.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BIO_ddseq_4.FIXEDCELLS': 'SCREEN_peaks/BIO_ddseq_4.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_1.FIXEDCELLS': 'SCREEN_peaks/BRO_mtscatac_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'BRO_mtscatac_2.FIXEDCELLS': 'SCREEN_peaks/BRO_mtscatac_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_1.FIXEDCELLS': 'SCREEN_peaks/CNA_10xmultiome_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xmultiome_2.FIXEDCELLS': 'SCREEN_peaks/CNA_10xmultiome_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_1.FIXEDCELLS': 'SCREEN_peaks/CNA_10xv11_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_2.FIXEDCELLS': 'SCREEN_peaks/CNA_10xv11_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
 'CNA_10xv11_3.FIXED

<IPython.core.display.Javascript object>

In [10]:
# The hydrop sub-samples (11/12 and 21/22) reuse the peak file of their parent run (1 and 2).
regions_paths_dict.update(
    {
        'VIB_hydrop_11.FIXEDCELLS': 'SCREEN_peaks/VIB_hydrop_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
        'VIB_hydrop_12.FIXEDCELLS': 'SCREEN_peaks/VIB_hydrop_1.FIXEDCELLS__SCREEN_consensus_peaks.bed',
        'VIB_hydrop_21.FIXEDCELLS': 'SCREEN_peaks/VIB_hydrop_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
        'VIB_hydrop_22.FIXEDCELLS': 'SCREEN_peaks/VIB_hydrop_2.FIXEDCELLS__SCREEN_consensus_peaks.bed',
    }
)

<IPython.core.display.Javascript object>

In [11]:
# Disabled alternative: one shared region file. The object-creation cell below
# takes its regions per sample from regions_paths_dict instead.
# path_to_regions = '../0_resources/regions/V2.hg38-rDHS-Unfiltered.blacklisted.bed'
# Blacklist BED file, passed as path_to_blacklist when the cisTopic objects are created.
path_to_blacklist = '../0_resources/regions/hg38-blacklist.v2.bed'

<IPython.core.display.Javascript object>

Create cistopic objects for each sample. If pandas crashes, increase the number of partitions. This is necessary for the largest files.

In the following command, the barcodes must have the same format in the fragments file, the metadata and the list of barcodes passing filters. That is why the `___<sample>` suffix is stripped from the metadata index and from the barcodes passing filters before the object is created.

In [12]:
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments

<IPython.core.display.Javascript object>

In [13]:
import ray
# Shut down any Ray instance still attached to this kernel.
# NOTE(review): presumably so that the object-creation cell below can start a
# fresh Ray session with n_cpu workers — confirm.
ray.shutdown()

<IPython.core.display.Javascript object>

In [14]:
# Build one cisTopic object per sample in metadata_bc_sub_dict and pickle it into
# cistopic_objects_out. Samples whose pickle already exists are skipped, so the
# cell can be re-run after an interruption.
# NOTE(review): the outer guard only tests whether any region files were found, so
# the "All samples already processed." message appears only when
# regions_paths_dict is empty — confirm this is the intended condition.
if regions_paths_dict != {}:
    n_cores = 20
    for sample in metadata_bc_sub_dict.keys():
        cto_path = os.path.join(cistopic_objects_out, f"{sample}__cto.pkl")
        if os.path.exists(cto_path):
            print(f"{cto_path} already exists, skipping...")
            continue

        print(f"{cto_path} does not exist, generating...")

        # Context managers close the input pickles even when loading fails.
        with open(metadata_bc_sub_dict[sample], 'rb') as infile:
            metadata_bc = pickle.load(infile)
        # Keep only the part before '___' so the barcodes match the fragments file.
        metadata_bc.index = [bc.split("___")[0] for bc in metadata_bc.index]

        with open(bc_passing_filters_sub_dict[sample], 'rb') as infile:
            bc_passing_filters = pickle.load(infile)
        bc_passing_filters_fixed = [bc.split("___")[0] for bc in bc_passing_filters]

        cto = create_cistopic_object_from_fragments(path_to_fragments=fragments_dict[sample],
                                                    path_to_regions=regions_paths_dict[sample],
                                                    path_to_blacklist=path_to_blacklist,
                                                    metrics=metadata_bc,
                                                    valid_bc=bc_passing_filters_fixed,
                                                    n_cpu=n_cores,
                                                    partition=10,
                                                    project=sample)

        print(f"Writing {sample} cto in {cto_path}...")
        with open(cto_path, "wb") as f:
            pickle.dump(cto, f, protocol=4)

else:
    print("All samples already processed.")

cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xmultiome_1.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xmultiome_2.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_1.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_2.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_3.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_4.FIXEDCELLS__cto.pkl already exists, skipping...
cistopic_objects/CNA_10xv11_5.FIXEDCELLS__cto.pkl alre

<IPython.core.display.Javascript object>

# Run Scrublet

In [15]:
import scrublet as scr
import pandas as pd
import matplotlib.pyplot as plt

<IPython.core.display.Javascript object>

In [16]:
# Index the pickled cisTopic objects on disk by sample name (file name minus '__cto.pkl').
cto_dict = {}
for cto_pickle in sorted(glob.glob('cistopic_objects/*__cto.pkl')):
    cto_dict[cto_pickle.split('/')[-1].split('__cto.pkl')[0]] = cto_pickle
cto_dict

{'BIO_ddseq_1.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.pkl',
 'BIO_ddseq_2.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.pkl',
 'BIO_ddseq_3.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.pkl',
 'BIO_ddseq_4.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.pkl',
 'BRO_mtscatac_1.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.pkl',
 'BRO_mtscatac_2.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.pkl',
 'CNA_10xmultiome_1.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_1.FIXEDCELLS__cto.pkl',
 'CNA_10xmultiome_2.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_2.FIXEDCELLS__cto.pkl',
 'CNA_10xv11_1.FIXEDCELLS': 'cistopic_objects/CNA_10xv11_1.FIXEDCELLS__cto.pkl',
 'CNA_10xv11_2.FIXEDCELLS': 'cistopic_objects/CNA_10xv11_2.FIXEDCELLS__cto.pkl',
 'CNA_10xv11_3.FIXEDCELLS': 'cistopic_objects/CNA_10xv11_3.FIXEDCELLS__cto.pkl',
 'CNA_10xv11_4.FIXEDCELLS': 'cistopic_objects/CNA_10xv11_4.FIXEDCELLS__cto.pkl',
 'CNA_10

<IPython.core.display.Javascript object>

In [17]:
# Collect the samples that still need a Scrublet run: those whose
# '<sample>__cto.scrublet<suffix>.pkl' output is not on disk yet.
fragments_sub_dict = {}
scrubcto_dict = {}
scrub_threshold = 0.4
# Tag used in output file names. NOTE(review): presumably encodes scrub_threshold — keep in sync.
scrub_name_suffix = "0-4"
for sample in cto_dict:
    scrub_cto_path = os.path.join('cistopic_objects', f"{sample}__cto.scrublet{scrub_name_suffix}.pkl")
    print(f"Checking if {scrub_cto_path} exist...")
    if not os.path.exists(scrub_cto_path):
        print(f"\t{scrub_cto_path} does not exist, adding to subdict to generate")
        scrubcto_dict[sample] = scrub_cto_path
        continue
    print(f"\t{scrub_cto_path} exists! Skipping...")

Checking if cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.pkl exist...
	cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.pkl exist...
	cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.pkl exist...
	cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.pkl exist...
	cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.pkl exist...
	cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.pkl exists! Skipping...
Checking if cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.pkl exist...
	cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.pkl exists! Skipping...
Chec

<IPython.core.display.Javascript object>

In [None]:
# Run Scrublet on every sample collected in scrubcto_dict (those without a
# scrublet-annotated pickle yet), save the QC plots, attach the scores to the
# cisTopic object and pickle the annotated object.
if scrubcto_dict:
    # savefig does not create missing directories.
    os.makedirs('plots_qc', exist_ok=True)
    for sample in scrubcto_dict.keys():
        with open(cto_dict[sample], 'rb') as f:
            cto = pickle.load(f)
        print(f"Loaded {cto_dict[sample]}")
        # Scrublet receives the transposed fragment matrix.
        scrub = scr.Scrublet(cto.fragment_matrix.T, expected_doublet_rate=0.1)
        doublet_scores, predicted_doublets = scrub.scrub_doublets()
        # Re-call doublets with the fixed scrub_threshold so all samples use the same cut-off.
        scrub.call_doublets(threshold=scrub_threshold)
        fig, axs = scrub.plot_histogram()
        fig.suptitle(sample)
        plt.savefig(f'plots_qc/{sample}.scrublet_histogram.png', dpi=150, facecolor='white')
        plt.show()
        plt.close()

        print('Running UMAP...')
        scrub.set_embedding('UMAP', scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
        scrub.plot_embedding('UMAP', order_points=True);
        plt.savefig(f'plots_qc/{sample}.scrublet_umap.png', dpi=150, facecolor='white')
        plt.show()
        plt.close()

        # One row per cell: Scrublet score and doublet call, indexed by cell name.
        scrublet = pd.DataFrame([scrub.doublet_scores_obs_, scrub.predicted_doublets_], 
            columns=cto.cell_names,
            index=['Doublet_scores_fragments', 'Predicted_doublets_fragments']).T
        cto.add_cell_data(scrublet)
        # Sum the boolean mask: len() of the mask would report the total number of cells.
        n_scrublet_doublets = int((cto.cell_data["Predicted_doublets_fragments"] == True).sum())
        print(f"{sample}: found doublets: \n\t({n_scrublet_doublets}: Scrublet)"
             )

        cto_path_new = scrubcto_dict[sample]

        with open(
            cto_path_new, "wb"
        ) as f:
            pickle.dump(cto, f, protocol=4)

        print('\n')

else:
    print("All samples already processed.")

Loaded cistopic_objects/VIB_hydrop_11.FIXEDCELLS__cto.pkl
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...


## Cell filtering

### Read in Freemuxlet doublet predictions

In [None]:
# Load the tab-separated Freemuxlet table.
f_fmx = "out_fmx/genotype_concordance_unified.txt"
fmx = pd.read_csv(f_fmx, sep="\t")
fmx

In [None]:
# List the droplet-type labels present; the filtering cell below removes cells labelled 'DBL'.
fmx['DROPLET.TYPE'].unique()

In [None]:
# Build the per-cell Freemuxlet annotation table.
# Cell names are '<BARCODE>___<replicate>'; the column is added to fmx itself.
fmx['cell_names'] = fmx['BARCODE'] + "___" + fmx['replicate']

fmx_annot = (
    fmx.set_index('cell_names')[['DROPLET.TYPE', 'sample']]
    .rename(columns={'DROPLET.TYPE': 'fmx_droplet_type', 'sample': 'fmx_sample'})
)
fmx_annot

# Read scrublet ctos

In [None]:
# Re-index the scrublet-annotated pickles now on disk by sample name.
scrubcto_dict = {}
for scrub_pickle in sorted(glob.glob(f'cistopic_objects/*__cto.scrublet{scrub_name_suffix}.pkl')):
    sample_key = scrub_pickle.split('/')[-1].split(f'__cto.scrublet{scrub_name_suffix}.pkl')[0]
    scrubcto_dict[sample_key] = scrub_pickle
scrubcto_dict

In [None]:
# Number of scrublet-annotated objects found on disk.
len(scrubcto_dict)

In [None]:
# Samples filtered on Scrublet calls only; the filtering cell below skips the
# Freemuxlet annotation for every sample listed here.
non_fmx_samples = [
    "BIO_ddseq_1.FIXEDCELLS",
    "BIO_ddseq_2.FIXEDCELLS",
    "BIO_ddseq_3.FIXEDCELLS",
    "BIO_ddseq_4.FIXEDCELLS",
    "OHS_s3atac_2.FIXEDCELLS",
    "TXG_10xv11_1.FIXEDCELLS",
    "TXG_10xv2_1.FIXEDCELLS",
    "TXG_10xv2_2.FIXEDCELLS",
    "UCS_ddseq_1.FIXEDCELLS",
    "UCS_ddseq_2.FIXEDCELLS",
]

In [None]:
# Same count as displayed two cells above; scrubcto_dict has not been modified in between.
len(scrubcto_dict)

In [None]:
# Names after the last '___' in the annotation index, i.e. the replicates that have Freemuxlet results.
fmx_samples_run = {cell_name.split('___')[-1] for cell_name in fmx_annot.index}
fmx_samples_run

In [None]:
# Remove doublets from every scrublet-annotated cisTopic object and write, per sample:
#   - the singlet-only object ('...fmx.singlets.pkl'),
#   - text files listing the removed barcodes,
#   - a pickle with the removed barcodes and a table of the remaining cell data.
# Samples in non_fmx_samples are filtered on Scrublet calls only; all other samples
# additionally drop cells that Freemuxlet labels 'DBL'.

# NOTE(review): fmx_samples is not used in this cell, and `fmx` was read without an
# index column, so `fmx.index` may hold integers (for which `.split` fails) —
# confirm whether this line is still needed.
fmx_samples = sorted(list(set([x.split('.')[0] + '.' + x.split('.')[1] for x in fmx.index])))
removed_bcs_dict = {}
# for sample in ["CNA_10xv11_1.FULL"]:


def _scrublet_sibling_path(cto_path, tail):
    """Return cto_path with its '__cto.scrublet<suffix>.pkl' ending swapped for '__cto.scrublet<suffix><tail>'."""
    stem = f'__cto.scrublet{scrub_name_suffix}'
    return cto_path.replace(f'{stem}.pkl', f'{stem}{tail}')


def _write_barcode_list(path, barcodes):
    """Write one entry of `barcodes` per line to the text file `path`."""
    with open(path, 'w') as f:
        for bc in barcodes:
            f.write(f"{bc}\n")


for sample in scrubcto_dict.keys():
    print(f"{sample}")
    cto_path = scrubcto_dict[sample]
    newcto_path = _scrublet_sibling_path(cto_path, '.fmx.singlets.pkl')
    if os.path.exists(newcto_path):
        print(f"\t{newcto_path} exists! Skipping...")
        continue

    use_fmx = sample not in non_fmx_samples
    if use_fmx and sample not in fmx_samples_run:
        # Checked before unpickling so that skipped samples are not loaded at all.
        print(f'\t{sample} FMX not run!!!')
        continue

    with open(cto_path, 'rb') as f:
        cto = pickle.load(f)

    if use_fmx:
        print(f"\tLoaded {cto_path}, adding fmx data and removing fmx + scr doublets")
        cto.add_cell_data(fmx_annot.loc[cto.cell_data.index])
    else:
        print(f"\tLoaded {cto_path}, removing scr doublets")

    # Read cell_data only after the fmx columns were added.
    cell_data = cto.cell_data
    is_scr_doublet = cell_data.Predicted_doublets_fragments == True
    is_singlet = cell_data.Predicted_doublets_fragments == False

    removed_bcs_dict[sample] = {
        'dbl_scrublet': cell_data.barcode[is_scr_doublet].tolist(),
    }
    if use_fmx:
        is_fmx_doublet = cell_data.fmx_droplet_type == 'DBL'
        removed_bcs_dict[sample]['dbl_fmx'] = cell_data.barcode[is_fmx_doublet].tolist()
        is_singlet = is_singlet & (cell_data.fmx_droplet_type != 'DBL')

    # Remove doublets
    singlets = cell_data[is_singlet].index.tolist()
    print(f"\t{sample}: Removing {len(cto.cell_names)-len(singlets)} cells")

    scr_doublets = set(cell_data[is_scr_doublet].index)
    if use_fmx:
        fmx_doublets = set(cell_data[is_fmx_doublet].index)

        fmx_doublets_unique = fmx_doublets - scr_doublets
        scr_doublets_unique = scr_doublets - fmx_doublets
        common_doublets = fmx_doublets.intersection(scr_doublets)

        print(f"\t\t{len(fmx_doublets_unique)} unique fmx doublets")
        print(f"\t\t{len(scr_doublets_unique)} unique scr doublets")
        print(f"\t\t{len(common_doublets)} common doublets")

        _write_barcode_list(
            _scrublet_sibling_path(cto_path, '.fmx.fmx_doublets_unique.txt'),
            fmx_doublets_unique,
        )
        _write_barcode_list(
            _scrublet_sibling_path(cto_path, '.fmx.scr_doublets_unique.txt'),
            scr_doublets_unique,
        )
        _write_barcode_list(
            _scrublet_sibling_path(cto_path, '.fmx.common_doublets.txt'),
            common_doublets,
        )
    else:
        print(f"\t\t{len(scr_doublets)} scr doublets")
        _write_barcode_list(
            _scrublet_sibling_path(cto_path, '.fmx.scr_doublets_unique.txt'),
            scr_doublets,
        )

    # Subset cisTopic object
    cto.subset(singlets)

    # save
    with open(newcto_path, "wb") as f:
        pickle.dump(cto, f, protocol=4)

    removed_bcs_path = _scrublet_sibling_path(cto_path, '.fmx.singlets.removed_bcs.pkl')
    with open(removed_bcs_path, "wb") as f:
        # Pickle this sample's removed-barcode lists, not the output path string.
        pickle.dump(removed_bcs_dict[sample], f, protocol=4)

    cell_data_path = _scrublet_sibling_path(cto_path, '.fmx.singlets.cell_data.tsv')
    # NOTE(review): to_csv is called with its default separator although the file is
    # named .tsv; left unchanged in case downstream readers rely on it — confirm.
    cto.cell_data.to_csv(cell_data_path)

    print(f"\tWrote {newcto_path}, doublet lists and {cell_data_path}\n")

# Merge

In [None]:
from pycisTopic.cistopic_class import *

In [None]:
# Merge the two VIB_hydrop_2 sub-sample objects (21 and 22) into one cisTopic object and pickle it.
paths_list = [
    'cistopic_objects/VIB_hydrop_21.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
    'cistopic_objects/VIB_hydrop_22.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
]
cto_path_new = 'cistopic_objects/VIB_hydrop_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl'

cto_list = []
for pkl_path in paths_list:
    print(pkl_path)
    with open(pkl_path, 'rb') as handle:
        cto_list.append(pickle.load(handle))

cto_merged = merge(cto_list)
with open(cto_path_new, "wb") as out_handle:
    pickle.dump(cto_merged, out_handle, protocol=4)

In [None]:
# Merge the two VIB_hydrop_1 sub-sample objects (11 and 12) into one cisTopic object and pickle it.
paths_list = [
    'cistopic_objects/VIB_hydrop_11.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
    'cistopic_objects/VIB_hydrop_12.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
]
cto_path_new = 'cistopic_objects/VIB_hydrop_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl'

cto_list = []
for pkl_path in paths_list:
    print(pkl_path)
    with open(pkl_path, 'rb') as handle:
        cto_list.append(pickle.load(handle))

cto_merged = merge(cto_list)
with open(cto_path_new, "wb") as out_handle:
    pickle.dump(cto_merged, out_handle, protocol=4)

In [None]:
# Re-label the cell names of the merged VIB_hydrop_1 object and overwrite its pickle.
# Each name becomes '<part before first "__">-<d>___<part after first "__">', where
# <d> is the second character following 'VIB_hydrop_' in the name.
# NOTE(review): the pickle is overwritten in place, so running this cell twice
# appends the '-<d>' tag twice. If names look like '<barcode>___<sample>', the
# part after the first '__' already starts with '_', giving four underscores in
# the result — confirm that is intended.
cto_path = 'cistopic_objects/VIB_hydrop_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl'
with open(cto_path, 'rb') as handle:
    cto = pickle.load(handle)

relabelled_names = []
for name in cto.cell_names:
    parts = name.split('__')
    sub_digit = name.split('VIB_hydrop_')[-1][1]
    relabelled_names.append(f"{parts[0]}-{sub_digit}___{parts[1]}")
cto.cell_names = relabelled_names

with open(cto_path, "wb") as out_handle:
    pickle.dump(cto, out_handle, protocol=4)

In [None]:
# Re-label the cell names of the merged VIB_hydrop_2 object and overwrite its pickle.
# Each name becomes '<part before first "__">-<d>___<part after first "__">', where
# <d> is the second character following 'VIB_hydrop_' in the name.
# NOTE(review): the pickle is overwritten in place, so running this cell twice
# appends the '-<d>' tag twice. If names look like '<barcode>___<sample>', the
# part after the first '__' already starts with '_', giving four underscores in
# the result — confirm that is intended.
cto_path = 'cistopic_objects/VIB_hydrop_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl'
with open(cto_path, 'rb') as handle:
    cto = pickle.load(handle)

relabelled_names = []
for name in cto.cell_names:
    parts = name.split('__')
    sub_digit = name.split('VIB_hydrop_')[-1][1]
    relabelled_names.append(f"{parts[0]}-{sub_digit}___{parts[1]}")
cto.cell_names = relabelled_names

with open(cto_path, "wb") as out_handle:
    pickle.dump(cto, out_handle, protocol=4)

# Write looms from these ctos

In [None]:
import loompy as lp

In [None]:
scrub_name_suffix = "0-4"


def sample_name_from_singlets_path(path, suffix):
    """Return the sample name encoded in a singlets pickle path.

    The name is the file-name part before '__cto.scrublet<suffix>.'. Splitting on
    '__cto.scrublet<suffix>.pkl' never matches '...<suffix>.fmx.singlets.pkl' and
    would leave the whole file name as the key.
    """
    return path.split('/')[-1].split(f'__cto.scrublet{suffix}.')[0]


# Index the doublet-filtered (singlets) pickles on disk by sample name.
singlets_paths = sorted(glob.glob(f'cistopic_objects/*__cto.scrublet{scrub_name_suffix}.*singlets.pkl'))
scrubcto_dict = {
    sample_name_from_singlets_path(x, scrub_name_suffix): x
    for x in singlets_paths
}
# Two files of the same sample would collide on one key; fail loudly instead of dropping one.
assert len(scrubcto_dict) == len(singlets_paths), "several singlets pickles map to the same sample"
scrubcto_dict

In [None]:
# Number of singlets pickles found by the glob in the previous cell.
len(scrubcto_dict)

In [None]:
# Output directory for the cell-by-region loom files (relative to the working directory).
loom_out = 'cell_region_looms'
# exist_ok=True keeps the cell re-runnable without a separate exists() check.
os.makedirs(loom_out, exist_ok=True)

In [None]:
# Collect the objects that still lack a loom file; the loom is named after the
# pickle, with '.pkl' replaced by '.loom'.
cto_path_sub_dict = {}
for sample, cto_path in scrubcto_dict.items():
    loom_path = os.path.join(loom_out, cto_path.split('/')[-1].replace('.pkl', '.loom'))
    print(f"Checking if {loom_path} exist...")
    if not os.path.exists(loom_path):
        print(f"\t{loom_path} does not exist, adding to subdict to generate")
        cto_path_sub_dict[sample] = cto_path
        continue
    print(f"\t{loom_path} exists! Skipping...")

In [None]:
# Write one loom file per object that still lacks one.
for sample, cto_path in cto_path_sub_dict.items():
    with open(cto_path, 'rb') as handle:
        cto = pickle.load(handle)

    print(f"Loaded filtered cistopic object {sample}")
    loom_name = cto_path.split('/')[-1].replace('.pkl', '.loom')
    loom_path = os.path.join(loom_out, loom_name)
    # CellID keeps only the part of each cell name before the first '__'.
    cell_ids = [name.split('__')[0] for name in cto.cell_names]
    lp.create(
        filename=loom_path,
        layers=cto.fragment_matrix,
        row_attrs={'Gene': cto.region_names},
        col_attrs={'CellID': cell_ids},
    )
    print(f"Finished {loom_path} loom writing")
    print(f"Finished {loom_path} loom writing")