# pycisTopic analysis

Cell downsampled dataset, using consensus peak regions.

In [2]:
import pycisTopic
# Display the installed pycisTopic version so it is recorded with the notebook output.
pycisTopic.__version__

'0.1.dev300+g7494158'

In [3]:
import warnings
# Silence all Python warnings for the rest of the session.
# Both calls install a blanket "ignore" filter, so either one alone has the same effect.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [4]:
import pickle
import pandas as pd

In [5]:
import os
# Project root directory. Later cells join it with relative sub-directories, and
# relative paths (e.g. the fragments glob) are resolved against it after the chdir.
wdir = '/lustre1/project/stg_00002/lcb/fderop/data/20211024_hca_benchmark_libds_merged/'
os.chdir( wdir )

In [6]:
import glob
from collections import OrderedDict

# Collect the fragment files under fragments_postbap/ (relative to the working directory).
filenames = glob.glob('fragments_postbap/*.sinto.mm.fragments.tsv.gz')

# A sample name is the file path with the file suffix and the directory prefix stripped.
samples = [
    path.replace(".sinto.mm.fragments.tsv.gz", "").replace("fragments_postbap/", "")
    for path in filenames
]

# Map sample name -> fragments path, sorted by sample name.
fragments_dict = OrderedDict(sorted(dict(zip(samples, filenames)).items()))
fragments_dict.keys()

odict_keys(['Broad_1', 'Broad_2', 'Broad_mito_1', 'Broad_mito_2', 'CNAG_1', 'CNAG_2', 'Sanger_1', 'Sanger_2', 'Stanford_1', 'Stanford_2', 'VIB_1', 'VIB_2', 'VIB_Hydrop_1', 'VIB_Hydrop_2', 's3atac'])

In [7]:
# Display the full sample name -> fragments file mapping.
fragments_dict

OrderedDict([('Broad_1',
              'fragments_postbap/Broad_1.sinto.mm.fragments.tsv.gz'),
             ('Broad_2',
              'fragments_postbap/Broad_2.sinto.mm.fragments.tsv.gz'),
             ('Broad_mito_1',
              'fragments_postbap/Broad_mito_1.sinto.mm.fragments.tsv.gz'),
             ('Broad_mito_2',
              'fragments_postbap/Broad_mito_2.sinto.mm.fragments.tsv.gz'),
             ('CNAG_1', 'fragments_postbap/CNAG_1.sinto.mm.fragments.tsv.gz'),
             ('CNAG_2', 'fragments_postbap/CNAG_2.sinto.mm.fragments.tsv.gz'),
             ('Sanger_1',
              'fragments_postbap/Sanger_1.sinto.mm.fragments.tsv.gz'),
             ('Sanger_2',
              'fragments_postbap/Sanger_2.sinto.mm.fragments.tsv.gz'),
             ('Stanford_1',
              'fragments_postbap/Stanford_1.sinto.mm.fragments.tsv.gz'),
             ('Stanford_2',
              'fragments_postbap/Stanford_2.sinto.mm.fragments.tsv.gz'),
             ('VIB_1', 'fragments_postbap/VIB_

In [8]:
# All samples share one consensus-region BED file, so every sample name maps to
# the same path.
consensus_peaks_dict = dict.fromkeys(
    fragments_dict.keys(),
    '/lustre1/project/stg_00002/lcb/fderop/data/20211024_hca_benchmark_libds_merged/consensus_peak_calling/libds_merged/libds_merged__consensus_regions.bed',
)

## Create the cisTopic objects for each sample

In [9]:
f_qc_dir = 'pycistopic_consensus_peaks/qc__consensus_peaks'
qc_dir_abs = os.path.join(wdir, f_qc_dir)

# Load the two QC pickles stored in the QC directory.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(os.path.join(qc_dir_abs, 'metadata.pickle'), 'rb') as handle:
    metadata_bc_dict = pickle.load(handle)

with open(os.path.join(qc_dir_abs, 'profile_data.pickle'), 'rb') as handle:
    profile_data_dict = pickle.load(handle)

# Use the metadata TSVs to keep only our originally selected cells

In [10]:
# Read one `<sample>_cell_data.tsv` table per sample; the first column becomes the index.
screen_dir = '/lustre1/project/stg_00002/lcb/fderop/data/20211013_hca_benchmark_libds/pycistopic_screen'
metadata_dict = {
    sample: pd.read_csv(
        os.path.join(screen_dir, sample + '_cell_data.tsv'),
        index_col=0,
        header=0,
        sep='\t',
    )
    for sample in fragments_dict.keys()
}

In [11]:
# Per sample, the list of barcodes found in the 'barcode' column of its metadata table.
bc_passing_filters = {
    sample: list(sample_meta['barcode'])
    for sample, sample_meta in metadata_dict.items()
}

In [12]:
# Print the number of barcodes per sample, followed by the grand total.
cells = 0
for key, barcodes in bc_passing_filters.items():
    n_barcodes = len(barcodes)
    print(f"{key}, {n_barcodes}")
    cells += n_barcodes
print(f"total cells: {cells}")

Broad_1, 3903
Broad_2, 3805
Broad_mito_1, 3355
Broad_mito_2, 3196
CNAG_1, 2496
CNAG_2, 2504
Sanger_1, 2903
Sanger_2, 3559
Stanford_1, 702
Stanford_2, 1366
VIB_1, 2746
VIB_2, 6928
VIB_Hydrop_1, 2208
VIB_Hydrop_2, 2531
s3atac, 3033
total cells: 45235


cell counts are correct.

# create CTOs

In [13]:
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments

In [14]:
# Alternative region set, kept for reference only; the consensus regions in
# consensus_peaks_dict are what gets passed as path_to_regions below.
#path_to_regions = '/staging/leuven/stg_00002/lcb/cbravo/SCREEN_ENCODE3/V2.hg38-rDHS-Unfiltered.blacklisted.bed'
# Blacklist BED file passed to create_cistopic_object_from_fragments below.
path_to_blacklist = '/lustre1/project/stg_00002/lcb/fderop/data/20211013_hca_benchmark_libds/hg38_regions/hg38-blacklist.v2.bed'

In [15]:
# Directory (relative to wdir) where the cisTopic object pickles are stored.
f_cto_dir = 'pycistopic_consensus_peaks/cistopic_objs__mergedconsensus'
# exist_ok=True makes re-running this cell a no-op and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(os.path.join(wdir, f_cto_dir), exist_ok=True)

In [16]:
# NOTE: `key` is not defined in this cell. It is the leftover loop variable from the
# last `for key in ...` loop run in this kernel, so the value shown depends on
# execution order.
fragments_dict[key]

'fragments_postbap/s3atac.sinto.mm.fragments.tsv.gz'

In [17]:
# Create one cisTopic object per sample and cache it on disk as a pickle.
# Samples whose pickle already exists are skipped.
import ray
ray.shutdown()  # presumably clears a Ray session left over in this kernel - TODO confirm

for key in fragments_dict.keys():
    f_out = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_mergedconsensus.pkl')
    if os.path.isfile(f_out):
        print(f"Skipping {key}")
        continue

    # NOTE(review): the original author asked why the `metrics` argument is greyed
    # out in the editor; confirm it is accepted by this pycisTopic version.
    tmp_cto = create_cistopic_object_from_fragments(path_to_fragments=fragments_dict[key],
                                                    path_to_regions=consensus_peaks_dict[key],
                                                    path_to_blacklist=path_to_blacklist,
                                                    metrics=metadata_bc_dict[key],
                                                    valid_bc=bc_passing_filters[key],
                                                    n_cpu=6,
                                                    partition=20,
                                                    project=key)

    # Write to a temporary name and rename afterwards, so an interrupted dump never
    # leaves a truncated pickle that the isfile() guard above would treat as complete.
    f_tmp = f_out + '.tmp'
    with open(f_tmp, 'wb') as f:
        pickle.dump(tmp_cto, f)
    os.replace(f_tmp, f_out)

    print(f"DONE {key}")


Skipping Broad_1
Skipping Broad_2
Skipping Broad_mito_1
Skipping Broad_mito_2
Skipping CNAG_1
Skipping CNAG_2
Skipping Sanger_1
Skipping Sanger_2
Skipping Stanford_1
Skipping Stanford_2
Skipping VIB_1
Skipping VIB_2
Skipping VIB_Hydrop_1
Skipping VIB_Hydrop_2
Skipping s3atac


In [18]:
# Shut down Ray; no objects are created in this cell.
import ray
ray.shutdown()

# load objects into dict:


In [19]:
# Read every per-sample cisTopic object pickle back into a dict keyed by sample name.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
cistopic_obj_dict = {}
cto_dir_abs = os.path.join(wdir, f_cto_dir)
for key in bc_passing_filters.keys():
    pkl_path = os.path.join(cto_dir_abs, key + '__cistopic_obj_mergedconsensus.pkl')
    with open(pkl_path, 'rb') as handle:
        cistopic_obj_dict[key] = pickle.load(handle)
    print(f"Loaded {key}")

Loaded Broad_1
Loaded Broad_2
Loaded Broad_mito_1
Loaded Broad_mito_2
Loaded CNAG_1
Loaded CNAG_2
Loaded Sanger_1
Loaded Sanger_2
Loaded Stanford_1
Loaded Stanford_2
Loaded VIB_1
Loaded VIB_2
Loaded VIB_Hydrop_1
Loaded VIB_Hydrop_2
Loaded s3atac


In [20]:
import copy

In [21]:
# Load the merged cisTopic object from its cached pickle if present; otherwise
# merge all per-sample objects, cache the result, and store it under 'libds_merged'.
f_out = os.path.join(wdir, f_cto_dir, 'libds_merged__cistopic_obj_mergedconsensus.pkl')
if os.path.isfile(f_out):
    print(f"Loading {f_out}")
    with open(f_out, 'rb') as f:
        cistopic_obj_dict['libds_merged'] = pickle.load(f)
else:
    # merge:
    # Exclude a previously stored merged object so that re-running this cell
    # never merges 'libds_merged' into itself.
    ct_keys = [k for k in cistopic_obj_dict.keys() if k != 'libds_merged']
    # NOTE(review): copy.copy is shallow, so the copy shares attribute objects with
    # the first sample's object - confirm that merge() does not modify them in place.
    cto_merged = copy.copy(cistopic_obj_dict[ct_keys[0]])
    cto_merged.merge([ cistopic_obj_dict[k] for k in ct_keys[1:] ], project='scATAC-seq_benchmark')

    # Write to a temporary name and rename afterwards, so an interrupted dump never
    # leaves a truncated pickle that the isfile() guard above would load next time.
    f_tmp = f_out + '.tmp'
    with open(f_tmp, 'wb') as f:
        pickle.dump(cto_merged, f)
    os.replace(f_tmp, f_out)

    cistopic_obj_dict['libds_merged'] = cto_merged

Loading /lustre1/project/stg_00002/lcb/fderop/data/20211024_hca_benchmark_libds_merged/pycistopic_consensus_peaks/cistopic_objs__mergedconsensus/libds_merged__cistopic_obj_mergedconsensus.pkl


## Add cell annotations

# Load metadata from the single samples
This includes annotations such as the Seurat cell type, which were written in notebook 5a.

In [22]:
# NOTE: `key` is not defined in this cell. It is the leftover loop variable from the
# last `for key in ...` loop run in this kernel, so the object shown depends on
# execution order.
cistopic_obj_dict[key].cell_data

Unnamed: 0,cisTopic_nr_frag,cisTopic_log_nr_frag,cisTopic_nr_acc,cisTopic_log_nr_acc,sample_id,Log_total_nr_frag,Log_unique_nr_frag,Total_nr_frag,Unique_nr_frag,Dupl_nr_frag,Dupl_rate,Total_nr_frag_in_regions,Unique_nr_frag_in_regions,FRIP,TSS_enrichment,barcode
GAAGAGTATTGCCGGAGCGGTGTAGATA-s3atac,2872,3.458184,2754,3.439964,s3atac,3.781396,3.751895,6045,5648,397,0.065674,2991,2760,0.488669,14.931959,GAAGAGTATTGCCGGAGCGGTGTAGATA
GCAGGTCGTCGTACTGGTTGCAGTAGGC-s3atac,3902,3.591287,3775,3.576917,s3atac,4.099784,4.071035,12583,11777,806,0.064055,4021,3749,0.318332,8.870119,GCAGGTCGTCGTACTGGTTGCAGTAGGC
ACGCGACGGCGAACTTGCGGGCTCATTG-s3atac,9937,3.997255,9289,3.967969,s3atac,4.401211,4.378398,25189,23900,1289,0.051173,10281,9655,0.403975,9.005776,ACGCGACGGCGAACTTGCGGGCTCATTG
GGTTAGTTGGCAAGTAGGACTTCCTGTT-s3atac,3425,3.534661,3330,3.522444,s3atac,3.933538,3.910197,8581,8132,449,0.052325,3498,3294,0.405066,7.025523,GGTTAGTTGGCAAGTAGGACTTCCTGTT
GATTCGGTCAACCATAATAATCGTAGTG-s3atac,3851,3.585574,3680,3.565848,s3atac,4.098644,4.061528,12550,11522,1028,0.081912,4112,3728,0.323555,11.419322,GATTCGGTCAACCATAATAATCGTAGTG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GGTTAGTTGGTATTGCAGCTCGGACAAC-s3atac,127,2.103804,127,2.103804,s3atac,3.208710,3.184975,1617,1531,86,0.053185,133,123,0.080340,6.560000,GGTTAGTTGGTATTGCAGCTCGGACAAC
GATTCGGTCAGTTCTCTCCTTGACGAAT-s3atac,203,2.307496,199,2.298853,s3atac,3.311118,3.289812,2047,1949,98,0.047875,204,195,0.100051,6.280000,GATTCGGTCAGTTCTCTCCTTGACGAAT
TGCGGCCTGGTCTCATTGCCCGGAACTG-s3atac,252,2.401401,250,2.39794,s3atac,3.386856,3.362671,2437,2305,132,0.054165,264,247,0.107158,8.170000,TGCGGCCTGGTCTCATTGCCCGGAACTG
GAAGAGTATTTCTCCTCCTGGTGTCGGA-s3atac,169,2.227887,166,2.220108,s3atac,3.288696,3.262451,1944,1830,114,0.058642,170,160,0.087432,4.430000,GAAGAGTATTTCTCCTCCTGGTGTCGGA


In [23]:
# Display which objects are currently held in the dict.
cistopic_obj_dict

{'Broad_1': <pycisTopic.cistopic_class.CistopicObject at 0x2b475cec5b80>,
 'Broad_2': <pycisTopic.cistopic_class.CistopicObject at 0x2b475cedf3a0>,
 'Broad_mito_1': <pycisTopic.cistopic_class.CistopicObject at 0x2b475cedf4f0>,
 'Broad_mito_2': <pycisTopic.cistopic_class.CistopicObject at 0x2b475cedf610>,
 'CNAG_1': <pycisTopic.cistopic_class.CistopicObject at 0x2b475cedf730>,
 'CNAG_2': <pycisTopic.cistopic_class.CistopicObject at 0x2b475cedf850>,
 'Sanger_1': <pycisTopic.cistopic_class.CistopicObject at 0x2b475cedf970>,
 'Sanger_2': <pycisTopic.cistopic_class.CistopicObject at 0x2b475cedfa90>,
 'Stanford_1': <pycisTopic.cistopic_class.CistopicObject at 0x2b475cedfbb0>,
 'Stanford_2': <pycisTopic.cistopic_class.CistopicObject at 0x2b475cedfd00>,
 'VIB_1': <pycisTopic.cistopic_class.CistopicObject at 0x2b475cedfdf0>,
 'VIB_2': <pycisTopic.cistopic_class.CistopicObject at 0x2b475cedff10>,
 'VIB_Hydrop_1': <pycisTopic.cistopic_class.CistopicObject at 0x2b475cecf640>,
 'VIB_Hydrop_2': <pyc

In [24]:
# Distinct sample ids found in the merged object's cell metadata.
merged_cell_data = cistopic_obj_dict['libds_merged'].cell_data
keys = list(merged_cell_data['sample_id'].unique())

In [25]:
# Read one `<sample>_cell_data.tsv` table for each sample id in `keys`;
# the first column becomes the index.
screen_dir = '/lustre1/project/stg_00002/lcb/fderop/data/20211013_hca_benchmark_libds/pycistopic_screen'
metadata_dict = {
    sample: pd.read_csv(
        os.path.join(screen_dir, sample + '_cell_data.tsv'),
        index_col=0,
        header=0,
        sep='\t',
    )
    for sample in keys
}

In [26]:
# Display the sample names for which a metadata table was loaded.
metadata_dict.keys()

dict_keys(['Broad_1', 'Broad_2', 'Broad_mito_1', 'Broad_mito_2', 'CNAG_1', 'CNAG_2', 'Sanger_1', 'Sanger_2', 'Stanford_1', 'Stanford_2', 'VIB_1', 'VIB_2', 'VIB_Hydrop_1', 'VIB_Hydrop_2', 's3atac'])

In [27]:
# Stack all per-sample metadata tables row-wise in a single concat call.
# Growing a frame with pd.concat inside a loop copies the accumulated rows on every
# iteration, and seeding it with an empty DataFrame relies on concat behaviour that
# newer pandas versions deprecate.
if metadata_dict:
    df_meta = pd.concat(list(metadata_dict.values()), axis=0)
else:
    # pd.concat raises on an empty list; keep the empty-frame result for that case.
    df_meta = pd.DataFrame()

In [28]:
# Add the cell-type annotations to every object in the dict, not only 'libds_merged':
# the save step writes each object to a '*_metadata_annotated.pkl' file, so each
# one should actually carry these columns.
# Assigning a Series aligns on the cell_data index; cells that are absent from
# df_meta receive NaN.
annotation_columns = ['seurat_cell_type', 'consensus_cell_type']
for cto in cistopic_obj_dict.values():
    for column in annotation_columns:
        cto.cell_data[column] = df_meta[column]

# save ctos

In [29]:
# Save every object in cistopic_obj_dict to its own pickle; existing files are kept.
f_cto_dir = 'pycistopic_consensus_peaks/cistopic_objs__mergedconsensus'

# exist_ok=True makes re-running this cell a no-op and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(os.path.join(wdir, f_cto_dir), exist_ok=True)

for key in cistopic_obj_dict.keys():
    f_out = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_mergedconsensus_metadata_annotated.pkl')
    if os.path.isfile(f_out):
        print(f"Skipping {key}, already exists")
        continue
    # Write to a temporary name and rename afterwards, so an interrupted dump never
    # leaves a truncated pickle that the isfile() guard above would skip next time.
    f_tmp = f_out + '.tmp'
    with open(f_tmp, 'wb') as f:
        pickle.dump(cistopic_obj_dict[key], f)
    os.replace(f_tmp, f_out)
    print(f"Generated and saved cistopic object for {key}")

Generated and saved cistopic object for Broad_1
Generated and saved cistopic object for Broad_2
Generated and saved cistopic object for Broad_mito_1
Generated and saved cistopic object for Broad_mito_2
Generated and saved cistopic object for CNAG_1
Generated and saved cistopic object for CNAG_2
Generated and saved cistopic object for Sanger_1
Generated and saved cistopic object for Sanger_2
Generated and saved cistopic object for Stanford_1
Generated and saved cistopic object for Stanford_2
Generated and saved cistopic object for VIB_1
Generated and saved cistopic object for VIB_2
Generated and saved cistopic object for VIB_Hydrop_1
Generated and saved cistopic object for VIB_Hydrop_2
Generated and saved cistopic object for s3atac
Generated and saved cistopic object for libds_merged
