# pycisTopic analysis

Cell downsampled dataset, using consensus peak regions.

In [2]:
import pycisTopic
# Display the installed pycisTopic version so the outputs below can be traced to a specific build.
pycisTopic.__version__

'0.1.dev300+g7494158'

In [3]:
import warnings
# Silence all warnings for the rest of the session; both calls install a blanket 'ignore' filter.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [4]:
import pickle
import pandas as pd

In [5]:
import os
# Project root on the cluster filesystem.
wdir = '/lustre1/project/stg_00002/lcb/fderop/data/20211027_hca_benchmark_full_merged/'
# Make wdir the current working directory so relative paths resolve against it.
os.chdir( wdir )

In [6]:
import glob
from collections import OrderedDict

# Locate the fragment files and derive each sample name by stripping the
# directory prefix and the fixed file suffix from the path.
filenames = glob.glob('fragments_postbap/*.sinto.mm.fragments.tsv.gz')
samples = [
    path.replace(".sinto.mm.fragments.tsv.gz", "").replace("fragments_postbap/", "")
    for path in filenames
]

# Map sample name -> fragment file, ordered alphabetically by sample name.
fragments_dict = OrderedDict(sorted(dict(zip(samples, filenames)).items()))
fragments_dict.keys()

odict_keys(['Broad_1', 'Broad_2', 'Broad_mito_1', 'Broad_mito_2', 'CNAG_1', 'CNAG_2', 'Sanger_1', 'Sanger_2', 'Stanford_1', 'Stanford_2', 'VIB_1', 'VIB_2', 'VIB_Hydrop_1', 'VIB_Hydrop_2', 's3atac'])

In [7]:
# Every sample uses the same merged consensus peak set, so each key maps to one BED path.
consensus_regions_bed = 'pycistopic_screen/consensus_peak_calling/full_merged/full_merged__consensus_regions.bed'
consensus_peaks_dict = dict.fromkeys(fragments_dict.keys(), consensus_regions_bed)

## Create the cisTopic objects for each sample

In [8]:
# Directory (below wdir) that holds the pickled QC results.
f_qc_dir = 'pycistopic_consensus/qc__consensus_peaks'
qc_path = os.path.join(wdir, f_qc_dir)

# QC metadata; indexed by sample key when the cisTopic objects are created.
with open(os.path.join(qc_path, 'metadata.pickle'), 'rb') as metadata_fh:
    metadata_bc_dict = pickle.load(metadata_fh)

# QC profile data stored next to the metadata.
with open(os.path.join(qc_path, 'profile_data.pickle'), 'rb') as profile_fh:
    profile_data_dict = pickle.load(profile_fh)

# Use the per-sample metadata TSVs to keep only our original cells

In [9]:
# Read one cell-metadata table per sample from the single-sample analysis directory.
# The first column of each TSV becomes the index.
singles_meta_dir = '/lustre1/project/stg_00002/lcb/fderop/data/20211008_hca_benchmark_full_singles/pycistopic_screen'
metadata_dict = {
    sample: pd.read_csv(
        os.path.join(singles_meta_dir, sample + '_cell_data.tsv'),
        index_col=0,
        header=0,
        sep='\t',
    )
    for sample in fragments_dict.keys()
}

In [10]:
# For each sample, the barcodes listed in its metadata table are the cells to retain.
bc_passing_filters = {
    sample: list(cell_table['barcode'])
    for sample, cell_table in metadata_dict.items()
}

In [11]:
# Print the number of retained barcodes per sample, then the total across samples.
for key in bc_passing_filters:
    print(f"{key}, {len(bc_passing_filters[key])}")
cells = sum(len(barcodes) for barcodes in bc_passing_filters.values())
print(f"total cells: {cells}")

Broad_1, 3921
Broad_2, 3839
Broad_mito_1, 3349
Broad_mito_2, 3210
CNAG_1, 2533
CNAG_2, 2561
Sanger_1, 2833
Sanger_2, 3443
Stanford_1, 749
Stanford_2, 1375
VIB_1, 2739
VIB_2, 6937
VIB_Hydrop_1, 2226
VIB_Hydrop_2, 2600
s3atac, 3045
total cells: 45360


Cell counts are correct. In hindsight, it would also have been possible to read in the cell barcodes at the QC step!

# create CTOs

In [13]:
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments

In [14]:
# Alternative region set, left disabled; the region file is taken from consensus_peaks_dict instead.
#path_to_regions = '/staging/leuven/stg_00002/lcb/cbravo/SCREEN_ENCODE3/V2.hg38-rDHS-Unfiltered.blacklisted.bed'
# Blacklist BED file, given as a path relative to the working directory.
path_to_blacklist = 'hg38_regions/hg38-blacklist.v2.bed'

In [15]:
# Output directory (below wdir) for the pickled cisTopic objects.
f_cto_dir = 'pycistopic_consensus/cistopic_objs__mergedconsensus'
# exist_ok=True makes the cell idempotent on re-runs and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(os.path.join(wdir, f_cto_dir), exist_ok=True)

In [16]:
#Create all objects
import ray
ray.shutdown()

for key in fragments_dict.keys():
    f_out = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_mergedconsensus.pkl')
    if os.path.isfile(f_out):
        print(f"Skipping {key}")
        continue
        
    tmp_cto = create_cistopic_object_from_fragments(path_to_fragments=fragments_dict[key],
                                                    path_to_regions=consensus_peaks_dict[key],
                                                    path_to_blacklist=path_to_blacklist,
                                                    metrics=metadata_bc_dict[key], # why is this greyed out
                                                    valid_bc=bc_passing_filters[key],
                                                    n_cpu=6,
                                                    partition=20,
                                                    project=key)
    
    with open(f_out, 'wb') as f:
        pickle.dump(tmp_cto, f)
    
    print(f"DONE {key}")

Skipping Broad_1
Skipping Broad_2
Skipping Broad_mito_1
Skipping Broad_mito_2
Skipping CNAG_1
Skipping CNAG_2
Skipping Sanger_1
Skipping Sanger_2
Skipping Stanford_1
Skipping Stanford_2
Skipping VIB_1
Skipping VIB_2
Skipping VIB_Hydrop_1
Skipping VIB_Hydrop_2
Skipping s3atac


# load objects into dict:


In [17]:
# Read every per-sample cisTopic object back from its pickle, keyed by sample name.
cistopic_obj_dict = {}
for key in bc_passing_filters:
    pkl_name = key + '__cistopic_obj_mergedconsensus.pkl'
    f_out = os.path.join(wdir, f_cto_dir, pkl_name)
    with open(f_out, 'rb') as pkl_fh:
        loaded_cto = pickle.load(pkl_fh)
    cistopic_obj_dict[key] = loaded_cto
    print(f"Loaded {key}")

Loaded Broad_1
Loaded Broad_2
Loaded Broad_mito_1
Loaded Broad_mito_2
Loaded CNAG_1
Loaded CNAG_2
Loaded Sanger_1
Loaded Sanger_2
Loaded Stanford_1
Loaded Stanford_2
Loaded VIB_1
Loaded VIB_2
Loaded VIB_Hydrop_1
Loaded VIB_Hydrop_2
Loaded s3atac


In [18]:
import copy

In [19]:
# Pickle path of the merged object; the file doubles as a cache.
f_out = os.path.join(wdir, f_cto_dir, 'full_merged__cistopic_obj_mergedconsensus.pkl')
if not os.path.isfile(f_out):
    # No cached merge yet: start from a shallow copy of the first object,
    # merge all remaining objects into it, then write the result to disk.
    ct_keys = list(cistopic_obj_dict.keys())
    cto_merged = copy.copy(cistopic_obj_dict[ct_keys[0]])
    remaining_ctos = [cistopic_obj_dict[k] for k in ct_keys[1:]]
    cto_merged.merge(remaining_ctos, project='scATAC-seq_benchmark')
    with open(f_out, 'wb') as f:
        pickle.dump(cto_merged, f)

    cistopic_obj_dict['full_merged'] = cto_merged
else:
    # Cached merge found: load it instead of merging again.
    print(f"Loading {f_out}")
    with open(f_out, 'rb') as f:
        cistopic_obj_dict['full_merged'] = pickle.load(f)

Loading /lustre1/project/stg_00002/lcb/fderop/data/20211027_hca_benchmark_full_merged/pycistopic_consensus/cistopic_objs__mergedconsensus/full_merged__cistopic_obj_mergedconsensus.pkl


## Add cell annotations

# Load metadata from the single samples
Such as the Seurat cell type. This was written in notebook 5a.

In [20]:
# Inspect the cell metadata of one per-sample object. The sample is named
# explicitly so the cell does not depend on `key` left over from an earlier loop.
cistopic_obj_dict['s3atac'].cell_data

Unnamed: 0,cisTopic_nr_frag,cisTopic_log_nr_frag,cisTopic_nr_acc,cisTopic_log_nr_acc,sample_id,Log_total_nr_frag,Log_unique_nr_frag,Total_nr_frag,Unique_nr_frag,Dupl_nr_frag,Dupl_rate,Total_nr_frag_in_regions,Unique_nr_frag_in_regions,FRIP,TSS_enrichment,barcode
GGCTGGCTCTTGGCTCATATGATTCTGC-s3atac,20346,4.308479,18047,4.256405,s3atac,4.988349,4.711984,97353,51521,45832,0.470782,38048,19329,0.375167,7.676786,GGCTGGCTCTTGGCTCATATGATTCTGC
ACGCGACGGCGGCATAACCGGCAATGCA-s3atac,28120,4.449015,22666,4.355375,s3atac,5.062082,4.817651,115367,65713,49654,0.430400,49718,27045,0.411562,9.257202,ACGCGACGGCGGCATAACCGGCAATGCA
GAAGCAGCGGTGCATGGCCAGCACGGAC-s3atac,35312,4.547922,29453,4.46913,s3atac,5.306047,4.991713,202324,98110,104214,0.515085,70961,33202,0.338416,6.293275,GAAGCAGCGGTGCATGGCCAGCACGGAC
TGCGGCCTGGGTTCTCTCCTGCGGCACA-s3atac,122625,5.088579,86546,4.937247,s3atac,5.830076,5.596702,676201,395095,281106,0.415714,210602,118412,0.299705,4.692985,TGCGGCCTGGGTTCTCTCCTGCGGCACA
ATTGAGGATAAATGATGCTCTACTCATA-s3atac,85130,4.930083,64597,4.810212,s3atac,5.669796,5.382125,467516,241060,226456,0.484381,163008,81464,0.337941,4.063926,ATTGAGGATAAATGATGCTCTACTCATA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GGTTAGTTGGGGCATAACCGCTTATCGG-s3atac,633,2.801404,554,2.74351,s3atac,4.436608,4.179609,27328,15122,12206,0.446648,1126,608,0.040206,5.648072,GGTTAGTTGGGGCATAACCGCTTATCGG
ATTGAGGATAAGGCCGGCCACGCTATGT-s3atac,540,2.732394,495,2.694605,s3atac,4.330515,4.105238,21405,12742,8663,0.404719,905,530,0.041595,4.080657,ATTGAGGATAAGGCCGGCCACGCTATGT
GAAGAGTATTTGCATGGCCAATGTAAGT-s3atac,996,2.998259,887,2.947924,s3atac,4.530302,4.284634,33908,19259,14649,0.432022,1687,957,0.049691,5.904555,GAAGAGTATTTGCATGGCCAATGTAAGT
GGCTGGCTCTTGCATGGCCACGGACAAC-s3atac,711,2.85187,659,2.818885,s3atac,4.289433,4.004493,19473,10104,9369,0.481128,1366,684,0.067696,10.820000,GGCTGGCTCTTGCATGGCCACGGACAAC


In [21]:
# Display the dictionary of loaded cisTopic objects.
cistopic_obj_dict

{'Broad_1': <pycisTopic.cistopic_class.CistopicObject at 0x2b43afeebd90>,
 'Broad_2': <pycisTopic.cistopic_class.CistopicObject at 0x2b43afeebdc0>,
 'Broad_mito_1': <pycisTopic.cistopic_class.CistopicObject at 0x2b49855c6d00>,
 'Broad_mito_2': <pycisTopic.cistopic_class.CistopicObject at 0x2b49855c6e50>,
 'CNAG_1': <pycisTopic.cistopic_class.CistopicObject at 0x2b49855c6f70>,
 'CNAG_2': <pycisTopic.cistopic_class.CistopicObject at 0x2b49855c68b0>,
 'Sanger_1': <pycisTopic.cistopic_class.CistopicObject at 0x2b49855c6160>,
 'Sanger_2': <pycisTopic.cistopic_class.CistopicObject at 0x2b44271e7df0>,
 'Stanford_1': <pycisTopic.cistopic_class.CistopicObject at 0x2b49855c6b80>,
 'Stanford_2': <pycisTopic.cistopic_class.CistopicObject at 0x2b499f0141f0>,
 'VIB_1': <pycisTopic.cistopic_class.CistopicObject at 0x2b499f014310>,
 'VIB_2': <pycisTopic.cistopic_class.CistopicObject at 0x2b499f014b50>,
 'VIB_Hydrop_1': <pycisTopic.cistopic_class.CistopicObject at 0x2b499f014c40>,
 'VIB_Hydrop_2': <pyc

In [22]:
# Distinct sample identifiers recorded in the merged object's cell metadata.
merged_cell_data = cistopic_obj_dict['full_merged'].cell_data
keys = list(merged_cell_data['sample_id'].unique())

In [23]:
# Reload the per-sample cell-metadata tables, this time for the samples found in the merged object.
# The first column of each TSV becomes the index.
singles_meta_dir = '/lustre1/project/stg_00002/lcb/fderop/data/20211008_hca_benchmark_full_singles/pycistopic_screen'
metadata_dict = {
    sample: pd.read_csv(
        os.path.join(singles_meta_dir, sample + '_cell_data.tsv'),
        index_col=0,
        header=0,
        sep='\t',
    )
    for sample in keys
}

In [24]:
# List the samples for which a metadata table was loaded.
metadata_dict.keys()

dict_keys(['Broad_1', 'Broad_2', 'Broad_mito_1', 'Broad_mito_2', 'CNAG_1', 'CNAG_2', 'Sanger_1', 'Sanger_2', 'Stanford_1', 'Stanford_2', 'VIB_1', 'VIB_2', 'VIB_Hydrop_1', 'VIB_Hydrop_2', 's3atac'])

In [25]:
# Stack the per-sample tables row-wise with a single concat call. Concatenating
# inside a loop re-copies the accumulated frame on every iteration.
sample_tables = list(metadata_dict.values())
# pd.concat raises on an empty list, so keep the original empty-frame result for that case.
df_meta = pd.concat(sample_tables, axis=0) if sample_tables else pd.DataFrame()

In [26]:
# Copy the cell-type annotations onto the merged object's cell metadata.
# Column assignment aligns on the index, so this assumes df_meta and cell_data
# share the same cell identifiers. TODO confirm.
for annotation_column in ('seurat_cell_type', 'consensus_cell_type'):
    cistopic_obj_dict['full_merged'].cell_data[annotation_column] = df_meta[annotation_column]

# save ctos

In [None]:
# NOTE(review): this directory ('pycistopic_consensus_peaks/...') is not the
# 'pycistopic_consensus/...' directory the objects were loaded from; confirm it is intended.
f_cto_dir = 'pycistopic_consensus_peaks/cistopic_objs__mergedconsensus'

# exist_ok=True makes the cell idempotent on re-runs and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(os.path.join(wdir, f_cto_dir), exist_ok=True)

# NOTE(review): every object in the dict is saved under the
# '..._metadata_annotated' name, although the cells above only add annotation
# columns to 'full_merged'; confirm this is intended.
for key in cistopic_obj_dict.keys():
    f_out = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_mergedconsensus_metadata_annotated.pkl')
    # An existing pickle is never overwritten.
    if os.path.isfile(f_out):
        print(f"Skipping {key}, already exists")
        continue
    # Dump to a temporary file and rename it into place, so an interrupted dump
    # cannot leave a truncated pickle that the check above would later skip.
    f_tmp = f_out + '.tmp'
    with open(f_tmp, 'wb') as f:
        pickle.dump(cistopic_obj_dict[key], f)
    os.replace(f_tmp, f_out)
    print(f"Generated and saved cistopic object for {key}")

Generated and saved cistopic object for Broad_1
Generated and saved cistopic object for Broad_2
Generated and saved cistopic object for Broad_mito_1
Generated and saved cistopic object for Broad_mito_2
Generated and saved cistopic object for CNAG_1
Generated and saved cistopic object for CNAG_2
Generated and saved cistopic object for Sanger_1
Generated and saved cistopic object for Sanger_2
Generated and saved cistopic object for Stanford_1
Generated and saved cistopic object for Stanford_2
Generated and saved cistopic object for VIB_1
Generated and saved cistopic object for VIB_2
Generated and saved cistopic object for VIB_Hydrop_1
