# pycisTopic analysis

Cell downsampled dataset, using consensus peak regions.

In [3]:
import pycisTopic
# Display the installed pycisTopic version so the run records which build produced the results.
pycisTopic.__version__

'0.1.dev300+g7494158'

In [4]:
import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries used below; consider narrowing the filter if a result looks off.
warnings.filterwarnings('ignore')
# With no further arguments, simplefilter('ignore') installs the same blanket
# "ignore" filter as the call above.
warnings.simplefilter('ignore')

In [5]:
import pickle
import pandas as pd

In [6]:
import os
# Project working directory (absolute path on the cluster file system).
wdir = '/lustre1/project/stg_00002/lcb/fderop/data/20211013_hca_benchmark_libds/'
# Make it the current directory: the glob patterns, the blacklist path and the
# cell-annotation paths used in later cells are relative and resolve against it.
os.chdir( wdir )

In [7]:
import glob
from collections import OrderedDict
# Collect the Hydrop fragment files and key them by sample name, i.e. the file
# path with the directory prefix and the fragments suffix stripped.
fragments_prefix = "fragments_postbap/"
fragments_suffix = ".sinto.mm.fragments.tsv.gz"
filenames = glob.glob('fragments_postbap/*Hydrop*.sinto.mm.fragments.tsv.gz')
samples = [
    path.replace(fragments_suffix, "").replace(fragments_prefix, "")
    for path in filenames
]
# glob returns files in arbitrary order, so sort by sample name for a stable ordering.
fragments_dict = OrderedDict(sorted(dict(zip(samples, filenames)).items()))
fragments_dict.keys()

odict_keys(['VIB_Hydrop_1', 'VIB_Hydrop_2'])

# load consensus peak dict


Build a dictionary that maps each sample name to the path of its consensus peak (BED) file.

In [29]:
import glob
from collections import OrderedDict
# Map each sample to its consensus-regions BED file. The sample name is the
# third path component, i.e. the per-sample directory under
# pycistopic_screen/consensus_peak_calling/.
consensus_suffix = "__consensus_regions.bed"
filenames = glob.glob('pycistopic_screen/consensus_peak_calling/*/VIB_Hydrop_*__consensus_regions.bed')
samples = [
    path.replace(consensus_suffix, "").split('/')[2]
    for path in filenames
]
# Sort by sample name so the ordering does not depend on glob's arbitrary order.
consensus_peaks_dict = OrderedDict(sorted(dict(zip(samples, filenames)).items()))
consensus_peaks_dict.keys()

odict_keys(['VIB_Hydrop_1', 'VIB_Hydrop_2'])

In [32]:
# Display the full sample -> consensus BED path mapping to check the paths by eye.
consensus_peaks_dict

OrderedDict([('VIB_Hydrop_1',
              'pycistopic_screen/consensus_peak_calling/VIB_Hydrop_1/VIB_Hydrop_1__consensus_regions.bed'),
             ('VIB_Hydrop_2',
              'pycistopic_screen/consensus_peak_calling/VIB_Hydrop_2/VIB_Hydrop_2__consensus_regions.bed')])

## Library QC metrics

In [33]:
from pycisTopic.qc import compute_qc_stats
import pyranges

# Copy the pickled annotation file into the current directory; the next cell
# loads it as 'biomart_annot.pickle' and passes it to compute_qc_stats as
# tss_annotation.
! cp /staging/leuven/stg_00002/lcb/cflerin/analysis/pbmc_atac/analysis2/data_freeze_Jun2021/atac_qc_multiplet_merged/jupyter/biomart_annot.pickle .

In [34]:
import pickle
# Load the annotation table that was copied into the working directory.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
f_biomart = 'biomart_annot.pickle'
with open(f_biomart, mode='rb') as annot_file:
    annot = pickle.load(annot_file)
# Last expression of the cell: show the loaded table.
annot

Unnamed: 0,Chromosome,Start,Strand,Gene,Transcript_type
8986,chrY,9337464,1,TSPY4,protein_coding
8987,chrY,9337510,1,TSPY4,protein_coding
9024,chrY,22490397,1,PRY,protein_coding
9100,chrY,14056227,1,VCY1B,protein_coding
9136,chrY,12662368,1,USP9Y,protein_coding
...,...,...,...,...,...
236024,chr1,36479519,-1,CSF3R,protein_coding
236025,chr1,36471474,-1,CSF3R,protein_coding
236033,chr1,36482051,-1,CSF3R,protein_coding
236034,chr1,36323645,-1,EVA1B,protein_coding


In [35]:
import ray
# Shut down any Ray instance still attached to this kernel before the QC step
# (the log below shows a Ray dashboard being started during this call).
ray.shutdown()
# Compute per-sample QC statistics. Returns two objects that the next cell
# pickles as 'metadata.pickle' and 'profile_data.pickle'; metadata_bc_dict is
# later indexed by sample name when the cisTopic objects are created.
metadata_bc_dict, profile_data_dict = compute_qc_stats(
        fragments_dict=fragments_dict,
        tss_annotation=annot,
        stats=['barcode_rank_plot', 'duplicate_rate', 'insert_size_distribution', 'profile_tss', 'frip'],
        label_list=None,
        # Per-sample consensus peak BED files (dict keyed by sample name).
        path_to_regions=consensus_peaks_dict,
        # The log reports that n_cpu is lowered to the number of samples when it
        # exceeds it (two samples here).
        n_cpu=5,
        valid_bc=None,
        # NOTE(review): the log line "Marking barcodes with more than 100"
        # suggests this is the fragment-count threshold -- confirm against the
        # pycisTopic documentation.
        n_frag=100,
        n_bc=None,
        tss_flank_window=2000,
        tss_window=50,
        tss_minimum_signal_window=100,
        tss_rolling_window=10,
        min_norm=0.1,
        remove_duplicates = True,
        )

2021-10-22 10:48:00,699 cisTopic     INFO     n_cpu is larger than the number of samples. Setting n_cpu to the number of samples


2021-10-22 10:48:02,716	INFO services.py:1263 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(pid=20238)[0m 2021-10-22 10:48:05,608 cisTopic     INFO     Reading VIB_Hydrop_1
[2m[36m(pid=20237)[0m 2021-10-22 10:48:05,601 cisTopic     INFO     Reading VIB_Hydrop_2
[2m[36m(pid=20238)[0m 2021-10-22 10:48:28,689 cisTopic     INFO     Computing barcode rank plot for VIB_Hydrop_1
[2m[36m(pid=20238)[0m 2021-10-22 10:48:28,690 cisTopic     INFO     Counting fragments
[2m[36m(pid=20238)[0m 2021-10-22 10:48:29,884 cisTopic     INFO     Marking barcodes with more than 100
[2m[36m(pid=20238)[0m 2021-10-22 10:48:29,910 cisTopic     INFO     Returning plot data
[2m[36m(pid=20238)[0m 2021-10-22 10:48:29,910 cisTopic     INFO     Returning valid barcodes
[2m[36m(pid=20238)[0m 2021-10-22 10:48:30,701 cisTopic     INFO     Computing duplicate rate plot for VIB_Hydrop_1
[2m[36m(pid=20238)[0m 2021-10-22 10:48:32,025 cisTopic     INFO     Return plot data
[2m[36m(pid=20238)[0m 2021-10-22 10:48:32,099 cisTopic     INFO     Computing insert size distribution for 

In [36]:
# Persist the QC results so later sessions can reload them without recomputing.
f_qc_dir = 'pycistopic_consensus_peaks/qc__consensus_peaks'
qc_dir_path = os.path.join(wdir, f_qc_dir)
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(qc_dir_path, exist_ok=True)

# Per-barcode QC metadata returned by compute_qc_stats.
with open(os.path.join(qc_dir_path, 'metadata.pickle'), 'wb') as f:
    pickle.dump(metadata_bc_dict, f)

# Profile data returned by compute_qc_stats.
with open(os.path.join(qc_dir_path, 'profile_data.pickle'), 'wb') as f:
    pickle.dump(profile_data_dict, f)

## Create the cisTopic objects for each sample

In [37]:
# Valid barcodes: load the pickled selection of barcodes that passed filtering.
# It is indexed by sample name when the cisTopic objects are created below.
# os.path.join is used (as in the other cells) so the path does not depend on
# wdir ending with a trailing slash.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(os.path.join(wdir, 'pycistopic_screen', 'barcodes_passing_filters_filtered.pkl'), 'rb') as f:
    bc_passing_filters = pickle.load(f)

In [38]:
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments

In [39]:
# Alternative region set (disabled): the object-creation cell below passes the
# per-sample consensus peaks from consensus_peaks_dict as path_to_regions instead.
#path_to_regions = '/staging/leuven/stg_00002/lcb/cbravo/SCREEN_ENCODE3/V2.hg38-rDHS-Unfiltered.blacklisted.bed'
# Blacklist BED file, given relative to the working directory set by os.chdir.
path_to_blacklist = 'hg38_regions/hg38-blacklist.v2.bed'

In [43]:
# Output directory (relative to wdir) for the per-sample cisTopic objects.
f_cto_dir = 'pycistopic_consensus_peaks/cistopic_objs__consensus'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(os.path.join(wdir, f_cto_dir), exist_ok=True)

In [57]:
# Create one cisTopic object per sample and pickle it.
# Samples whose output file already exists are skipped, so the cell can be
# re-run after an interruption without redoing finished samples.

for key in fragments_dict.keys():
    f_out = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_metadata.pkl')
    if os.path.isfile(f_out):
        print(f"Skipping {key}")
        continue

    tmp_cto = create_cistopic_object_from_fragments(
        path_to_fragments=fragments_dict[key],
        # Sample-specific consensus peaks.
        path_to_regions=consensus_peaks_dict[key],
        path_to_blacklist=path_to_blacklist,
        # QC metadata for this sample from compute_qc_stats (the log below
        # prints "metrics provided!", so the argument is picked up).
        metrics=metadata_bc_dict[key],
        # Restrict the object to the barcodes that passed filtering.
        valid_bc=bc_passing_filters[key],
        n_cpu=6,
        partition=20,
        project=key,
    )

    # Write to a temporary sibling file and rename it into place. Writing
    # straight to f_out would leave a truncated pickle behind if the dump were
    # interrupted, and the isfile() guard above would then skip that sample on
    # every later run.
    f_tmp = f_out + '.tmp'
    with open(f_tmp, 'wb') as f:
        pickle.dump(tmp_cto, f)
    os.replace(f_tmp, f_out)

    print(f"DONE {key}")


2021-10-22 11:08:15,330 cisTopic     INFO     Reading data for VIB_Hydrop_1
2021-10-22 11:08:38,294 cisTopic     INFO     metrics provided!
2021-10-22 11:08:39,034 cisTopic     INFO     valid_bc provided, selecting barcodes!
2021-10-22 11:08:39,545 cisTopic     INFO     Counting fragments in regions


2021-10-22 11:08:41,674	INFO services.py:1263 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


2021-10-22 11:08:52,620 cisTopic     INFO     Creating fragment matrix
2021-10-22 11:08:58,264 cisTopic     INFO     Converting fragment matrix to sparse matrix
2021-10-22 11:09:00,936 cisTopic     INFO     Removing blacklisted regions
2021-10-22 11:09:01,425 cisTopic     INFO     Creating CistopicObject
2021-10-22 11:09:01,727 cisTopic     INFO     Done!
DONE VIB_Hydrop_1
2021-10-22 11:09:02,240 cisTopic     INFO     Reading data for VIB_Hydrop_2
2021-10-22 11:09:34,356 cisTopic     INFO     metrics provided!
2021-10-22 11:09:35,327 cisTopic     INFO     valid_bc provided, selecting barcodes!
2021-10-22 11:09:35,935 cisTopic     INFO     Counting fragments in regions


2021-10-22 11:09:38,302	INFO services.py:1263 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


2021-10-22 11:09:49,348 cisTopic     INFO     Creating fragment matrix
2021-10-22 11:09:55,302 cisTopic     INFO     Converting fragment matrix to sparse matrix
2021-10-22 11:09:57,067 cisTopic     INFO     Removing blacklisted regions
2021-10-22 11:09:57,565 cisTopic     INFO     Creating CistopicObject
2021-10-22 11:09:57,911 cisTopic     INFO     Done!
DONE VIB_Hydrop_2


# Load all cisTopic objects (CTOs)

In [14]:
# load objects into dict:
# NOTE(review): `f_cto_merged_dir` is not assigned in any cell visible in this
# notebook, so on a fresh kernel this cell raises NameError unless it is defined
# elsewhere -- confirm. The recorded output lists samples (Broad_*, CNAG_*, ...)
# that are keys of bc_passing_filters rather than the two VIB_Hydrop samples
# processed above, and cistopic_obj_dict is rebuilt from fragments_dict in a
# later cell.
cistopic_obj_dict = {}
for key in bc_passing_filters.keys():
    f_out = os.path.join(wdir, f_cto_merged_dir, key + '__cistopic_obj.pkl')
    with open(f_out, 'rb') as f:
        cistopic_obj_dict[key] = pickle.load(f)
    print(f"Loaded {key}")

Loaded Broad_1
Loaded Broad_2
Loaded Broad_mito_1
Loaded Broad_mito_2
Loaded CNAG_1
Loaded CNAG_2
Loaded s3atac
Loaded Sanger_1
Loaded Sanger_2
Loaded Stanford_1
Loaded Stanford_2
Loaded VIB_1
Loaded VIB_2


## Create a set of cisTopic objects for downstream analysis

Include each sample individually (using the sample-specific consensus peaks), plus the merged sample (using the merged consensus peaks).

In [8]:
# Directory (relative to wdir) that holds the per-sample cisTopic objects.
f_cto_dir = 'pycistopic_consensus_peaks/cistopic_objs__consensus'

# Reload the pickled per-sample objects, keyed by sample name.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
cistopic_obj_dict = {}
for key in fragments_dict:
    cto_path = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_metadata.pkl')
    with open(cto_path, mode='rb') as cto_file:
        cistopic_obj_dict[key] = pickle.load(cto_file)
    print(f"Loaded {key}")

Loaded VIB_Hydrop_1
Loaded VIB_Hydrop_2


## Add cell annotations

In [11]:
import pandas as pd
# Read the per-sample cell annotation tables (tab-separated, first column used
# as the index). Paths are relative to the current working directory.
cellannot_dict = {}
for key in fragments_dict:
    cell_data_path = os.path.join('pycistopic_screen', key + '_cell_data.tsv')
    cellannot = pd.read_csv(cell_data_path, sep='\t', index_col=0)
    cellannot_dict[key] = cellannot

In [18]:
# Attach the 'consensus_cell_type' column to each sample's cisTopic object.
# The recorded output reports the column being overwritten, i.e. it was already
# present on the objects when this cell last ran.
for key, annot_df in cellannot_dict.items():
    cto = cistopic_obj_dict[key]
    cto.add_cell_data(annot_df[['consensus_cell_type']])

Columns ['consensus_cell_type'] will be overwritten
Columns ['consensus_cell_type'] will be overwritten


# Save the annotated cisTopic objects

In [79]:
# Directory (relative to wdir) for the annotated per-sample cisTopic objects.
f_cto_dir = 'pycistopic_consensus_peaks/cistopic_objs__consensus'

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(os.path.join(wdir, f_cto_dir), exist_ok=True)

for key in cistopic_obj_dict.keys():
    f_out = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_metadata_annotated.pkl')
    # Never overwrite an existing annotated object; delete the file to regenerate it.
    if os.path.isfile(f_out):
        print(f"Skipping {key}, already exists")
        continue
    # Write to a temporary sibling file and rename it into place, so an
    # interrupted dump cannot leave a truncated pickle that the guard above
    # would then skip on every later run.
    f_tmp = f_out + '.tmp'
    with open(f_tmp, 'wb') as f:
        pickle.dump(cistopic_obj_dict[key], f)
    os.replace(f_tmp, f_out)
    print(f"Generated and saved cistopic object for {key}")

Generated and saved cistopic object for VIB_Hydrop_1
Generated and saved cistopic object for VIB_Hydrop_2
