# pycisTopic analysis

Full dataset, using SCREEN regions.

In [1]:
import pycisTopic
pycisTopic.__version__

'0.1.dev300+g7494158'

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
import pickle
import pandas as pd

In [4]:
import os
wdir = '/lustre1/project/stg_00002/lcb/fderop/data/20211008_hca_benchmark_rerun_all_with_hydrop'
os.chdir( wdir )

In [5]:
import glob
from collections import OrderedDict

In [6]:
filenames = glob.glob('cistopic_objs__screen/*merged*filtered.pkl')
samples = [item.replace("__cistopic_obj_filtered.pkl", "") for item in filenames]
samples = [item.replace("cistopic_objs__screen/", "") for item in samples]
files_dict = {samples[i]: filenames[i] for i in range(len(samples))}
files_dict = OrderedDict(sorted(files_dict.items()))
files_dict.keys()

odict_keys(['fulldata_merged'])

Create a dictionary with fragments files for each sample

# Run models for quick visualize/clustering

Read filtered cistopic objs

In [7]:
f_cto_dir = 'cistopic_objs__screen'

cistopic_obj_dict = {}
for key in files_dict.keys():
    f_cto = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_filtered.pkl')
    if(os.path.isfile(f_cto)):
        with open(f_cto, 'rb') as f:
            cistopic_obj_dict[key] = pickle.load(f)
        print(f"Loaded filtered cistopic object {key}")
    else:
        print(f"file {f_cto} doesn't exist")

Loaded filtered cistopic object fulldata_merged


In [8]:
from pycisTopic.lda_models import run_cgs_models_mallet

#### mallet models

In [9]:
# set the memory limit for mallet (1GB default is not enough)
os.environ['MALLET_MEMORY'] = '100G'

In [10]:
f_mod_dir = 'models__screen__mallet'
if not os.path.exists(os.path.join(wdir, f_mod_dir)):
    os.makedirs(os.path.join(wdir, f_mod_dir))

n_topics = [50]
n_iter = 400

In [11]:
# mallet implementation
models_dict = {}
for key in cistopic_obj_dict.keys():
    f_mod = os.path.join(wdir, f_mod_dir, key + '__models_' + str(n_iter) + '_iter.pkl')
    if os.path.isfile(f_mod):
        print(f"Loading {f_mod}")
        with open(f_mod, 'rb') as f:
            models_dict[key] = pickle.load(f)
    else:
        print(f"Running {key}")
        model = run_cgs_models_mallet(
                            'mallet',
                            cistopic_obj_dict[key],
                            n_topics=n_topics,
                            n_cpu=32,
                            n_iter=n_iter,
                            random_state=555,
                            alpha=50,
                            alpha_by_topic=True,
                            eta=0.1,
                            eta_by_topic=False,
                            tmp_path=os.path.join('/tmp', key + '_mallet2')
                            )
        # Save
        with open(f_mod, 'wb') as f:
            pickle.dump(model, f)
        print(f"Finished {key}")
        models_dict[key] = model

Running fulldata_merged
2021-10-11 13:56:18,938 cisTopic     INFO     Formatting input to corpus
2021-10-11 13:56:31,112 gensim.corpora.dictionary INFO     adding document #0 to Dictionary(0 unique tokens: [])
2021-10-11 13:57:45,537 gensim.corpora.dictionary INFO     adding document #10000 to Dictionary(0 unique tokens: [])
2021-10-11 13:59:22,333 gensim.corpora.dictionary INFO     adding document #20000 to Dictionary(0 unique tokens: [])
2021-10-11 14:02:02,753 gensim.corpora.dictionary INFO     adding document #30000 to Dictionary(0 unique tokens: [])
2021-10-11 14:03:05,360 gensim.corpora.dictionary INFO     adding document #40000 to Dictionary(0 unique tokens: [])
2021-10-11 14:05:14,692 gensim.corpora.dictionary INFO     built Dictionary(2208284 unique tokens: ['0', '1', '2', '3', '4']...) from 45357 documents (total 573904247 corpus positions)
2021-10-11 14:05:14,695 cisTopic     INFO     Running model with 50 topics
2021-10-11 14:05:14,762 LDAMalletWrapper INFO     Serializing 

### Read in LDA/mallett models

In [12]:
from pycisTopic.lda_models import evaluate_models
import matplotlib.pyplot as plt

In [13]:
f_mod_dir = 'models__screen__mallet'

models_dict = {}
for key in cistopic_obj_dict.keys():
    f_mod = os.path.join(wdir, f_mod_dir, key + '__models_' + str(n_iter) + '_iter.pkl')
    if(os.path.isfile(f_mod)):
        with open(f_mod, 'rb') as f:
            models_dict[key] = pickle.load(f)
        print(f"Loaded {key}")

Loaded fulldata_merged


In [14]:
for key in models_dict.keys():
    model=evaluate_models(models_dict[key],
                         select_model=50, 
                         return_model=True, 
                         metrics=['Arun_2010','Cao_Juan_2009', 'Minmo_2011', 'loglikelihood'],
                         plot=False, # disabled since we only test one model here
                         plot_metrics=False
                         )
    cistopic_obj_dict[key].add_LDA_model(model)

In [15]:
from pycisTopic.clust_vis import find_clusters, run_umap, run_tsne, plot_metadata, plot_topic

# dimensionality reduction

In [16]:
for key in cistopic_obj_dict.keys():
    find_clusters(cistopic_obj_dict[key],
              target  = 'cell',
              k = 10,
              res = [0.6, 0.8],
              prefix = 'pycisTopic_')
    cistopic_obj_dict[key].projections['cell'] = {}
    run_umap(cistopic_obj_dict[key], target = 'cell')
    #run_tsne(cistopic_obj_dict[key], target = 'cell')

2021-10-11 19:33:05,780 cisTopic     INFO     Finding neighbours
2021-10-11 19:33:44,041 cisTopic     INFO     Running UMAP


<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [17]:
for key in cistopic_obj_dict.keys():
    f_out = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_filtered_models.pkl')
    if os.path.isfile(f_out):
        print(f"Skipping {key}, already exists")
        continue
    with open(f_out, 'wb') as f:
        pickle.dump(tmp_cto, f)
    
    print(f"Generated and saved filtered cistopic object for {key}")

NameError: name 'tmp_cto' is not defined

# here, perform seurat predictions
and save

### Read in cell type predictions

In [None]:
f_pred_dir = '/staging/leuven/stg_00002/lcb/cflerin/analysis/pbmc_atac/analysis2/data_freeze_Jun2021/cell_type_classification/predictions__screen/'

ct_pred = {}
for key in bc_passing_filters.keys():
    ct_pred[key] = pd.read_csv(os.pathh.join(f_pred_dir, key + '__cell_type_seurat.txt'),sep='\t')
    # format to add to cistopic object:
    ct_annot = ct_pred[key][['composite_sample_id','cell_type','cell_type_pred_score']].copy().set_index('composite_sample_id')
    ct_annot.columns = ['seurat_cell_type','seurat_cell_type_pred_score']
    cistopic_obj_dict[key].add_cell_data(ct_annot)

## Visualizations

In [None]:
for key in cistopic_obj_dict.keys():
    print(key)
    plot_metadata(cistopic_obj_dict[key],
                  reduction_name='UMAP',
                  variables=['Unique_nr_frag', 'TSS_enrichment', 'Dupl_rate', 'FRIP',
                             'Doublet_scores_fragments', 'fmx_sample', 'pycisTopic_leiden_10_0.6', 'seurat_cell_type'],
                  target='cell',
                  num_columns=4,
                  text_size=16,
                  dot_size=15,
                  figsize=(18,9),
                 )

## Batch effect correction with Harmony

In [None]:
from pycisTopic.clust_vis import harmony

In [None]:
harmony(cistopic_obj_dict['merged'],
        'sample_id',
        random_state=555,
        max_iter_harmony=20
       )

In [None]:
run_umap(cistopic_obj_dict['merged'], target = 'cell', reduction_name='UMAP_harmony', harmony=True)
#run_tsne(cistopic_obj_dict['merged'], target = 'cell', reduction_name='tSNE_harmony', harmony=True)

In [None]:
plot_metadata(cistopic_obj_dict['merged'],
              reduction_name='UMAP_harmony',
              variables=['Unique_nr_frag', 'TSS_enrichment', 'Dupl_rate', 'FRIP',
                             'Doublet_scores_fragments', 'fmx_sample', 'pycisTopic_leiden_10_0.6', 'seurat_cell_type', 'sample_id'],
              target='cell',
              num_columns=3,
              text_size=16,
              dot_size=10,
              figsize=(23,16),
             )

In [None]:
print("UMAP without batch effect correction")
plot_metadata(cistopic_obj_dict['merged'],
              reduction_name='UMAP',
              variables=['sample_id'],
              target='cell',
              show_legend=True,
              show_label=False,
              num_columns=1,
              text_size=16,
              dot_size=10,
              figsize=(10,10),
             )

In [None]:
print("UMAP with batch effect correction")
plot_metadata(cistopic_obj_dict['merged'],
              reduction_name='UMAP_harmony',
              variables=['sample_id'],
              target='cell',
              show_legend=True,
              show_label=False,
              num_columns=1,
              text_size=16,
              dot_size=10,
              figsize=(10,10),
             )

# Cluster-cell type consensus

Here we use the label transfer from Seurat to generate a consensus cell type for each cluster identified in each sample.

In [None]:
clustering='pycisTopic_leiden_10_0.8'
ct_pred_thr=0.7

clust_consensus = {}
for key in cistopic_obj_dict.keys():
    if key=='merged':
        continue
    celldata = cistopic_obj_dict[key].cell_data

    major_cell_type = []
    frac_of_cluster = []
    next_celltype = []
    next_frac_of_cluster = []
    diff_to_next = []

    for c in celldata[clustering].unique():
        c1 = celldata[celldata[clustering]==c]
        # find proportions of each unique cell type detected in this cluster:
        ctp = pd.DataFrame([
            c1['seurat_cell_type'].unique(),
            [ c1[c1['seurat_cell_type']==x].shape[0] / c1.shape[0] for x in c1['seurat_cell_type'].unique() ]
        ]).T
        # sort/rank by proportion:
        ctps = ctp.sort_values(1,ascending=False)
        if ctp.shape[0]>1:
            n = ctps.iloc[1,0] # next cell type detected
            dtn = ctps.iloc[0,1] - ctps.iloc[1,1] # distance to next
            dtnf = ctps.iloc[1,1]
        else:
            dtn = None
            dtnf = None
            n = None
        major_cell_type.append(ctps.iloc[0,0])
        frac_of_cluster.append(ctps.iloc[0,1])
        next_celltype.append(n)
        next_frac_of_cluster.append(dtnf)
        diff_to_next.append(dtn)
        
    # collect results for this sample
    res = pd.DataFrame([
            celldata[clustering].unique(),
            major_cell_type,
            frac_of_cluster,
            #
            next_celltype,
            next_frac_of_cluster,
            diff_to_next,
        ]).T.set_axis(
            ['cluster', 'major_cell_type', 'frac_of_cluster', 
             'next_celltype', 'next_frac_of_cluster', 'diff_to_next'],
            axis=1
        ).set_index(key + '__' + celldata[clustering].unique())
    clust_consensus[key] = res
    print(key)
    # print only rows where there is another cell type within 20%:
    if(sum(res['diff_to_next']<0.20)>0):
        display(res[ res['diff_to_next']<0.20 ])
        plot_metadata(cistopic_obj_dict[key],
                      reduction_name='UMAP',
                      variables=[clustering, 'seurat_cell_type'],
                      target='cell',
                      num_columns=2,
                      text_size=16,
                      dot_size=15,
                      figsize=(12,6),
                     )
    else:
        print("No discrepant clusters")

In [None]:
#clust_consensus = pd.concat(clust_consensus, axis=0)
clust_consensus['CNAG_1']

### Add cluster-based cell identities to the cisTopic object

In [None]:
for key in cistopic_obj_dict.keys():
    if key=='merged':
        continue
    cistopic_obj_dict[key].cell_data['consensus_cell_type'] = ""
    # add each cluster identity:
    for i,r in clust_consensus[key].iterrows():
        ix = cistopic_obj_dict[key].cell_data[clustering] == r['cluster']
        cistopic_obj_dict[key].cell_data.loc[ix,'consensus_cell_type'] = r['major_cell_type']

In [None]:
# add to the merged object (use a concatenation of all objects' cell_data):
cistopic_obj_dict['merged'].add_cell_data(
    pd.concat(
        [ cistopic_obj_dict[key].cell_data[['consensus_cell_type']] 
             for key in cistopic_obj_dict.keys() if key not in 'merged' ],
        axis=0
    )
)

## Save/load cisTopic objects

In [None]:
f_out = os.path.join(wdir, f_cto_dir, 'fulldata_all_filtered_annotated__cistopic_obj.pkl')
if os.path.isfile(f_out):
    print(f"Loading {f_out}")
    with open(f_out, 'rb') as f:
        cistopic_obj_dict = pickle.load(f)
else:
    with open(f_out, 'wb') as f:
        pickle.dump(cistopic_obj_dict, f)

In [None]:
# export filtered set of barcodes (without doublets, etc)
bc_passing_filters2 = {}
for key in cistopic_obj_dict.keys():
    if(key == 'merged'):
        continue
    bc_passing_filters2[key] = cistopic_obj_dict[key].cell_data['barcode'].tolist()

with open(os.path.join(wdir,'barcodes_passing_filters2.pkl'), 'wb') as f:
    pickle.dump(bc_passing_filters2, f)

In [None]:
cistopic_obj_dict['merged'].cell_data.to_csv(
    os.path.join(wdir,'fulldata_filtered_cell_data.tsv'),
    sep='\t'
)

## Export pseudobulk profiles

In [None]:
from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling
import pyranges as pr
import requests

In [None]:
# get chromosome sizes (hg38)
target_url = 'http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes'
chromsizes = pd.read_csv(target_url, sep='\t', header=None)
chromsizes.columns = ['Chromosome', 'End']
chromsizes['Start'] = [0]*chromsizes.shape[0]
chromsizes = chromsizes.loc[:,['Chromosome', 'Start', 'End']]
chromsizes = pr.PyRanges(chromsizes)
chromsizes

In [None]:
f_pseudo_dir = 'consensus_peak_calling'
if not os.path.exists(os.path.join(wdir, f_pseudo_dir)):
    os.makedirs(os.path.join(wdir, f_pseudo_dir))

In [None]:
bw_paths_dict = {}
bed_paths_dict = {}
for key in cistopic_obj_dict.keys():
    print(f"Starting {key}")
    f_dir = os.path.join(wdir, f_pseudo_dir, key)
    if not os.path.exists(f_dir):
        os.makedirs(f_dir)
    
    bw_paths, bed_paths = export_pseudobulk(
                #input_data = cistopic_obj_dict['merged'].cell_data,
                input_data = cistopic_obj_dict[key],
                variable = 'consensus_cell_type',
                sample_id_col = 'sample_id',
                chromsizes = chromsizes,
                bed_path = os.path.join(f_dir, 'pseudobulk_bed_files'),
                bigwig_path =  os.path.join(f_dir, 'pseudobulk_bw_files'),
                #path_to_fragments = fr_dict,
                n_cpu = 5,
                normalize_bigwig = True,
                remove_duplicates = True,
                )
    bw_paths_dict[key] = bw_paths.copy()
    bed_paths_dict[key] = bed_paths.copy()
    if ray.is_initialized():
        print("Shutting down Ray")
        ray.shutdown()

### Infer consensus peaks

In [None]:
#Infer consensus peaks

narrow_peaks_dict = {}

for key in cistopic_obj_dict.keys():
    print(f"Starting {key}")
    f_dir = os.path.join(wdir, f_pseudo_dir, key, 'macs2')
    if not os.path.exists(f_dir):
        os.makedirs(f_dir)

    # Run peak calling
    narrow_peaks_dict[key] = peak_calling('macs2',
                                     bed_paths_dict[key],
                                     f_dir,
                                     genome_size='hs',
                                     n_cpu=5,
                                     input_format='BEDPE',
                                     shift=73, 
                                     ext_size=146,
                                     keep_dup = 'all',
                                     q_value = 0.05,
                                     )

### Derive the consensus peaks

In [None]:
from pycisTopic.iterative_peak_calling import get_consensus_peaks

In [None]:
peak_half_width = 250

# Get consensus peaks
consensus_peaks_dict = {}
for key in cistopic_obj_dict.keys():
    print(f"Starting {key}")
    f_dir = os.path.join(wdir, f_pseudo_dir, key)
    if not os.path.exists(f_dir):
        os.makedirs(f_dir)
        
    consensus_peaks_dict[key] = get_consensus_peaks(
        narrow_peaks_dict[key],
        peak_half_width,
        chromsizes = chromsizes,
        path_to_blacklist = path_to_blacklist
    )
    consensus_peaks_dict[key].to_bed(
        path=os.path.join(f_dir, key + '__consensus_regions.bed'),
        keep=True,
        compression='infer',
        chain=False
    )