# pycisTopic analysis

Full dataset, using consensus peak regions.

In [58]:
# Show which pycisTopic build is installed in the kernel running this notebook.
import pycisTopic
pycisTopic.__version__

'0.1.dev300+g7494158'

In [59]:
import warnings
# Silence Python warnings for the rest of the session.
# NOTE(review): both calls install a catch-all 'ignore' filter, so the second looks
# redundant; a blanket filter also hides pandas chained-assignment warnings — confirm this is intended.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [60]:
import pickle
import pandas as pd

In [61]:
import os
# Project directory; it becomes the working directory, and later cells also build
# their paths explicitly from `wdir`.
# NOTE(review): hard-coded absolute cluster path — the notebook is only runnable where this path exists.
wdir = '/lustre1/project/stg_00002/lcb/fderop/data/20211024_hca_benchmark_libds_merged/'
os.chdir( wdir )

In [62]:
# create output directory:
# exist_ok=True makes the call a no-op when the directory is already present and
# avoids the gap between a separate existence check and the creation.
f_final_dir = os.path.join(wdir, 'downstream_analysis')
os.makedirs(f_final_dir, exist_ok=True)

In [63]:
import copy

## Load the cisTopic objects

In [64]:
# Directory (joined onto wdir) that holds the pickled cisTopic objects.
f_cto_dir = 'pycistopic_consensus_peaks/cistopic_objs__mergedconsensus/'

# dataset key -> unpickled cisTopic object; a key whose file is missing is reported and left out.
cistopic_obj_dict = {}
for key in ['libds_merged']:
    f_cto = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_mergedconsensus_metadata_annotated_models.pkl')
    if not os.path.isfile(f_cto):
        print(f"file {f_cto} doesn't exist")
        continue
    with open(f_cto, 'rb') as f:
        cistopic_obj_dict[key] = pickle.load(f)
    print(f"Loaded filtered cistopic object {key}")

Loaded filtered cistopic object libds_merged


In [65]:
from collections import OrderedDict

In [66]:
# List the distinct sample ids in the cell metadata; each one needs an entry in alias_dict below.
cistopic_obj_dict['libds_merged'].cell_data['sample_id'].unique()

array(['Broad_1', 'Broad_2', 'Broad_mito_1', 'Broad_mito_2', 'CNAG_1',
       'CNAG_2', 'Sanger_1', 'Sanger_2', 'Stanford_1', 'Stanford_2',
       'VIB_1', 'VIB_2', 'VIB_Hydrop_1', 'VIB_Hydrop_2', 's3atac'],
      dtype=object)

In [67]:
# Display name for each sample id.
# NOTE: the samples will be displayed in the order of this dict!
alias_dict = OrderedDict([
    ("Broad_1", "BioRad ATAC 1"),
    ("Broad_2", "BioRad ATAC 2"),
    ("Stanford_1", "10x ATAC A1"),
    ("Stanford_2", "10x ATAC A2"),
    ("VIB_1", "10x ATAC B1"),
    ("VIB_2", "10x ATAC B2"),
    ("CNAG_1", "10x ATAC C1"),
    ("CNAG_2", "10x ATAC C2"),
    ("Broad_mito_1", "10x mtATAC 1"),
    ("Broad_mito_2", "10x mtATAC 2"),
    ("Sanger_1", "10x Multiome 1"),
    ("Sanger_2", "10x Multiome 2"),
    ("VIB_Hydrop_1", "Hydrop ATAC 1"),
    ("VIB_Hydrop_2", "Hydrop ATAC 2"),
    ("s3atac", "s3 ATAC"),
    ("merged", "Merged"),
])

In [68]:
# Add an 'alias' column holding each cell's display name, looked up from its sample_id.
# A sample_id without an alias_dict entry raises KeyError.
_cell_data = cistopic_obj_dict['libds_merged'].cell_data
_cell_data['alias'] = list(map(alias_dict.__getitem__, _cell_data['sample_id']))

In [69]:
# Display the distinct sample ids again, now that the 'alias' column has been added.
cistopic_obj_dict['libds_merged'].cell_data['sample_id'].unique()

array(['Broad_1', 'Broad_2', 'Broad_mito_1', 'Broad_mito_2', 'CNAG_1',
       'CNAG_2', 'Sanger_1', 'Sanger_2', 'Stanford_1', 'Stanford_2',
       'VIB_1', 'VIB_2', 'VIB_Hydrop_1', 'VIB_Hydrop_2', 's3atac'],
      dtype=object)

In [70]:
# Print the display name of every sample id found in the cell metadata, one per line.
_sample_ids = cistopic_obj_dict['libds_merged'].cell_data['sample_id'].unique()
for key in _sample_ids:
    print(alias_dict[key])

BioRad ATAC 1
BioRad ATAC 2
10x mtATAC 1
10x mtATAC 2
10x ATAC C1
10x ATAC C2
10x Multiome 1
10x Multiome 2
10x ATAC A1
10x ATAC A2
10x ATAC B1
10x ATAC B2
Hydrop ATAC 1
Hydrop ATAC 2
s3 ATAC


#### Save/load

In [71]:
# Load the cached per-dataset region/topic binarization (passed later as
# `binarized_topic_region` to the region-accessibility loom export).
f_out = os.path.join(f_final_dir, 'region_bin_topics.pkl')
if os.path.isfile(f_out):
    print(f"Loading {f_out}")
    with open(f_out, 'rb') as f:
        region_bin_topics_dict = pickle.load(f)
else:
    # Report the missing cache here; otherwise region_bin_topics_dict is silently
    # left undefined and the failure only shows up as a NameError in a later cell.
    print(f"file {f_out} doesn't exist")

Loading /lustre1/project/stg_00002/lcb/fderop/data/20211024_hca_benchmark_libds_merged/downstream_analysis/region_bin_topics.pkl


In [72]:
# Load the cached per-dataset cell/topic binarization (passed later as
# `binarized_cell_topic` to the region-accessibility loom export).
f_out = os.path.join(f_final_dir, 'binarized_cell_topic.pkl')
if os.path.isfile(f_out):
    print(f"Loading {f_out}")
    with open(f_out, 'rb') as f:
        binarized_cell_topic_dict = pickle.load(f)
else:
    # Report the missing cache here; otherwise binarized_cell_topic_dict is silently
    # left undefined and the failure only shows up as a NameError in a later cell.
    print(f"file {f_out} doesn't exist")

Loading /lustre1/project/stg_00002/lcb/fderop/data/20211024_hca_benchmark_libds_merged/downstream_analysis/binarized_cell_topic.pkl


In [73]:
# Load the cached per-dataset imputed accessibility objects (input to the
# gene-activity inference and to the region-accessibility loom export).
f_out = os.path.join(f_final_dir, 'imputed_acc_obj.pkl')
if os.path.isfile(f_out):
    print(f"Loading {f_out}")
    with open(f_out, 'rb') as f:
        imputed_acc_obj_dict = pickle.load(f)
else:
    # Report the missing cache here; otherwise imputed_acc_obj_dict is silently
    # left undefined and the failure only shows up as a NameError in a later cell.
    print(f"file {f_out} doesn't exist")

Loading /lustre1/project/stg_00002/lcb/fderop/data/20211024_hca_benchmark_libds_merged/downstream_analysis/imputed_acc_obj.pkl


In [74]:
# Load the cached per-dataset marker tables (passed later as `cluster_markers`
# for 'consensus_cell_type' in both loom exports).
f_out = os.path.join(f_final_dir, 'markers_dict.pkl')
if os.path.isfile(f_out):
    print(f"Loading {f_out}")
    with open(f_out, 'rb') as f:
        markers_dict_dict = pickle.load(f)
else:
    # Report the missing cache here; otherwise markers_dict_dict is silently
    # left undefined and the failure only shows up as a NameError in a later cell.
    print(f"file {f_out} doesn't exist")

Loading /lustre1/project/stg_00002/lcb/fderop/data/20211024_hca_benchmark_libds_merged/downstream_analysis/markers_dict.pkl


## Gene activity

In [75]:
import pyranges as pr
import requests
import pybiomart as pbm

### Infer gene activity

In [76]:
# Gene activity per dataset: load the cached pickle when present, otherwise
# compute it from the imputed accessibility objects and cache the result.
f_out = os.path.join(f_final_dir, 'gene_act_dict.pkl')
if os.path.isfile(f_out):
    print(f"Loading {f_out}")
    with open(f_out, 'rb') as f:
        gene_act_dict = pickle.load(f)
else:
    from pycisTopic.gene_activity import get_gene_activity
    # NOTE(review): find_diff_features is not used in this cell; the import is kept
    # in case a later cell relies on the name.
    from pycisTopic.diff_features import find_diff_features
    # For human
    dataset = pbm.Dataset(name='hsapiens_gene_ensembl',  host='http://www.ensembl.org')
    annot = dataset.query(attributes=['chromosome_name', 'start_position', 'end_position', 'strand', 'external_gene_name', 'transcription_start_site', 'transcript_biotype'])
    annot['Chromosome/scaffold name'] = 'chr' + annot['Chromosome/scaffold name'].astype(str)
    annot.columns=['Chromosome', 'Start', 'End', 'Strand', 'Gene','Transcription_Start_Site', 'Transcript_type']
    # Take an explicit copy of the filtered rows so the strand recoding below
    # writes to this frame and not to a temporary slice.
    annot = annot[annot.Transcript_type == 'protein_coding'].copy()
    # Recode the numeric strand (1 / -1) as '+' / '-'. Assigning the whole column
    # replaces the earlier chained `annot.Strand[mask] = ...`, which is not
    # guaranteed to modify `annot` (and never does under pandas copy-on-write).
    annot['Strand'] = annot['Strand'].replace({1: '+', -1: '-'})
    pr_annotation = pr.PyRanges(annot.dropna(axis = 0))

    # get chromosome sizes (hg38)
    target_url = 'http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes'
    chromsizes = pd.read_csv(target_url, sep='\t', header=None)
    chromsizes.columns = ['Chromosome', 'End']
    chromsizes['Start'] = 0  # every chromosome interval starts at 0
    chromsizes = chromsizes.loc[:,['Chromosome', 'Start', 'End']]
    chromsizes = pr.PyRanges(chromsizes)

    gene_act_dict = {}
    for key in cistopic_obj_dict.keys():
        print(key)
        gene_act, weights = get_gene_activity(
            imputed_acc_obj_dict[key], # Region-cell probabilities
            pr_annotation, # Gene annotation
            chromsizes, # Chromosome size
            use_gene_boundaries=True, # Whether to use the whole search space or stop when encountering another gene
            upstream=[1000, 100000], # Search space upstream. The minimum means that even if there is a gene right next to it
                                     # these bp will be taken (1kbp here)
            downstream=[1000,100000], # Search space downstream
            distance_weight=True, # Whether to add a distance weight (an exponential function, the weight will decrease with distance)
            decay_rate=1, # Exponent for the distance exponential function (the higher the faster will be the decrease)
            extend_gene_body_upstream=10000, # Number of bp upstream immune to the distance weight (their value will be maximum for
                                             # this weight)
            extend_gene_body_downstream=500, # Number of bp downstream immune to the distance weight
            gene_size_weight=False, # Whether to add a weight based on the length of the gene
            gene_size_scale_factor='median', # Dividend to calculate the gene size weight. Default is the median value of all genes
                                             # in the genome
            remove_promoters=False, # Whether to remove promoters when computing gene activity scores
            average_scores=True, # Whether to divide by the total number of regions assigned to a gene when calculating the gene
                                 # activity score
            scale_factor=1, # Value to multiply for the final gene activity matrix
            extend_tss=[10,10], # Space to consider a promoter
            gini_weight = True, # Whether to add a gini index weight. The more unique the region is, the higher this weight will be
            return_weights= True, # Whether to return the final weights
            project='Gene_activity') # Project name for the gene activity object
        # Only the gene activity object is kept; the returned weights are not stored.
        gene_act_dict[key] = copy.copy(gene_act)

    with open(f_out, 'wb') as f:
        pickle.dump(gene_act_dict, f)
    print(f'saved {f_out}')

Loading /lustre1/project/stg_00002/lcb/fderop/data/20211024_hca_benchmark_libds_merged/downstream_analysis/gene_act_dict.pkl


In [81]:
# Inspect the projections stored on the merged cisTopic object.
cistopic_obj_dict['libds_merged'].projections

{'cell': {'UMAP':                                                        UMAP_1     UMAP_2
  CCTCCTTCTTCATCCAGAGAG_TAAGAGGGTGGCGCCTTGCGA_CGG...   5.778868  11.895809
  CCTTAGGACGAGAATTATCAT_CCGCGATACCTACCAGATAGG-Bro...  -0.684854  19.437815
  ATGAATAGTGCATTGCAGTGT-Broad_1                        2.187975  14.684908
  TGTTTAGATAGGCATAAGGTA-Broad_1                        6.169397  11.672524
  TTACAGAGGTGTTTCCAAGCT_GGACGACAGTTTCTCTAGAGC-Bro...   5.618959  11.884871
  ...                                                       ...        ...
  GGTTAGTTGGTATTGCAGCTCGGACAAC-s3atac                -10.563517   3.115543
  GATTCGGTCAGTTCTCTCCTTGACGAAT-s3atac                -10.507968   3.265422
  TGCGGCCTGGTCTCATTGCCCGGAACTG-s3atac                 -7.065394  -2.570484
  GAAGAGTATTTCTCCTCCTGGTGTCGGA-s3atac                -10.610051   2.621628
  TGCGGCCTGGAATGATGCTCATTGTGAA-s3atac                 -7.511748  -3.023273
  
  [45235 rows x 2 columns],
  'UMAP_harmony':                                    

In [82]:
# Import in this cell so it also runs on a fresh kernel: the notebook's loom
# import cell sits further down, so a top-to-bottom run would otherwise raise NameError here.
from pycisTopic.loom import export_gene_activity_to_loom
help(export_gene_activity_to_loom)

Help on function export_gene_activity_to_loom in module pycisTopic.loom:

export_gene_activity_to_loom(gene_activity_matrix: Union[ForwardRef('CistopicImputedFeatures'), pandas.core.frame.DataFrame], cistopic_obj: 'CistopicObject', regulons: List[pyscenic.genesig.Regulon], out_fname: str, selected_genes: Union[List[str], NoneType] = None, selected_cells: Union[List[str], NoneType] = None, auc_mtx: Union[pandas.core.frame.DataFrame, NoneType] = None, auc_thresholds: Union[pandas.core.frame.DataFrame, NoneType] = None, cluster_annotation: List[str] = None, cluster_markers: Dict[str, Dict[str, pandas.core.frame.DataFrame]] = None, tree_structure: Sequence[str] = (), title: str = None, nomenclature: str = 'Unknown', **kwargs)
    Create SCope [Davie et al, 2018] compatible loom files for gene activity exploration
    
    Parameters
    ---------
    gene_activity_matrix: class::CistopicImputedFeatures or class::pd.DataFrame
        A cisTopic imputed features object containing imputed gen

## Export to loom

In [22]:
# Output directories for the two loom exports. exist_ok=True makes each call a
# no-op when the directory is already present and avoids a separate existence check.
f_region_loom_dir = os.path.join(f_final_dir, 'region_acc_loom')
os.makedirs(f_region_loom_dir, exist_ok=True)

f_gene_loom_dir = os.path.join(f_final_dir, 'gene_act_loom')
os.makedirs(f_gene_loom_dir, exist_ok=True)

In [23]:
from pycisTopic.loom import (
    export_region_accessibility_to_loom,
    export_gene_activity_to_loom
)

### Gene activity

In [24]:
# List the cell metadata columns; the exports below pick their cluster_annotation columns from these.
cistopic_obj_dict['libds_merged'].cell_data.columns

Index(['cisTopic_nr_frag', 'cisTopic_log_nr_frag', 'cisTopic_nr_acc',
       'cisTopic_log_nr_acc', 'sample_id', 'Log_total_nr_frag',
       'Log_unique_nr_frag', 'Total_nr_frag', 'Unique_nr_frag', 'Dupl_nr_frag',
       'Dupl_rate', 'Total_nr_frag_in_regions', 'Unique_nr_frag_in_regions',
       'FRIP', 'TSS_enrichment', 'barcode', 'seurat_cell_type',
       'consensus_cell_type', 'pycisTopic_leiden_10_0.6',
       'pycisTopic_leiden_10_0.8', 'pycisTopic_leiden_10_1.0',
       'pycisTopic_leiden_10_1.2', 'fmx_sample', 'alias'],
      dtype='object')

In [25]:
from ctxcore.genesig import Regulon

# generate a dummy regulon (required for export_gene_activity_to_loom):
# the `regulons` parameter of that function has no default, so a single
# placeholder entry is passed in the gene-activity export.
phreg = Regulon(
        name='placeholder regulon',
        gene2weight={'phreg': 1.0},
        transcription_factor="phreg",
        gene2occurrence={"phreg": 1},
    )

In [26]:
# Write one gene-activity loom per dataset, leaving existing files untouched.
for key in cistopic_obj_dict.keys():
    print(key)
    # File-name prefix: the merged dataset gets a fixed label, any other key its display alias.
    s = alias_dict[key] if key != 'libds_merged' else 'Merged'
    f_out = os.path.join(f_gene_loom_dir, s + '__libDS_gene_activity.loom')
    if os.path.exists(f_out):
        print(f"Skipping {f_out}: already exists.")
    else:
        # selected_cells is not passed, so the function's default (None) applies.
        export_gene_activity_to_loom(
            gene_activity_matrix = gene_act_dict[key],
            cistopic_obj = cistopic_obj_dict[key],
            regulons = [phreg],  # placeholder; the parameter is required
            out_fname = f_out,
            cluster_annotation = ['consensus_cell_type', 'alias', 'fmx_sample'],
            cluster_markers = {'consensus_cell_type': markers_dict_dict[key]},
            tree_structure = ('scATAC-seq_Benchmark', 'ATAC_library_downsampled', 'Gene_activity'),
            title = 'Gene activity from library-downsampled and sample-merged dataset',
            nomenclature = "hg38"
        )

libds_merged
2021-11-24 20:06:44,272 cisTopic     INFO     Creating minimal loom
2021-11-24 20:14:23,840 cisTopic     INFO     Adding annotations
2021-11-24 20:15:03,265 cisTopic     INFO     Adding clusterings
2021-11-24 20:15:04,569 cisTopic     INFO     Adding markers


--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/logging/__init__.py", line 1081, in emit
    msg = self.format(record)
  File "/usr/local/lib/python3.8/logging/__init__.py", line 925, in format
    return fmt.format(record)
  File "/usr/local/lib/python3.8/logging/__init__.py", line 664, in format
    record.message = record.getMessage()
  File "/usr/local/lib/python3.8/logging/__init__.py", line 369, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/opt/venv/bin/ipython", line 8, in <module>
    sys.exit(start_ipython())
  File "/opt/venv/lib/python3.8/site-packages/IPython/__init__.py", line 126, in start_ipython
    return launch_new_instance(argv=argv, **kwargs)
  File "/opt/venv/lib/python3.8/site-packages/traitlets/config/application.py", line 846, in launch_instance
    app.start()
  File "/opt/venv/lib/python3.8/site-packages/IPython/terminal/ipapp.py", line 352,

2021-11-24 20:15:04,890 cisTopic     INFO     Exporting


### Region accessibility

In [27]:
# Write one region-accessibility loom per dataset, leaving existing files untouched.
for key in cistopic_obj_dict.keys():
    print(key)
    s = 'merged' if key=='libds_merged' else key
    f_out = os.path.join(f_region_loom_dir, s + '__libDS_region_accessibility.loom')
    if os.path.exists(f_out):
        print(f"Skipping {f_out}: already exists.")
        continue
    # Subset regions, we will use only regions in topics and DARs here to make it faster.
    # The unique region names are collected with set comprehensions; the previous
    # `sum(list_of_lists, [])` re-copied the growing list on every addition (quadratic).
    regions_in_topics = list({
        region
        for i in region_bin_topics_dict[key].keys()
        for region in region_bin_topics_dict[key][i].index.tolist()
    })
    regions_in_DARs = list({
        region
        for i in markers_dict_dict[key].keys()
        for region in markers_dict_dict[key][i].index.tolist()
    })
    # make sure we only take regions that actually exist in the accessibility matrix:
    selected_regions = list(set(regions_in_topics + regions_in_DARs).intersection(set(imputed_acc_obj_dict[key].feature_names)))

    # Export to loom (selected_cells is not passed).
    export_region_accessibility_to_loom(
        accessibility_matrix = imputed_acc_obj_dict[key],
        cistopic_obj = cistopic_obj_dict[key],
        binarized_topic_region = region_bin_topics_dict[key],
        binarized_cell_topic = binarized_cell_topic_dict[key],
        out_fname = f_out,
        selected_regions = selected_regions ,
        cluster_annotation = ['consensus_cell_type'],
        cluster_markers = {'consensus_cell_type': markers_dict_dict[key]},
        tree_structure = ('scATAC-seq_Benchmark', 'ATAC_library_downsampled', 'Region_accessibility'),
        title = s + ' - Region accessibility all',
        nomenclature = "hg38"
    )

libds_merged
2021-11-24 20:25:41,210 cisTopic     INFO     Creating minimal loom
Regulon name does not seem to be compatible with SCOPE. It should include a space to allow selection of the TF. 
Please run: 
 regulons = [r.rename(r.name.replace('(+)',' ('+str(len(r))+'g)')) for r in regulons] 
or:
 regulons = [r.rename(r.name.replace('(',' (')) for r in regulons]
2021-11-25 01:13:05,808 cisTopic     INFO     Adding annotations
2021-11-25 01:24:50,795 cisTopic     INFO     Adding clusterings
2021-11-25 01:24:51,477 cisTopic     INFO     Adding markers
2021-11-25 01:24:55,589 cisTopic     INFO     Exporting


# Caution
The loom export function contains a bug that adds the cell barcodes as a clustering, which causes SCope to crash when the file is loaded. This clustering is removed in the final loom Jupyter notebook.

# upload LOOMs

In [None]:
# Shell command (not Python): posts the merged gene-activity loom file to the SCope upload endpoint.
# NOTE(review): run it from a terminal, or prefix it with `!` to execute it from a Python notebook cell.
curl -F 'UUID=scATAC-seq_Benchmark' -F 'file-type=Loom' -F 'file=@/lustre1/project/stg_00002/lcb/fderop/data/20211024_hca_benchmark_libds_merged/downstream_analysis/gene_act_loom/Merged__libDS_gene_activity.loom' https://scope.aertslab.org/upload/ | tee /dev/null