# pycisTopic analysis

Full dataset, using consensus peak regions.

In [1]:
import pycisTopic
pycisTopic.__version__

'0.1.dev300+g7494158'

In [2]:
import warnings
# Silence every Python warning for the rest of the session (both calls install
# a blanket 'ignore' filter).
# NOTE(review): this also hides any warning raised by later cells — confirm
# that suppressing everything is intended.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
import pickle
import pandas as pd

In [4]:
import copy

In [5]:
import os
# Project root; the paths built with os.path.join(wdir, ...) further down start here.
# NOTE(review): hard-coded absolute cluster path — adjust when running elsewhere.
wdir = '/lustre1/project/stg_00002/lcb/fderop/data/20211013_hca_benchmark_libds_singles//'
# Make the project root the current working directory.
os.chdir( wdir )

In [6]:
# Create the output directory for the downstream analysis results.
# exist_ok=True makes the cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
f_final_dir = os.path.join(wdir, 'downstream_analysis')
os.makedirs(f_final_dir, exist_ok=True)

## Load the cisTopic objects

In [7]:
# Directory (relative to wdir) that holds one pickled cisTopic object per sample.
f_cto_dir = 'pycistopic_consensus_peaks/cistopic_objs__consensus'

# Fill the dictionary sample by sample; a sample whose pickle is missing is
# reported and simply left out of the dictionary.
cistopic_obj_dict = {}
for key in ['VIB_Hydrop_1', 'VIB_Hydrop_2']:
    f_cto = os.path.join(
        wdir,
        f_cto_dir,
        f"{key}__cistopic_obj_metadata_annotated_dimreduc.pkl",
    )
    if not os.path.isfile(f_cto):
        print(f"file {f_cto} doesn't exist")
        continue
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f_cto, 'rb') as f:
        cistopic_obj_dict[key] = pickle.load(f)
    print(f"Loaded filtered cistopic object {key}")

Loaded filtered cistopic object VIB_Hydrop_1
Loaded filtered cistopic object VIB_Hydrop_2


In [8]:
from collections import OrderedDict

#### Save/load

In [9]:
# Load the pickled `region_bin_topics_dict` from the downstream analysis directory.
f_out = os.path.join(f_final_dir, 'region_bin_topics.pkl')
if os.path.isfile(f_out):
    with open(f_out, 'rb') as f:
        region_bin_topics_dict = pickle.load(f)
    print(f"Loaded {f_out}")
else:
    # Report the missing file instead of silently leaving the variable
    # undefined (same message style as the cisTopic object loading cell).
    print(f"file {f_out} doesn't exist")

Loaded /lustre1/project/stg_00002/lcb/fderop/data/20211013_hca_benchmark_libds_singles//downstream_analysis/region_bin_topics.pkl


In [10]:
# Load the pickled `binarized_cell_topic_dict` from the downstream analysis directory.
f_out = os.path.join(f_final_dir, 'binarized_cell_topic.pkl')
if os.path.isfile(f_out):
    with open(f_out, 'rb') as f:
        binarized_cell_topic_dict = pickle.load(f)
    print(f"Loaded {f_out}")
else:
    # Report the missing file instead of silently leaving the variable
    # undefined (same message style as the cisTopic object loading cell).
    print(f"file {f_out} doesn't exist")

Loaded /lustre1/project/stg_00002/lcb/fderop/data/20211013_hca_benchmark_libds_singles//downstream_analysis/binarized_cell_topic.pkl


In [11]:
# Load the pickled `imputed_acc_obj_dict` from the downstream analysis directory.
f_out = os.path.join(f_final_dir, 'imputed_acc_obj.pkl')
if os.path.isfile(f_out):
    with open(f_out, 'rb') as f:
        imputed_acc_obj_dict = pickle.load(f)
    print(f"Loaded {f_out}")
else:
    # Report the missing file instead of silently leaving the variable
    # undefined (same message style as the cisTopic object loading cell).
    print(f"file {f_out} doesn't exist")

Loaded /lustre1/project/stg_00002/lcb/fderop/data/20211013_hca_benchmark_libds_singles//downstream_analysis/imputed_acc_obj.pkl


In [12]:
# Load the pickled `markers_dict_dict` from the downstream analysis directory.
f_out = os.path.join(f_final_dir, 'markers_dict.pkl')
if os.path.isfile(f_out):
    with open(f_out, 'rb') as f:
        markers_dict_dict = pickle.load(f)
    print(f"Loaded {f_out}")
else:
    # Report the missing file instead of silently leaving the variable
    # undefined (same message style as the cisTopic object loading cell).
    print(f"file {f_out} doesn't exist")

Loaded /lustre1/project/stg_00002/lcb/fderop/data/20211013_hca_benchmark_libds_singles//downstream_analysis/markers_dict.pkl


## Gene activity

In [13]:
import pyranges as pr
import requests
import pybiomart as pbm

In [14]:
    from pycisTopic.gene_activity import get_gene_activity
    from pycisTopic.diff_features import find_diff_features

### Infer gene activity

In [15]:
# Gene activity per sample, cached on disk: the pickle is loaded when it
# exists, otherwise the scores are computed for every sample and saved.
# `get_gene_activity` is imported in the cell above.
f_out = os.path.join(f_final_dir, 'gene_act_dict.pkl')
if os.path.isfile(f_out):
    print(f"Loading {f_out}")
    with open(f_out, 'rb') as f:
        gene_act_dict = pickle.load(f)
else:
    # Gene annotation for human, queried from Ensembl BioMart
    dataset = pbm.Dataset(name='hsapiens_gene_ensembl',  host='http://www.ensembl.org')
    annot = dataset.query(attributes=['chromosome_name', 'start_position', 'end_position', 'strand', 'external_gene_name', 'transcription_start_site', 'transcript_biotype'])
    annot['Chromosome/scaffold name'] = 'chr' + annot['Chromosome/scaffold name'].astype(str)
    annot.columns = ['Chromosome', 'Start', 'End', 'Strand', 'Gene', 'Transcription_Start_Site', 'Transcript_type']
    # .copy() gives an independent frame, so the strand recoding below is
    # guaranteed to take effect (chained assignment on a filtered slice is not).
    annot = annot[annot.Transcript_type == 'protein_coding'].copy()
    # Recode the numeric strand (1 / -1) to '+' / '-'; other values are left untouched.
    annot['Strand'] = annot['Strand'].replace({1: '+', -1: '-'})
    pr_annotation = pr.PyRanges(annot.dropna(axis=0))

    # get chromosome sizes (hg38)
    target_url = 'http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes'
    chromsizes = pd.read_csv(target_url, sep='\t', header=None)
    chromsizes.columns = ['Chromosome', 'End']
    chromsizes['Start'] = 0
    chromsizes = chromsizes.loc[:, ['Chromosome', 'Start', 'End']]
    chromsizes = pr.PyRanges(chromsizes)

    gene_act_dict = {}
    for key in cistopic_obj_dict.keys():
        print(key)
        gene_act, weights = get_gene_activity(
            imputed_acc_obj_dict[key], # Region-cell probabilities
            pr_annotation, # Gene annotation
            chromsizes, # Chromosome size
            use_gene_boundaries=True, # Whether to use the whole search space or stop when encountering another gene
            upstream=[1000, 100000], # Search space upstream. The minimum means that even if there is a gene right next to it
                                     # these bp will be taken (1kbp here)
            downstream=[1000,100000], # Search space downstream
            distance_weight=True, # Whether to add a distance weight (an exponential function, the weight will decrease with distance)
            decay_rate=1, # Exponent for the distance exponential function (the higher the faster will be the decrease)
            extend_gene_body_upstream=10000, # Number of bp upstream immune to the distance weight (their value will be maximum for
                                             # this weight)
            extend_gene_body_downstream=500, # Number of bp downstream immune to the distance weight
            gene_size_weight=False, # Whether to add a weight based on the length of the gene
            gene_size_scale_factor='median', # Dividend to calculate the gene size weight. Default is the median value of all genes
                                             # in the genome
            remove_promoters=False, # Whether to remove promoters when computing gene activity scores
            average_scores=True, # Whether to divide by the total number of regions assigned to a gene when calculating the gene
                                 # activity score
            scale_factor=1, # Value to multiply for the final gene activity matrix
            extend_tss=[10,10], # Space to consider a promoter
            gini_weight = True, # Whether to add a gini index weight. The more unique the region is, the higher this weight will be
            return_weights= True, # Whether to return the final weights
            project='Gene_activity') # Project name for the gene activity object
        # Only the gene activity object is kept; the returned weights are not stored.
        gene_act_dict[key] = copy.copy(gene_act)

    with open(f_out, 'wb') as f:
        pickle.dump(gene_act_dict, f)
    print(f'saved {f_out}')

Loading /lustre1/project/stg_00002/lcb/fderop/data/20211013_hca_benchmark_libds_singles//downstream_analysis/gene_act_dict.pkl


## Export to loom

In [16]:
# Output directories for the region-accessibility and gene-activity loom files.
# exist_ok=True makes the cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
f_region_loom_dir = os.path.join(f_final_dir, 'region_acc_loom')
os.makedirs(f_region_loom_dir, exist_ok=True)

f_gene_loom_dir = os.path.join(f_final_dir, 'gene_act_loom')
os.makedirs(f_gene_loom_dir, exist_ok=True)

In [17]:
from pycisTopic.loom import (
    export_region_accessibility_to_loom,
    export_gene_activity_to_loom
)

### Gene activity

In [18]:
from ctxcore.genesig import Regulon

# Dummy single-gene regulon; export_gene_activity_to_loom requires a regulon
# list, so this placeholder is passed in.
phreg = Regulon(
    name='placeholder regulon',
    gene2weight={'phreg': 1.0},
    transcription_factor='phreg',
    gene2occurrence={'phreg': 1},
)

In [19]:
# Write one gene-activity loom file per sample; samples whose loom file already
# exists are skipped so the cell can be re-run safely.
for key in cistopic_obj_dict.keys():
    print(key)
    f_out = os.path.join(f_gene_loom_dir, key + '__libDS_gene_activity.loom')
    if os.path.exists(f_out):
        print(f"Skipping {f_out}: already exists.")
        continue

    export_gene_activity_to_loom(
        gene_activity_matrix = gene_act_dict[key],
        cistopic_obj = cistopic_obj_dict[key],
        regulons = [phreg],
        # selected_cells = [ x.split('-')[0] + '-' + x.split('-')[1]  for x in cistopic_obj_dict[key].cell_names ], # this leaves a cell barcode of the format type 'TGCATGTCGCCGTTCCAAGA-21'
        # selected_cells = cistopic_obj_dict[key].projections['cell']['UMAP'].index.tolist(), # cflerin original
        out_fname = f_out,  # a stray `t` after this comma previously made the cell a SyntaxError
        cluster_annotation = ['consensus_cell_type', 'SCREEN_fmx_sample'],
        # Markers are only supplied for the cell-type annotation; the sample annotation gets none.
        cluster_markers = {'consensus_cell_type': markers_dict_dict[key], 'SCREEN_fmx_sample':{}},
        tree_structure = ('scATAC-seq_Benchmark', 'ATAC_library_downsampled', 'Gene_activity'),
        title = 'Gene activity from library-downsampled and sample-merged dataset',
        nomenclature = "hg38"
    )

VIB_Hydrop_1
2021-11-26 13:58:48,686 cisTopic     INFO     Creating minimal loom
2021-11-26 13:59:03,727 cisTopic     INFO     Adding annotations
2021-11-26 13:59:05,305 cisTopic     INFO     Adding clusterings
2021-11-26 13:59:05,323 cisTopic     INFO     Adding markers
No markers for  sampleA
No markers for  sampleB
2021-11-26 13:59:05,568 cisTopic     INFO     Exporting
VIB_Hydrop_2
2021-11-26 13:59:16,087 cisTopic     INFO     Creating minimal loom
2021-11-26 13:59:30,968 cisTopic     INFO     Adding annotations
2021-11-26 13:59:32,779 cisTopic     INFO     Adding clusterings
2021-11-26 13:59:32,799 cisTopic     INFO     Adding markers
No markers for  sampleA
No markers for  sampleB
2021-11-26 13:59:33,038 cisTopic     INFO     Exporting


### Region accessibility

In [None]:
# Write one region-accessibility loom file per sample; samples whose loom file
# already exists are skipped so the cell can be re-run safely.
for key in cistopic_obj_dict.keys():
    print(key)
    # The 'libds_merged' sample is written under the shorter name 'merged'.
    s = 'merged' if key=='libds_merged' else key
    f_out = os.path.join(f_region_loom_dir, s + '__libDS_region_accessibility.loom')
    if os.path.exists(f_out):
        print(f"Skipping {f_out}: already exists.")
        continue
    # Subset regions, we will use only regions in topics and DARs here to make it faster.
    # Sets are unioned directly; the previous sum(list_of_lists, []) re-copied
    # the growing list on every step (quadratic in the number of regions).
    regions_in_topics = set().union(
        *(topic.index.tolist() for topic in region_bin_topics_dict[key].values())
    )
    regions_in_DARs = set().union(
        *(markers.index.tolist() for markers in markers_dict_dict[key].values())
    )
    # make sure we only take regions that actually exist in the accessibility matrix:
    selected_regions = list(
        (regions_in_topics | regions_in_DARs).intersection(imputed_acc_obj_dict[key].feature_names)
    )

    # Export to loom
    export_region_accessibility_to_loom(
        accessibility_matrix = imputed_acc_obj_dict[key],
        cistopic_obj = cistopic_obj_dict[key],
        binarized_topic_region = region_bin_topics_dict[key],
        binarized_cell_topic = binarized_cell_topic_dict[key],
        out_fname = f_out,
        selected_regions = selected_regions ,
        # selected_cells = [ x.split('-')[0] + '-' + x.split('-')[1]  for x in cistopic_obj_dict[key].cell_names ], # this leaves a cell barcode of the format type 'TGCATGTCGCCGTTCCAAGA-21'
        # selected_cells = cistopic_obj_dict[key].projections['cell']['UMAP'].index.tolist(), # cflerin original
        cluster_annotation = ['consensus_cell_type', 'SCREEN_fmx_sample'],
        # Markers are only supplied for the cell-type annotation; the sample annotation gets none.
        cluster_markers = {'consensus_cell_type': markers_dict_dict[key], 'SCREEN_fmx_sample':{}},
        tree_structure = ('scATAC-seq_Benchmark', 'ATAC_library_downsampled', 'Region_accessibility'),
        title = s + ' - Region accessibility all',
        nomenclature = "hg38"
    )

VIB_Hydrop_1


# Caution
The loom export function contains a bug that adds the cell barcodes as a clustering, which causes SCope to crash when the file is loaded. This clustering is removed in the final loom Jupyter notebook.

# upload LOOMs

```bash
curl -F 'UUID=scATAC-seq_Benchmark' -F 'file-type=Loom' -F 'file=@/lustre1/project/stg_00002/lcb/fderop/data/20211024_hca_benchmark_libds_merged/downstream_analysis/gene_act_loom/Merged__libDS_gene_activity.loom' https://scope.aertslab.org/upload/ | tee /dev/null
```