# pycisTopic analysis

Full dataset, using SCREEN regions.

In [1]:
import pycisTopic
pycisTopic.__version__

'0.1.dev300+g7494158'

In [2]:
import warnings
# Silence all warnings from here on: both calls install a blanket 'ignore'
# filter, so warnings raised by the cells below are not displayed.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
import pickle
import pandas as pd

In [5]:
import os
# Project working directory. The process changes into it so that the relative
# paths used below (e.g. the 'fragments_postbap/...' glob) resolve against it.
wdir = '/lustre1/project/stg_00002/lcb/fderop/data/20211008_hca_benchmark_full_singles/'
os.chdir( wdir )

In [6]:
import glob
from collections import OrderedDict

# Load cisTopic objects

In [7]:
# Discover the per-sample fragments files and derive each sample id by
# stripping the file suffix and the directory prefix from its path.
filenames = glob.glob('fragments_postbap/*.sinto.mm.fragments.tsv.gz')
samples = [
    path.replace(".sinto.mm.fragments.tsv.gz", "").replace("fragments_postbap/", "")
    for path in filenames
]

# Map sample id -> fragments path, with entries sorted by sample id.
files_dict = OrderedDict(sorted(dict(zip(samples, filenames)).items()))
files_dict.keys()

odict_keys(['Broad_1', 'Broad_2', 'Broad_mito_1', 'Broad_mito_2', 'CNAG_1', 'CNAG_2', 'Sanger_1', 'Sanger_2', 'Stanford_1', 'Stanford_2', 'VIB_1', 'VIB_2', 'VIB_Hydrop_1', 'VIB_Hydrop_2', 's3atac'])

In [8]:
f_cto_dir = 'cistopic_objs__screen'

# Unpickle the filtered cisTopic object of every sample that has one on disk;
# samples without a pickle are reported and left out of the dictionary.
cistopic_obj_dict = {}
for key in files_dict:
    f_cto = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_filtered.pkl')
    if not os.path.isfile(f_cto):
        print(f"file {f_cto} doesn't exist")
        continue
    with open(f_cto, 'rb') as f:
        cistopic_obj_dict[key] = pickle.load(f)
    print(f"Loaded filtered cistopic object {key}")

Loaded filtered cistopic object Broad_1
Loaded filtered cistopic object Broad_2
Loaded filtered cistopic object Broad_mito_1
Loaded filtered cistopic object Broad_mito_2
Loaded filtered cistopic object CNAG_1
Loaded filtered cistopic object CNAG_2
Loaded filtered cistopic object Sanger_1
Loaded filtered cistopic object Sanger_2
Loaded filtered cistopic object Stanford_1
Loaded filtered cistopic object Stanford_2
Loaded filtered cistopic object VIB_1
Loaded filtered cistopic object VIB_2
Loaded filtered cistopic object VIB_Hydrop_1
Loaded filtered cistopic object VIB_Hydrop_2
Loaded filtered cistopic object s3atac


## Export region-accessibility looms

For use in cell type identification.

**Be careful with this:** `col_attrs` holds the cell barcodes. Make sure that each barcode matches the one in the fragments files, otherwise downstream steps will not work.

A better option is, for example:

```python
col_attrs={ 'CellID': list(cistopic_obj_dict[key].cell_data['barcode']) },  # for hydrop
```

In [9]:
import loompy as lp

In [10]:
# Directory (inside wdir) that receives the loom files written below.
f_crl_dir = 'cell_region_loom__screen'
# exist_ok=True makes the creation idempotent on re-runs and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(os.path.join(wdir, f_crl_dir), exist_ok=True)

In [12]:
# Write one loom file per loaded sample, skipping samples whose loom file is
# already on disk so that re-running this cell does not rewrite them.
# NOTE(review): the file name is key + 'cell_region-all.loom' with no separator
# (the pickles use '__'); confirm this is intended before changing it, since
# the skip check relies on this exact name.
for key, cto in cistopic_obj_dict.items():
    f_out = os.path.join(wdir, f_crl_dir, key + 'cell_region-all.loom')
    if os.path.isfile(f_out):
        print(f"Skipping {key} loom writing, already exists")
        continue

    # CellID keeps the first two '-'-separated fields of each cell name,
    # e.g. 'ATTCTCAGACCAGGTTCGTG-12-VIB_Hydrop_1' -> 'ATTCTCAGACCAGGTTCGTG-12'.
    name_fields = (name.split('-') for name in cto.cell_names)
    cell_ids = [fields[0] + '-' + fields[1] for fields in name_fields]

    lp.create(
        filename=f_out,
        layers=cto.fragment_matrix,
        row_attrs={'Gene': cto.region_names},
        col_attrs={'CellID': cell_ids},
    )
    print(f"Finished {key} loom writing")

Skipping Broad_1 loom writing, already exists
Skipping Broad_2 loom writing, already exists
Skipping Broad_mito_1 loom writing, already exists
Skipping Broad_mito_2 loom writing, already exists
Skipping CNAG_1 loom writing, already exists
Skipping CNAG_2 loom writing, already exists
Skipping Sanger_1 loom writing, already exists
Skipping Sanger_2 loom writing, already exists
Skipping Stanford_1 loom writing, already exists
Skipping Stanford_2 loom writing, already exists
Skipping VIB_1 loom writing, already exists
Skipping VIB_2 loom writing, already exists
Finished VIB_Hydrop_1 loom writing
Finished VIB_Hydrop_2 loom writing
Skipping s3atac loom writing, already exists


Go to the next notebook to run the modeling.

In [12]:
# Show the per-cell metadata table of one sample, e.g. to compare its
# `barcode` column with the cell names used as the row index.
cistopic_obj_dict['VIB_Hydrop_1'].cell_data

Unnamed: 0,cisTopic_nr_frag,cisTopic_log_nr_frag,cisTopic_nr_acc,cisTopic_log_nr_acc,sample_id,Log_total_nr_frag,Log_unique_nr_frag,Total_nr_frag,Unique_nr_frag,Dupl_nr_frag,Dupl_rate,Total_nr_frag_in_regions,Unique_nr_frag_in_regions,FRIP,TSS_enrichment,barcode,fmx_droplet_type,fmx_sample,Doublet_scores_fragments,Predicted_doublets_fragments
ATTCTCAGACCAGGTTCGTG-12-VIB_Hydrop_1,2471,3.392873,2334,3.368101,VIB_Hydrop_1,4.363650,3.415974,23102,2606,20496,0.887196,20180,2149,0.824635,30.242277,ATTCTCAGACCAGGTTCGTG-12,SNG,sampleA,0.271255,False
GACTGAGATTCTGACCAGAT-12-VIB_Hydrop_1,3958,3.597476,3817,3.581722,VIB_Hydrop_1,4.049799,3.719083,11215,5237,5978,0.533036,6590,2719,0.519190,23.026618,GACTGAGATTCTGACCAGAT-12,SNG,sampleA,0.121951,False
CAATTGGAGAAACATTCCGG-12-VIB_Hydrop_1,5189,3.715084,4839,3.684756,VIB_Hydrop_1,4.595342,3.751048,39386,5637,33749,0.856878,33769,4613,0.818343,25.942880,CAATTGGAGAAACATTCCGG-12,SNG,sampleB,0.271255,False
GAGTTAACGTCATCGCAGGT-12-VIB_Hydrop_1,4054,3.607884,3826,3.582745,VIB_Hydrop_1,4.710456,3.671451,51340,4693,46647,0.908590,41910,3488,0.743235,22.186640,GAGTTAACGTCATCGCAGGT-12,SNG,sampleB,0.109278,False
AAGACGCAACTGAGGTAGGC-12-VIB_Hydrop_1,3504,3.544564,3281,3.516006,VIB_Hydrop_1,4.485040,3.582972,30552,3828,26724,0.874705,25865,2982,0.778997,21.792673,AAGACGCAACTGAGGTAGGC-12,SNG,sampleA,0.186747,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GACTTAGAGATCCAGTTCAC-11-VIB_Hydrop_1,960,2.982271,938,2.972203,VIB_Hydrop_1,3.599337,3.037426,3975,1090,2885,0.725786,2936,751,0.688991,29.406409,GACTTAGAGATCCAGTTCAC-11,SNG,sampleB,0.136691,False
TTGCTGGCGTATAGTGCAGG-11-VIB_Hydrop_1,1734,3.239049,1600,3.20412,VIB_Hydrop_1,4.163758,3.287130,14580,1937,12643,0.867147,11874,1435,0.740836,29.439356,TTGCTGGCGTATAGTGCAGG-11,SNG,sampleA,0.115385,False
GTGTGCTACAATCACGTGTC-12-VIB_Hydrop_1,1328,3.123198,1255,3.098644,VIB_Hydrop_1,3.956313,3.163758,9043,1458,7585,0.838770,7528,1118,0.766804,43.349566,GTGTGCTACAATCACGTGTC-12,SNG,sampleB,0.034247,False
TTGCTGGCGTTACGAGGTGG-11-VIB_Hydrop_1,1392,3.143639,1352,3.130977,VIB_Hydrop_1,4.209381,3.196729,16195,1573,14622,0.902871,13463,1155,0.734266,25.262572,TTGCTGGCGTTACGAGGTGG-11,SNG,sampleA,0.059507,False


In [15]:
# Show only the part of each cell name that precedes the first '-'.
hydrop_cell_names = cistopic_obj_dict['VIB_Hydrop_1'].cell_names
[cell_name.split('-')[0] for cell_name in hydrop_cell_names]

['ATTCTCAGACCAGGTTCGTG',
 'GACTGAGATTCTGACCAGAT',
 'CAATTGGAGAAACATTCCGG',
 'GAGTTAACGTCATCGCAGGT',
 'AAGACGCAACTGAGGTAGGC',
 'CAGATTGACCCGCCAACACA',
 'TGCATGTCGCTGACCGGAAC',
 'GATTCGAAGGATTCAGGACG',
 'CGATAATCCTGTTGGACATA',
 'GTCGGTTCTTATGACTCCAA',
 'TGGAGCAGTCAGGAATCGCA',
 'TAGAGCCTGACATCAAGCTG',
 'GCACACAGTCCTCCTATCAT',
 'GAACTCAACCCCTCTATGTA',
 'CGATAATCCTTAGATGCTCA',
 'TCGCCACGAAATTCAGGACG',
 'GTGCAACAACTTGCCACAAT',
 'TGGAGCAGTCCTTGATTACG',
 'TTATGACACCGCCTTCTCTC',
 'GAGTGAGGTATGGTCTACTC',
 'AGTGGAATGCGTTCGTCAGC',
 'GCTATGGCGACCACGTTGTG',
 'TCGGATCCATGTGTATCAGA',
 'GGAGTATTCTCGTTCCAAGA',
 'GCGCCATAGTGTATCAGCCG',
 'TGGCATTCACGTATACGATC',
 'CAACGTAGTTAGGTGTCGTG',
 'AGACATGCGAACATAATCCG',
 'TAACTCACAGATGAAGTGCG',
 'CGATAATCCTTTGTCCGCCA',
 'TTGTACTGCAGGACAGGACA',
 'CCAGAGCAAGGACGGCTTCA',
 'AATTCCTGCGATTCGGTCAG',
 'TGGCATTCACAGGCTCTATT',
 'GGATCAAGGCGATCTCGCGT',
 'GTTCAGGTAATCGAGTGATA',
 'GTGTAAGTGAGTAACTGACA',
 'TCTATTGAGCCGCGACAATG',
 'CCTCCATAAGTTGCAGTTCT',
 'ACACCTGAAGAGAACAATCC',
