# Run cisTopic

In [45]:
import os
import pickle
import pandas as pd
from scipy.io import mmread
from pycisTopic.cistopic_class import create_cistopic_object
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments

In [3]:
data_dir = '/cellar/users/aklie/projects/igvf/beta_cell_networks/data/multiome_stimulated_sc'
tmp_dir = '/cellar/users/aklie/tmp/'
out_dir = '/cellar/users/aklie/projects/igvf/beta_cell_networks/scratch/infer_cellular_programs/cistopic/results'

if not os.path.exists(out_dir):
    os.makedirs(out_dir)

## Create python object

## From sparse count matrix

In [47]:
# Define arguments
counts_matrix_path = os.path.join(data_dir, 'matrix', 'dm023_palmitate', 'dm023_palmitate_endocrine_SC.delta_mpeak.count.mtx')
barcodes_path = os.path.join(data_dir, 'barcodes', 'dm023_palmitate', 'dm023_palmitate_endocrine_SC.delta_barcodes.tsv')
regions_path = os.path.join(data_dir, 'matrix', 'dm023_palmitate', 'dm023_palmitate_endocrine_SC.delta_mpeak.var.tsv')
cell_metadata_path = os.path.join(data_dir, 'metadata', 'dm023_palmitate', 'dm023_palmitate_endocrine_SC.delta_metadata.csv')
blacklist_path = '/cellar/users/aklie/opt/pycisTopic/blacklist/hg38-blacklist.v2.bed'
out_dir = '/cellar/users/aklie/projects/igvf/beta_cell_networks/scratch/infer_cellular_programs/cistopic/results/multiome_stimulated_sc'
project_name = 'dm023_palmitate_endocrine_SC.delta.notebook_test'

In [48]:
bcs = pd.read_csv(barcodes_path, sep="\t", header=None)[0].values
len(bcs)

186

In [49]:
cell_data = pd.read_csv(cell_metadata_path, index_col=0)
print(len(cell_data))
cell_data.head()

186


Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,nCount_RNA_raw,nFeature_RNA_raw,nCount_ATAC,nFeature_ATAC,nCount_SCT,nFeature_SCT,...,mpeak.weight,wsnn_res.0.3,nCount_peaks,nFeature_peaks,peaks.weight,wsnn_res.0.5,nCount_peak,nFeature_peak,cell.type.1,cell.type.2
MO1_AAACCAACAACCGCCA-1,SeuratProject,5083.68688,2651,0.0,5564,2655,28678,11595,5940,2637,...,0.32252,11,16257,11865,0.424601,10,16263,11870,SC.delta,SC.delta
MO1_AAATCCGGTCATGCCC-1,SeuratProject,10621.862955,4360,0.0,12114,4388,31846,13055,6877,3474,...,0.157511,11,18004,13216,0.127969,10,18010,13221,SC.delta,SC.delta
MO1_AGCACAGCAATAATCC-1,SeuratProject,4868.694723,2484,0.00222,5309,2492,8880,4347,5896,2475,...,0.147467,11,5140,4625,0.195448,10,5140,4625,SC.delta,SC.delta
MO1_AGCCGGTTCCCTGATC-1,SeuratProject,12376.029583,4459,0.0,14194,4503,45933,17879,6674,3075,...,0.352782,11,26394,17956,0.285563,10,26412,17966,SC.delta,SC.delta
MO1_AGCTTTAAGCTTAGTA-1,SeuratProject,3433.566856,2042,0.0,3712,2046,29131,12201,5528,2065,...,0.387242,11,16622,12426,0.339677,10,16627,12431,SC.delta,SC.delta


In [50]:
# Get rownames from counts matrix
regions = pd.read_csv(regions_path, sep="\t", header=None)[0].values
len(regions)

215116

In [17]:
cnt_mtx = mmread(counts_matrix_path).tocsr()
cnt_mtx.shape

(215116, 186)

In [18]:
cnt_mtx.shape, len(bcs), len(regions)

((215116, 186), 186, 215116)

In [19]:
regions[:5]

array(['chr1:9739-10674', 'chr1:28863-29744', 'chr1:180651-181588',
       'chr1:183892-184817', 'chr1:186256-187321'], dtype=object)

In [20]:
# Create cisTopic object
cistopic_obj = create_cistopic_object(
    fragment_matrix=cnt_mtx,
    cell_names=bcs,
    region_names=regions,
    path_to_blacklist=blacklist_path,
    project="SC.delta_notebook_test"
)

2023-04-27 10:01:16,116 cisTopic     INFO     Removing blacklisted regions
2023-04-27 10:01:17,219 cisTopic     INFO     Creating CistopicObject
2023-04-27 10:01:17,689 cisTopic     INFO     Done!


In [21]:
# Adding cell information
cistopic_obj.add_cell_data(cell_data)

In [22]:
# Save cisTopic object
pickle.dump(cistopic_obj, open(os.path.join(out_dir, cistopic_obj.project + '.pkl'), 'wb'))

## From fragment files

In [51]:
# Define arguments
fragments_dir = "/cellar/users/aklie/projects/igvf/beta_cell_networks/data/multiome_stimulated_sc/fragments/dm023_palmitate"
fragment_files = None
barcodes_file = os.path.join(data_dir, 'barcodes', 'dm023_palmitate', 'dm023_palmitate_endocrine_SC.delta_barcodes.tsv')
regions_file = "/cellar/users/aklie/projects/igvf/beta_cell_networks/data/multiome_stimulated_sc/peaks/dm023_palmitate/dm023_palmitate_endocrine_SC.beta.narrowPeak.wrangled.tmp"
cell_metadata_file = os.path.join(data_dir, 'metadata', 'dm023_palmitate', 'dm023_palmitate_endocrine_SC.delta_metadata.csv')
blacklist_file = '/cellar/users/aklie/opt/pycisTopic/blacklist/hg38-blacklist.v2.bed'
out_dir = '/cellar/users/aklie/projects/igvf/beta_cell_networks/scratch/infer_cellular_programs/cistopic/results/multiome_stimulated_sc'
split_pattern = '_'
n_cpu = 4
project_name = 'dm023_palmitate_endocrine_SC.delta.notebook_test_from_fragments'

In [52]:
if fragments_dir is not None:
    fragment_files = [os.path.join(fragments_dir, f) for f in os.listdir(fragments_dir)]
    fragment_file_dict = {os.path.basename(f).split(split_pattern)[0]: f for f in fragment_files}
elif fragment_files is not None:
    fragment_file_dict = {os.path.basename(f).split(split_pattern)[0]: f for f in fragment_files}
else:
    raise ValueError("Must specify either fragments_dir or fragment_files")
fragment_file_dict

{'mo9': '/cellar/users/aklie/projects/igvf/beta_cell_networks/data/multiome_stimulated_sc/fragments/dm023_palmitate/mo9_fragments.tsv.gz',
 'mo38': '/cellar/users/aklie/projects/igvf/beta_cell_networks/data/multiome_stimulated_sc/fragments/dm023_palmitate/mo38_fragments.tsv.gz.tbi',
 'mo18': '/cellar/users/aklie/projects/igvf/beta_cell_networks/data/multiome_stimulated_sc/fragments/dm023_palmitate/mo18_fragments.tsv.gz',
 'mo3': '/cellar/users/aklie/projects/igvf/beta_cell_networks/data/multiome_stimulated_sc/fragments/dm023_palmitate/mo3_fragments.tsv.gz',
 'mo33': '/cellar/users/aklie/projects/igvf/beta_cell_networks/data/multiome_stimulated_sc/fragments/dm023_palmitate/mo33_fragments.tsv.gz.tbi',
 'mo14': '/cellar/users/aklie/projects/igvf/beta_cell_networks/data/multiome_stimulated_sc/fragments/dm023_palmitate/mo14_fragments.tsv.gz.tbi',
 'mo29': '/cellar/users/aklie/projects/igvf/beta_cell_networks/data/multiome_stimulated_sc/fragments/dm023_palmitate/mo29_fragments.tsv.gz.tbi',
 

In [53]:
# Keep only one entry for now
fragment_file_dict = {k: v for k, v in fragment_file_dict.items() if k in ['mo9', 'mo3']}
fragment_file_dict

{'mo9': '/cellar/users/aklie/projects/igvf/beta_cell_networks/data/multiome_stimulated_sc/fragments/dm023_palmitate/mo9_fragments.tsv.gz'}

In [54]:
# Load the path to regions, if more than one fragment file, use the same regions for all
path_to_regions = dict.fromkeys(fragment_file_dict.keys(), regions_file)
path_to_regions

{'mo9': '/cellar/users/aklie/projects/igvf/beta_cell_networks/data/multiome_stimulated_sc/peaks/dm023_palmitate/dm023_palmitate_endocrine_SC.beta.narrowPeak.wrangled.tmp'}

In [55]:
bcs = pd.read_csv(barcodes_file, sep="\t", header=None)[0].values
bcs[:5]

array(['MO1_AAACCAACAACCGCCA-1', 'MO1_AAATCCGGTCATGCCC-1',
       'MO1_AGCACAGCAATAATCC-1', 'MO1_AGCCGGTTCCCTGATC-1',
       'MO1_AGCTTTAAGCTTAGTA-1'], dtype=object)

In [57]:
cistopic_obj_list=[create_cistopic_object_from_fragments(
    path_to_fragments=fragment_file_dict[key],
    path_to_regions=path_to_regions[key],
    path_to_blacklist=blacklist_file,
    valid_bc=bcs,
    n_cpu=n_cpu,
    project=key,
    split_pattern=split_pattern) for key in fragment_file_dict.keys()
]

2023-04-27 10:15:05,772 cisTopic     INFO     Reading data for mo9
2023-04-27 10:20:19,879 cisTopic     INFO     Counting number of unique fragments (Unique_nr_frag)
2023-04-27 10:22:53,859 cisTopic     INFO     Counting fragments in regions


2023-04-27 10:23:09,215	INFO worker.py:1519 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


2023-04-27 10:25:46,815 cisTopic     INFO     Creating fragment matrix


  counts_df.groupby(["Name", "regionID"], sort=False, observed=True)


: 

: 

In [None]:
if cell_data is not None:
    cell_data = pd.read_csv(cell_metadata_path, index_col=0)
    cistopic_obj.add_cell_data(cell_data)

In [None]:
# Save cisTopic object
pickle.dump(cistopic_obj, open(os.path.join(out_dir, cistopic_obj.project + '.pkl'), 'wb'))

## Run models

In [66]:
models=run_cgs_models(
    cistopic_obj,
    n_topics=[2,3,4],
    n_cpu=3,
    n_iter=10,
    random_state=555,
    alpha=50,
    alpha_by_topic=True,
    eta=0.1,
    eta_by_topic=False,
    save_path=None,
    _temp_dir = os.path.join(tmp_dir + 'ray_spill')
)

2023-04-19 11:14:19,497	INFO worker.py:1519 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(run_cgs_model pid=2723341)[0m 2023-04-19 11:14:38,032 cisTopic     INFO     Running model with 4 topics
[2m[36m(run_cgs_model pid=2723340)[0m 2023-04-19 11:14:38,068 cisTopic     INFO     Running model with 3 topics
[2m[36m(run_cgs_model pid=2723342)[0m 2023-04-19 11:14:38,039 cisTopic     INFO     Running model with 2 topics
[2m[36m(run_cgs_model pid=2723342)[0m 2023-04-19 11:15:32,341 cisTopic     INFO     Model with 2 topics done!
[2m[36m(run_cgs_model pid=2723341)[0m 2023-04-19 11:15:34,793 cisTopic     INFO     Model with 4 topics done!
[2m[36m(run_cgs_model pid=2723340)[0m 2023-04-19 11:15:37,974 cisTopic     INFO     Model with 3 topics done!


In [69]:
if not os.path.exists(os.path.join(out_dir, 'models')):
    os.makedirs(os.path.join(out_dir, 'models'))

pickle.dump(models, open(os.path.join(out_dir, "models", cistopic_obj.project + '_models.pkl'), 'wb'))