# Create a pycistopic object from count matrix
**Authorship:** Adam Klie (last updated: 08/21/2023)<br>
***
**Description:** This notebook is meant to build a pycistopic object from a count matrix.
***

# Set-up

In [1]:
# Import the necessary packages
import os
import sys
import glob
import pickle
import pandas as pd
from scipy import io
from pycisTopic.cistopic_class import *

In [24]:
# Params
mtx_file = "/cellar/users/aklie/data/igvf/beta_cell_networks/platinum/igvf_sc-islet_10X-Multiome/17Aug23/scATAC/matrix.mtx.gz"
barcode_file  = "/cellar/users/aklie/data/igvf/beta_cell_networks/platinum/igvf_sc-islet_10X-Multiome/17Aug23/scATAC/metadata.csv"
peaks_file = "/cellar/users/aklie/data/igvf/beta_cell_networks/platinum/igvf_sc-islet_10X-Multiome/17Aug23/scATAC/features.tsv.gz"
blacklist_file = "/cellar/users/aklie/data/igvf/references/blacklists/hg38/ENCFF356LFX.bed"
output_dir = "/cellar/users/aklie/projects/igvf/beta_cell_networks/infer_grns/scenicplus/results/igvf_sc-islet_10X-Multiome/17Aug23"

In [25]:
# Make output dir if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Get data

In [26]:
# Read in the counts and send to csr
counts = io.mmread(mtx_file)
counts_csr = counts.tocsr()
counts_csr.shape

(262611, 83289)

In [26]:
# Read in the cell metadata
cell_data = pd.read_csv(barcode_file, index_col=0, low_memory=False)
cell_data.head()

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,nCount_RNA_raw,nFeature_RNA_raw,nCount_ATAC,nFeature_ATAC,nCount_SCT,nFeature_SCT,...,atac_pct.mt,TSS.enrichment,TSS.percentile,mpeak.weight,wsnn_res.0.3,cell.type.1,cell.type.2,predicted.cell.type,predicted.cell.type.score,barcode
MO1_AAACAGCCAGCAATAA-1,SeuratProject,7140.390126,2937,0.0,7781,2945,10302,4964,2807,1654,...,0.629753,4.811552,0.15,0.150786,4,pre.SC.endocrine,pre.SC.alpha,pre.SC.alpha,1.0,AAACAGCCAGCAATAA-1
MO1_AAACCAACAACCGCCA-1,SeuratProject,5083.68688,2651,0.0,5564,2655,28678,11595,3028,2044,...,0.597243,5.25115,0.51,0.32252,11,SC.endocrine,SC.delta,SC.delta,1.0,AAACCAACAACCGCCA-1
MO1_AAACCGCGTATTGTGG-1,SeuratProject,8302.654374,3797,0.0,9338,3810,14258,6820,2693,1793,...,0.733843,4.08319,0.01,0.320034,4,pre.SC.endocrine,pre.SC.alpha,pre.SC.alpha,0.99152,AAACCGCGTATTGTGG-1
MO1_AAACGCGCAAGCCACT-1,SeuratProject,5772.429015,3107,0.005388,6430,3114,27699,11662,3011,2105,...,0.44526,4.810653,0.15,0.21576,3,SC.endocrine,SC.EC,SC.EC,1.0,AAACGCGCAAGCCACT-1
MO1_AAACGCGCAGTTATCG-1,SeuratProject,2732.242968,1822,0.140443,2912,1824,19568,8592,2803,1816,...,1.289934,4.997553,0.27,0.357753,0,SC.endocrine,SC.beta,SC.beta,1.0,AAACGCGCAGTTATCG-1


In [27]:
# Read in the regions, clean up and check how many
region_names = pd.read_csv(peaks_file, sep="\t", header=None)[0].values
region_names = [region.replace("-", ":", 1) for region in region_names]
len(region_names)

262611

# Make the object

In [29]:
# Create the object
cistopic_obj = create_cistopic_object(
    fragment_matrix=counts_csr,
    cell_names=cell_data.index,
    region_names=region_names,
    path_to_blacklist=blacklist_file,
    split_pattern="_",
    tag_cells=False,
    project="igvf_sc-islet_10X-Multiome_10Aug23"
)

2023-08-21 15:49:44,267 cisTopic     INFO     Removing blacklisted regions
2023-08-21 15:49:49,435 cisTopic     INFO     Creating CistopicObject


  np.log10(cisTopic_nr_frag),
  np.log10(cisTopic_nr_acc),


2023-08-21 15:50:12,932 cisTopic     INFO     Done!


In [None]:
# Add cell metadata
cistopic_obj.add_cell_data(cell_data)

In [34]:
# Check the object
cistopic_obj.project, cistopic_obj.fragment_matrix.shape, cistopic_obj.cell_data.head(), cistopic_obj.region_names[:5]

('igvf_sc-islet_10X-Multiome_10Aug23',
 (262418, 83289),
                        cisTopic_nr_frag cisTopic_log_nr_frag cisTopic_nr_acc  \
 MO1_AAACAGCCAGCAATAA-1             6161             3.789651            5581   
 MO1_AAACCAACAACCGCCA-1            17040              4.23147           13284   
 MO1_AAACCGCGTATTGTGG-1             8642             3.936614            7781   
 MO1_AAACGCGCAAGCCACT-1            16741             4.223781           13457   
 MO1_AAACGCGCAGTTATCG-1            11428              4.05797            9567   
 
                        cisTopic_log_nr_acc  \
 MO1_AAACAGCCAGCAATAA-1            3.746712   
 MO1_AAACCAACAACCGCCA-1            4.123329   
 MO1_AAACCGCGTATTGTGG-1            3.891035   
 MO1_AAACGCGCAAGCCACT-1            4.128948   
 MO1_AAACGCGCAGTTATCG-1            3.980776   
 
                                                  sample_id     orig.ident  \
 MO1_AAACAGCCAGCAATAA-1  igvf_sc-islet_10X-Multiome_10Aug23  SeuratProject   
 MO1_AAACCAACAA

In [35]:
# Save the cistopic object
pickle.dump(
    cistopic_obj,
    open(os.path.join(output_dir, "cistopic_obj.pkl"), 'wb')
)

# Check the load

In [36]:
cistopic_obj_load = pickle.load(open(os.path.join(output_dir, "cistopic_obj.pkl"), 'rb'))

In [37]:
# Check the object
cistopic_obj_load.project, cistopic_obj_load.fragment_matrix.shape, cistopic_obj_load.cell_data.head(), cistopic_obj_load.region_names[:5]

('igvf_sc-islet_10X-Multiome_10Aug23',
 (262418, 83289),
                        cisTopic_nr_frag cisTopic_log_nr_frag cisTopic_nr_acc  \
 MO1_AAACAGCCAGCAATAA-1             6161             3.789651            5581   
 MO1_AAACCAACAACCGCCA-1            17040              4.23147           13284   
 MO1_AAACCGCGTATTGTGG-1             8642             3.936614            7781   
 MO1_AAACGCGCAAGCCACT-1            16741             4.223781           13457   
 MO1_AAACGCGCAGTTATCG-1            11428              4.05797            9567   
 
                        cisTopic_log_nr_acc  \
 MO1_AAACAGCCAGCAATAA-1            3.746712   
 MO1_AAACCAACAACCGCCA-1            4.123329   
 MO1_AAACCGCGTATTGTGG-1            3.891035   
 MO1_AAACGCGCAAGCCACT-1            4.128948   
 MO1_AAACGCGCAGTTATCG-1            3.980776   
 
                                                  sample_id     orig.ident  \
 MO1_AAACAGCCAGCAATAA-1  igvf_sc-islet_10X-Multiome_10Aug23  SeuratProject   
 MO1_AAACCAACAA

# DONE!

---

# Scratch

In [51]:
n_topics = ['2,', '4,', '8,', '16,', '32,', '48,', '64,', '80']

In [55]:
type(list(map(lambda x: int(x.split(',')[0]), n_topics))[0])

int

In [None]:
list(map(int, n_topics.split(',')[0]))

In [46]:
n_topics = ["2", "4", "8", "16", "32", "48", "64", "80"]

In [47]:
n_topics = list(map(int, n_topics))

In [48]:
n_topics

[2, 4, 8, 16, 32, 48, 64, 80]

In [40]:
n_topics = list(map(int, n_topics[0].split(',')))

[2, 4, 8, 16, 32, 48, 64, 80]

In [57]:
from pycisTopic.lda_models import *

In [58]:
run_cgs_models_mallet?

[0;31mSignature:[0m
[0mrun_cgs_models_mallet[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpath_to_mallet_binary[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcistopic_obj[0m[0;34m:[0m [0;34m'cisTopicObject'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_topics[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_cpu[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mint[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_iter[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mint[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;36m150[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_state[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mint[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;36m555[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malpha[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mNoneTyp

In [4]:
import ray
ray.__version__

'2.1.0'

In [5]:
ray.init(num_cpus=4)

2023-08-21 19:00:10,940	INFO worker.py:1519 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.8.15
Ray version:,2.1.0
Dashboard:,http://127.0.0.1:8265


In [6]:
ray.shutdown()