In [17]:
import pandas as pd
from scipy.io import mmread
from pycisTopic.cistopic_class import create_cistopic_object

In [40]:
# Input locations for the A2_control timecourse sample.
# The shared directory prefixes are factored out so that pointing the notebook
# at another sample only requires editing DATASET_DIR (and the metadata name).
DATASET_DIR = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/timecourse/A2_control"
PEAK_MATRIX_DIR = f"{DATASET_DIR}/filtered_archr/Matrices/PeakMatrix"

# Peak-matrix export: barcode list, region (feature) list and the counts matrix
# in Matrix Market format.
barcodes_path = f"{PEAK_MATRIX_DIR}/barcodes.tsv"
regions_path = f"{PEAK_MATRIX_DIR}/features.tsv"
counts_matrix_path = f"{PEAK_MATRIX_DIR}/mtx.mtx"

# Optional per-cell metadata table; downstream cells skip every metadata step
# when this is set to None.
cell_metadata_path = f"{DATASET_DIR}/A2_control_RNA_cell_metadata.tsv"

In [41]:
# Read the header-less, tab-separated barcode file and keep its first column
# as an array of barcode strings.
barcode_table = pd.read_csv(barcodes_path, sep="\t", header=None)
bcs = barcode_table.iloc[:, 0].values
n_barcodes, barcode_preview = len(bcs), bcs[:5]
print("{} barcodes loaded: {}...".format(n_barcodes, barcode_preview))

5843 barcodes loaded: ['dm45a#GGTATTTCAGGCTGTT-1' 'dm45a#ACGACAAAGCCTCTGT-1'
 'dm45a#GCGGGTTTCTCACTCA-1' 'dm45a#ACCATTAAGAGCCGCT-1'
 'dm45a#GTGTTCCTCCCGCAAA-1']...


In [42]:
# Read the header-less, tab-separated features file and keep its first column
# as an array of region names (e.g. 'chr1:804687-805187').
region_table = pd.read_csv(regions_path, sep="\t", header=None)
regions = region_table.iloc[:, 0].values
n_regions, region_preview = len(regions), regions[:5]
print("{} regions loaded: {}...".format(n_regions, region_preview))

188090 regions loaded: ['chr1:804687-805187' 'chr1:817831-818331' 'chr1:819660-820160'
 'chr1:821415-821915' 'chr1:822102-822602']...


In [43]:
# Load the Matrix Market counts file and convert it to CSR format.
cnt_mtx = mmread(counts_matrix_path).tocsr()
# Interpolate the shape into the message. The previous call passed the shape as
# a second print() argument, so the "%s" placeholder was printed literally.
print(f"Counts matrix loaded: {cnt_mtx.shape}...")

Counts matrix loaded: %s... (188090, 5843)


In [44]:
# Load the optional per-cell metadata table, indexed by its first column.
# The field delimiter is chosen from the file extension; any other extension
# is rejected with a ValueError.
if cell_metadata_path is not None:
    if cell_metadata_path.endswith((".tsv", ".txt")):
        metadata_sep = "\t"
    elif cell_metadata_path.endswith(".csv"):
        metadata_sep = ","
    else:
        raise ValueError("cell_metadata_path must be .tsv, .txt, or .csv")
    cell_data = pd.read_csv(cell_metadata_path, sep=metadata_sep, index_col=0)
    print(f"Cells metadata loaded with shape {cell_data.shape}")

Cells metadata loaded with shape (8362, 83)


In [45]:
# Make sure the counts matrix is oriented as (regions, barcodes); transpose it
# when it was stored the other way round, and fail if neither orientation fits.
n_rows, n_cols = cnt_mtx.shape
expected_shape = (len(regions), len(bcs))
if (n_rows, n_cols) == expected_shape:
    print("Counts matrix is already in the correct shape")
elif (n_cols, n_rows) == expected_shape:
    print("Transposing counts matrix to match barcodes and regions numbers...")
    cnt_mtx = cnt_mtx.T
else:
    raise ValueError(
        "Counts matrix shape does not match either barcodes or regions list"
    )

Counts matrix is already in the correct shape


In [46]:
# Wrap the sparse counts matrix in a sparse DataFrame labelled with region
# names (rows) and cell barcodes (columns).
df = pd.DataFrame.sparse.from_spmatrix(cnt_mtx, index=regions, columns=bcs)
# No filtering has happened yet at this point, so report the labelled shape
# instead of the copy-pasted "after filtering" message.
print(f"Counts matrix shape (regions x cells): {df.shape}")

Counts matrix shape after filtering: (188090, 5843)


In [47]:
# Restrict both the counts DataFrame and the metadata table to the barcodes
# they have in common. `bcs` is rebound to that shared set (a pandas Index)
# and is reused by later cells.
if cell_metadata_path is not None:
    metadata_barcodes = cell_data.index
    bcs = df.columns.intersection(metadata_barcodes)
    print(f"{len(bcs)} cells have metadata...")
    df, cell_data = df[bcs], cell_data.loc[bcs]
    print(f"Counts matrix shape after filtering: {df.shape}")

5843 cells have metadata...
Counts matrix shape after filtering: (188090, 5843)


In [49]:
# Sanity check: the first five entries of the barcode list, the counts columns
# and the metadata index should line up.
(
    bcs[:5],
    df.columns[:5],
    cell_data.index[:5],
)

(Index(['dm45a#GGTATTTCAGGCTGTT-1', 'dm45a#ACGACAAAGCCTCTGT-1',
        'dm45a#GCGGGTTTCTCACTCA-1', 'dm45a#ACCATTAAGAGCCGCT-1',
        'dm45a#GTGTTCCTCCCGCAAA-1'],
       dtype='object'),
 Index(['dm45a#GGTATTTCAGGCTGTT-1', 'dm45a#ACGACAAAGCCTCTGT-1',
        'dm45a#GCGGGTTTCTCACTCA-1', 'dm45a#ACCATTAAGAGCCGCT-1',
        'dm45a#GTGTTCCTCCCGCAAA-1'],
       dtype='object'),
 Index(['dm45a#GGTATTTCAGGCTGTT-1', 'dm45a#ACGACAAAGCCTCTGT-1',
        'dm45a#GCGGGTTTCTCACTCA-1', 'dm45a#ACCATTAAGAGCCGCT-1',
        'dm45a#GTGTTCCTCCCGCAAA-1'],
       dtype='object'))

In [50]:
# Build the cisTopic object from the labelled counts DataFrame.
# No blacklist file is supplied and the project is named "test".
print("Creating cisTopic object...")
cistopic_kwargs = dict(
    fragment_matrix=df,
    cell_names=bcs,
    region_names=regions,
    path_to_blacklist=None,
    project="test",
)
cistopic_obj = create_cistopic_object(**cistopic_kwargs)

Creating cisTopic object...
2024-03-19 07:58:26,499 cisTopic     INFO     Converting fragment matrix to sparse matrix
2024-03-19 07:58:38,898 cisTopic     INFO     Creating CistopicObject
2024-03-19 07:58:39,975 cisTopic     INFO     Done!


In [51]:
# Attach the per-cell metadata table to the cisTopic object; skipped entirely
# when no metadata path was configured.
attach_metadata = cell_metadata_path is not None
if attach_metadata:
    print("Adding cell metadata to cisTopic object...")
    cistopic_obj.add_cell_data(cell_data)

Adding cell metadata to cisTopic object...
Columns ['sample_id'] will be overwritten


# DONE!

---