# Description

Tutorial on how to read from the scBaseCamp TileDB-SOMA database

# Set up

[tiledbsoma-py](https://pypi.org/project/tiledbsoma/) must be installed

In [7]:
# Path (local filesystem or cloud URI) of the TileDB-SOMA experiment; used by every cell below.
# NOTE(review): this is a machine-specific absolute path -- adjust it for your environment.
db_uri = "/scratch/multiomics/nickyoungblut/tiledb-loader/tiledb-soma_GeneFull_Ex50pAS"
# Alternative database location; uncomment to use it instead of the path above.
#db_uri = "/home/nickyoungblut/tmp/tiledb/db_bkup/2025-02-10_tiledb-soma_GeneFull_Ex50pAS"

In [8]:
# load packages
import numpy as np
import pandas as pd
import tiledbsoma
import tiledbsoma.io

# Observations

In [9]:
# Print the schema of the obs (per-observation metadata) dataframe:
# one `name: type` line per column.
with tiledbsoma.open(db_uri) as exp:
    print(exp.obs.schema)

soma_joinid: int64 not null
obs_id: large_string
gene_count: int64
umi_count: float
barcode: large_string
SRX_accession: dictionary<values=string, indices=int32, ordered=0>
lib_prep: dictionary<values=string, indices=int32, ordered=0>
tech_10x: dictionary<values=string, indices=int32, ordered=0>
organism: dictionary<values=string, indices=int32, ordered=0>
tissue: dictionary<values=string, indices=int32, ordered=0>
disease: dictionary<values=string, indices=int32, ordered=0>
purturbation: dictionary<values=string, indices=int32, ordered=0>
cell_line: dictionary<values=string, indices=int32, ordered=0>
czi_collection_id: dictionary<values=string, indices=int32, ordered=0>
czi_collection_name: dictionary<values=string, indices=int32, ordered=0>


In [10]:
# view all metadata for the first 3 observations
# The row selection is pushed into read() so that only the requested rows are
# fetched, instead of loading the entire obs table and slicing it afterwards.
# SOMA coordinate slices are inclusive on both ends: slice(0, 2) -> rows 0, 1, 2.
# NOTE(review): assumes obs is indexed by soma_joinid (the default) -- confirm.
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.obs.read(coords=(slice(0, 2),))
        .concat()
        .to_pandas()
    )
# show every metadata column instead of eliding the middle ones
pd.set_option('display.max_columns', 100)
df

Unnamed: 0,soma_joinid,obs_id,gene_count,umi_count,barcode,SRX_accession,lib_prep,tech_10x,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name
0,0,AAACCCAAGGGAGATA_SRX10681588,1677,4085.0,AAACCCAAGGGAGATA,SRX10681588,10x_Genomics,3_prime_gex,human,blood,not specified,lean,not applicable,,
1,1,AAACCCAAGGTTGGAC_SRX10681588,1371,3866.0,AAACCCAAGGTTGGAC,SRX10681588,10x_Genomics,3_prime_gex,human,blood,not specified,lean,not applicable,,
2,2,AAACCCACAGTGGTGA_SRX10681588,1891,3936.0,AAACCCACAGTGGTGA,SRX10681588,10x_Genomics,3_prime_gex,human,blood,not specified,lean,not applicable,,


In [11]:
# count total observations
# `DataFrame.count` reports the number of rows directly, so no obs column has
# to be read into memory just to be counted.
with tiledbsoma.open(db_uri) as exp:
    total_cells = exp.obs.count
print(f"Total cells: {total_cells}")

Total cells: 139856479


In [12]:
# observations per dataset:
# read only the SRX_accession column, count the rows in each accession,
# and list the accessions from most to fewest observations
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.obs.read(column_names=["SRX_accession"])
        .concat()
        .group_by(["SRX_accession"])
        .aggregate([
            ([], 'count_all'),
        ])
        .sort_by([("count_all", "descending")])
        .to_pandas()
    )
df

Unnamed: 0,SRX_accession,count_all
0,SRX15016115,148051
1,SRX19719427,107903
2,SRX15016116,103365
3,SRX22915751,100908
4,SRX17521046,98235
...,...,...
18450,SRX19926788,50
18451,SRX19704133,50
18452,SRX15483880,47
18453,SRX6640133,47


## Metadata

### Summary

In [8]:
# obs count per 10X Genomics technology:
# read only the tech_10x column, count the rows for each value,
# and sort from most to fewest observations
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.obs.read(column_names=["tech_10x"])
        .concat()
        .group_by(["tech_10x"])
        .aggregate([
            ([], 'count_all'),
        ])
        .sort_by([("count_all", "descending")])
        .to_pandas()
    )
df

Unnamed: 0,tech_10x,count_all
0,3_prime_gex,90485517
1,5_prime_gex,20009002
2,multiome,3210055
3,feature_barcoding,1959241
4,vdj,1879173
5,not_applicable,997406
6,other,973283
7,,489760
8,cellplex,291407
9,,247285


In [9]:
# obs count per organism:
# read only the organism column, count the rows for each value,
# and sort from most to fewest observations
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.obs.read(column_names=["organism"])
        .concat()
        .group_by(["organism"])
        .aggregate([
            ([], 'count_all'),
        ])
        .sort_by([("count_all", "descending")])
        .to_pandas()
    )
df

Unnamed: 0,organism,count_all
0,human,65052802
1,mouse,53187403
2,,1830849
3,,489760


### Query

In [17]:
# create metadata query: keep only observations whose organism is "human"
obs_query = tiledbsoma.AxisQuery(value_filter='organism in ["human"]')

# obs count per organism and 10X Genomics technology for the query results
# Only the columns used by the filter and the grouping are read; fetching every
# obs column for all matching observations is unnecessary for a count.
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.axis_query("RNA", obs_query=obs_query)
        .obs(column_names=["organism", "tech_10x"])
        .concat()
        .group_by(["organism", "tech_10x"])
        .aggregate([
            ([], 'count_all'),
        ])
        .sort_by([("count_all", "descending")])
        .to_pandas()
    )
df

Unnamed: 0,organism,tech_10x,count_all
0,human,3_prime_gex,51452698
1,human,5_prime_gex,16149396
2,human,multiome,2241595
3,human,feature_barcoding,1772353
4,human,vdj,1570771
5,human,not_applicable,537019
6,human,other,521553
7,human,cellplex,80411
8,human,atac,15988


In [11]:
# create metadata query: human, CellPlex observations with at least 1000 genes detected
obs_query = tiledbsoma.AxisQuery(
    value_filter='organism in ["human"] and tech_10x in ["cellplex"] and gene_count >= 1000'
)

# gene and umi counts per observation, highest gene count first
# Only the filter columns and the reported columns are read from the database;
# select() then keeps just the reported columns, in a fixed order.
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.axis_query("RNA", obs_query=obs_query)
        .obs(column_names=["SRX_accession", "gene_count", "umi_count", "organism", "tech_10x"])
        .concat()
        .select(["SRX_accession", "gene_count", "umi_count"])
        .sort_by([("gene_count", "descending")])
        .to_pandas()
    )
df

Unnamed: 0,SRX_accession,gene_count,umi_count
0,SRX22369542,14245,272316.0
1,SRX22369542,14093,272180.0
2,SRX22369542,14056,282893.0
3,SRX22369542,14018,253654.0
4,SRX22369542,13895,265843.0
...,...,...,...
46232,SRX22519782,1000,1372.0
46233,SRX22519782,1000,1382.0
46234,SRX22519782,1000,1648.0
46235,SRX22519782,1000,1612.0


In [24]:
# filter to target accessions

## create query
target_acc = [
    "SRX24745432", "SRX17878115", "SRX21370979", "SRX25986076", "SRX25986075", "SRX25986079",
    "SRX17878109", "SRX21370976", "SRX21370978", "SRX17878114", "SRX21370977", "SRX17878113",
    "SRX25986074", "SRX17878104", "SRX17878105", "SRX17878112", "SRX20300901", "SRX20300902",
    "SRX20300905", "SRX20300900", "SRX17878106", "SRX24534638", "SRX17878111", "SRX21370974",
    "SRX21370975", "SRX20300906", "SRX20300903", "SRX20300899", "SRX17878110", "SRX17878107",
    "SRX20300904", "SRX17878108"
]
# value filters are plain strings: render the list as "a","b",... for the `in [...]` clause
target_acc = ",".join([f'"{x}"' for x in target_acc])
obs_query = tiledbsoma.AxisQuery(value_filter=f'SRX_accession in [{target_acc}]')

## query the database
# obs count per accession / organism / 10X technology; only the columns needed
# for the filter and the grouping are read
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.axis_query("RNA", obs_query=obs_query)
        .obs(column_names=["SRX_accession", "organism", "tech_10x"])
        .concat()
        .group_by(["SRX_accession", "organism", "tech_10x"])
        .aggregate([
            ([], 'count_all'),
        ])
        .sort_by([("count_all", "descending")])
        .to_pandas()
    )
df

Unnamed: 0,SRX_accession,organism,tech_10x,count_all
0,SRX24745432,human,multiome,24079
1,SRX17878115,human,3_prime_gex,12815
2,SRX21370979,human,5_prime_gex,10135
3,SRX25986076,human,3_prime_gex,9395
4,SRX25986075,human,3_prime_gex,8654
5,SRX25986079,human,3_prime_gex,8353
6,SRX17878109,human,3_prime_gex,7067
7,SRX21370976,human,5_prime_gex,6711
8,SRX17878114,human,3_prime_gex,6385
9,SRX21370977,human,5_prime_gex,6372


### CZI collection datasets

In [23]:
# create metadata query: observations whose czi_collection_name equals the string "None"
obs_query = tiledbsoma.AxisQuery(value_filter='czi_collection_name == "None"')

# obs count per CZI collection name for the query results
# (only the czi_collection_name column is read, since nothing else is used)
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.axis_query("RNA", obs_query=obs_query)
        .obs(column_names=["czi_collection_name"])
        .concat()
        .group_by(["czi_collection_name"])
        .aggregate([
            ([], 'count_all'),
        ])
        .sort_by([("count_all", "descending")])
        .to_pandas()
    )
df

Unnamed: 0,czi_collection_name,count_all
0,,118240205


In [24]:
# create metadata query: observations whose czi_collection_name is anything other than the string "None"
obs_query = tiledbsoma.AxisQuery(value_filter='czi_collection_name != "None"')

# obs count per CZI collection name for the query results
# (only the czi_collection_name column is read, since nothing else is used)
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.axis_query("RNA", obs_query=obs_query)
        .obs(column_names=["czi_collection_name"])
        .concat()
        .group_by(["czi_collection_name"])
        .aggregate([
            ([], 'count_all'),
        ])
        .sort_by([("count_all", "descending")])
        .to_pandas()
    )
df

Unnamed: 0,czi_collection_name,count_all
0,Spatial proteogenomics reveals distinct and ev...,189890
1,Developmental cell programs are co-opted in in...,153916
2,Cross-tissue immune cell analysis reveals tiss...,139896
3,Human CellCards Multi-Study CellRef 1.0 Atlas,115615
4,A spatially resolved atlas of the human lung c...,112047
5,Single-cell eQTL mapping identifies cell type ...,104481
6,COVID-19 mRNA vaccine elicits a potent adaptiv...,98848
7,HypoMap ‚Äì a unified single cell gene express...,97187
8,Multi-scale spatial mapping of cell population...,81460
9,Cells of the adult human heart,74823


In [22]:
# create metadata query: observations whose tech_10x equals the string "NaN"
obs_query = tiledbsoma.AxisQuery(value_filter='tech_10x == "NaN"')

# obs count per CZI collection name for the query results
# (only the filter column and the grouping column are read)
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.axis_query("RNA", obs_query=obs_query)
        .obs(column_names=["tech_10x", "czi_collection_name"])
        .concat()
        .group_by(["czi_collection_name"])
        .aggregate([
            ([], 'count_all'),
        ])
        .sort_by([("count_all", "descending")])
        .to_pandas()
    )
df

Unnamed: 0,czi_collection_name,count_all
0,Cross-tissue immune cell analysis reveals tiss...,56390
1,Single Cell Multiome Atlas of the Human Fetal ...,51781
2,Unified comprehensive single-cell atlas of the...,47947
3,COVID-19 mRNA vaccine elicits a potent adaptiv...,47188
4,Time-resolved Systems Immunology Reveals a Lat...,18419
5,"Harmonized single-cell landscape, intercellula...",7114
6,Spatiotemporal analysis of human intestinal de...,6771
7,Single-cell transcriptome profiling of the hum...,6525
8,iPain atlas,5150


# Variables

In [11]:
# Print the schema of the var dataframe of the "RNA" measurement:
# one `name: type` line per column.
with tiledbsoma.open(db_uri) as exp:
    print(exp.ms["RNA"].var.schema)

soma_joinid: int64 not null
var_id: large_string


In [12]:
# read in the var level data:
# load the soma_joinid and var_id columns of the "RNA" measurement's var
# dataframe into a pandas DataFrame
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.ms["RNA"]
        .var.read(column_names=["soma_joinid", "var_id"])
        .concat()
        .to_pandas()
    )
df

Unnamed: 0,soma_joinid,var_id
0,0,ENSG00000000003
1,1,ENSG00000000005
2,2,ENSG00000000419
3,3,ENSG00000000457
4,4,ENSG00000000460
...,...,...
68881,68881,ENSMUSG00000118552
68882,68882,ENSMUSG00000118553
68883,68883,ENSMUSG00000118554
68884,68884,ENSMUSG00000118560


# Counts

## Slice

In [38]:
# sparse count matrix for the first 3 observations
# SOMA coordinate slices are inclusive on both ends, so rows 0, 1 and 2 are
# selected with slice(0, 2); slice(0, 3) would read four observations.
with tiledbsoma.Experiment.open(db_uri) as exp:
    print(
        exp.ms["RNA"].X["data"]
        .read((slice(0, 2),))
        .coos()
        .concat()
    )

<pyarrow.SparseCOOTensor>
type: float
shape: (52967109, 68886)


In [41]:
# Get gene count per obs
def get_genes_per_obs(db_uri, start, end):
    """
    Count the entries stored in the RNA ``X["data"]`` matrix for each observation
    row in the half-open range ``[start, end)``, i.e. the genes detected per obs.

    Args:
        db_uri: Path or URI of the TileDB-SOMA experiment to open.
        start: First observation row to include.
        end: Row at which to stop (exclusive, like a Python slice).

    Returns:
        1-D integer numpy array with one count per observation row in
        ``[start, end)``; empty when ``end <= start``.
    """
    # SOMA coordinate slices are inclusive on both ends, so the last row that
    # has to be read is end - 1. Clamping to `start` keeps the slice valid when
    # the requested range is empty (the final [start:end] then yields nothing).
    last_row = max(start, end - 1)
    with tiledbsoma.Experiment.open(db_uri) as exp:
        data = (
            exp.ms["RNA"].X["data"]
            .read((slice(start, last_row),)).coos().concat()
        )
    # In CSR form, consecutive differences of indptr are the number of stored
    # entries per row. The matrix keeps the array's full row count, so the
    # requested rows are picked out by position.
    return np.diff(data.to_scipy().tocsr().indptr)[start:end]

## first 10 observations (rows 0-9; `end` is exclusive in the returned array)
get_genes_per_obs(db_uri, 0, 10)

array([1677, 1371, 1891, 1967, 2050, 1303, 3151, 1705, 1276, 1507],
      dtype=int32)

In [43]:
# Get UMI count per obs
def get_umi_counts_per_obs(db_uri, start, end):
    """
    Sum the values of the RNA ``X["data"]`` matrix across all variables for each
    observation row in the half-open range ``[start, end)``, i.e. the UMI count per obs.

    Args:
        db_uri: Path or URI of the TileDB-SOMA experiment to open.
        start: First observation row to include.
        end: Row at which to stop (exclusive, like a Python slice).

    Returns:
        1-D numpy array with one row sum per observation row in
        ``[start, end)``; empty when ``end <= start``.
    """
    # SOMA coordinate slices are inclusive on both ends, so the last row that
    # has to be read is end - 1. Clamping to `start` keeps the slice valid when
    # the requested range is empty (the final [start:end] then yields nothing).
    last_row = max(start, end - 1)
    with tiledbsoma.Experiment.open(db_uri) as exp:
        data = exp.ms["RNA"].X["data"].read((slice(start, last_row),)).coos().concat()
    sp = data.to_scipy().tocsr()
    # sum(axis=1) gives an (n_rows, 1) matrix; .A1 flattens it to 1-D. The matrix
    # keeps the array's full row count, so the requested rows are picked by position.
    return sp.sum(axis=1).A1[start:end]

## first 10 observations (rows 0-9; `end` is exclusive in the returned array)
get_umi_counts_per_obs(db_uri, 0, 10)

array([4085., 3866., 3936., 5528., 6013., 2288., 7996., 4050., 3031.,
       4726.], dtype=float32)

# 