# Goals

* Test the creation of a tiledb database

In [6]:
import os
import requests
import tiledbsoma
import tiledbsoma.io
import scanpy as sc

In [7]:
work_dir = "/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb"

# Create db

In [3]:
srx_accession = "SRX21101392"
input_path = os.path.join(work_dir, "prod3", "SCRECOUNTER_2025-01-24_23-55-08", "STAR", srx_accession, "Gene", "filtered")

In [4]:
# Read 10x mtx data
adata = sc.read_10x_mtx(
    input_path,
    var_names="gene_ids",  
    make_unique=True  
)
adata.shape

(6311, 36601)

In [9]:
# add SRX column
adata.obs["SRX_accession"] = [srx_accession] * len(adata.obs)

In [9]:
# location of tiledb
db_uri = os.path.join(work_dir, "scRecount_test")

In [13]:
## create db
tiledbsoma.io.from_anndata(
    db_uri,
    adata,
    measurement_name="RNA",
)

'/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/scRecount_test'

### Add data

In [14]:
srx_accession = "SRX24099779"
input_path = os.path.join(work_dir, "prod3", "SCRECOUNTER_2025-01-22_01-10-09", "STAR", srx_accession, "Gene", "filtered")

In [15]:
# Read 10x mtx data
adata = sc.read_10x_mtx(
    input_path,
    var_names="gene_ids",  
    make_unique=True  
)
adata.shape

(3688, 32285)

In [16]:
# add SRX column
adata.obs["SRX_accession"] = [srx_accession] * len(adata.obs)

In [17]:
# register
rd = tiledbsoma.io.register_anndatas(
    db_uri,
    [adata],
    measurement_name="RNA",
    obs_field_name="obs_id",
    var_field_name="var_id",
)

In [18]:
# apply resize
with tiledbsoma.Experiment.open(db_uri) as exp:
    tiledbsoma.io.resize_experiment(
        exp.uri, 
        nobs=rd.get_obs_shape(), 
        nvars=rd.get_var_shapes()
    )

In [19]:
# ingest new data into the db
tiledbsoma.io.from_anndata(
    db_uri,
    adata,
    measurement_name="RNA",
    registration_mapping=rd,
)

'file:///home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/scRecount_test'

### Add more data

In [20]:
srx_accession = "SRX20585665"
input_path = os.path.join(work_dir, "prod3", "SCRECOUNTER_2025-01-22_01-10-09", "STAR", srx_accession, "Gene", "filtered")

In [21]:
# Read 10x mtx data
adata = sc.read_10x_mtx(
    input_path,
    var_names="gene_ids",  
    make_unique=True  
)
adata.shape

(983, 36601)

In [22]:
# add SRX column
adata.obs["SRX_accession"] = [srx_accession] * len(adata.obs)

In [23]:
# register
rd = tiledbsoma.io.register_anndatas(
    db_uri,
    [adata],
    measurement_name="RNA",
    obs_field_name="obs_id",
    var_field_name="var_id",
)

In [24]:
# apply resize
with tiledbsoma.Experiment.open(db_uri) as exp:
    tiledbsoma.io.resize_experiment(
        exp.uri, 
        nobs=rd.get_obs_shape(), 
        nvars=rd.get_var_shapes()
    )

In [25]:
# ingest new data into the db
tiledbsoma.io.from_anndata(
    db_uri,
    adata,
    measurement_name="RNA",
    registration_mapping=rd,
)

'file:///home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/scRecount_test'

# Summary

In [27]:
# get colnames
with tiledbsoma.open(db_uri) as exp:
    print(exp.obs.schema)

soma_joinid: int64 not null
obs_id: large_string
SRX_accession: large_string


In [28]:
# read in the data
with tiledbsoma.Experiment.open(db_uri) as exp:
    print(
        exp.obs.read(column_names=["obs_id", "SRX_accession"])
        .concat()
        .to_pandas()["SRX_accession"].value_counts()
    )

SRX_accession
SRX21101392    6311
SRX24099779    3688
SRX20585665     975
Name: count, dtype: int64


# Update metadata

In [17]:
import pyarrow as pa

In [26]:
# query
obs_query = tiledbsoma.AxisQuery(value_filter='SRX_accession in ["SRX20585665_mod"]')

# get the target records
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.axis_query("RNA", obs_query=obs_query)
        .obs()
        .concat()
        .to_pandas()
    )


# modify and update
with tiledbsoma.Experiment.open(db_uri, "w") as exp:
    # modify metadata
    df["SRX_accession"] = "SRX20585665" #df["SRX_accession"] 

    # update the database
    exp.obs.write(pa.Table.from_pandas(df))