# Install

```bash
mamba create -n tiledb  -c conda-forge -c tiledb tiledbsoma-py requests
```

# Download test dataset

In [22]:
import os

# URL of the example PBMC3k AnnData (.h5ad) file used in the first part of the notebook.
# Alternative source, currently unused:
#file_url = "https://github.com/single-cell-data/TileDB-SOMA/blob/main/data/pbmc3k_processed.h5ad"
file_url = "https://github.com/chanzuckerberg/cellxgene/raw/main/example-dataset/pbmc3k.h5ad"

# Working directory for the downloaded file and the SOMA experiments.
# Set the TILEDB_WORK_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used unchanged.
work_dir = os.environ.get("TILEDB_WORK_DIR", "/home/nickyoungblut/dev/tmp/tiledb")

In [23]:
import os
import requests

# Local file path to save the downloaded file (named after the last URL component)
h5ad_file = os.path.join(work_dir, os.path.basename(file_url))

# Make sure the target directory exists. Without it, open() below raises
# FileNotFoundError, which is not a RequestException and so would escape the
# handler with a confusing traceback.
os.makedirs(work_dir, exist_ok=True)

try:
    # Send a GET request to the file URL. The (connect, read) timeout in seconds
    # keeps the cell from hanging indefinitely on a stalled connection; a timeout
    # surfaces as a RequestException and is reported by the handler below.
    response = requests.get(file_url, timeout=(10, 300))
    response.raise_for_status()  # Turn HTTP error statuses into exceptions

    # Write the content of the file to the local system
    with open(h5ad_file, "wb") as file:
        file.write(response.content)

    print(f"File downloaded successfully as {h5ad_file}")
except requests.exceptions.RequestException as e:
    # Report network/HTTP failures; later cells need h5ad_file to exist.
    print(f"Error downloading the file: {e}")

File downloaded successfully as /home/nickyoungblut/dev/tmp/tiledb/pbmc3k.h5ad


# Create db

In [25]:
import tiledbsoma
import tiledbsoma.io

In [None]:
# Ingest the downloaded .h5ad into a SOMA experiment located at <work_dir>/pbmc3k.
# The expression data is stored under the "RNA" measurement, and the returned
# URI is what the query cell below opens.
pbmc3k_uri = tiledbsoma.io.from_h5ad(
    os.path.join(work_dir, "pbmc3k"),
    input_path=h5ad_file,
    measurement_name="RNA",
)


This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(


In [27]:
# Keep cells with more than 500 detected genes that belong to one of two
# louvain clusters; the filter expression is handed to the obs reader.
obs_filter = "n_genes >500 and louvain in ['Megakaryocytes', 'CD4 T cells']"

with tiledbsoma.open(pbmc3k_uri) as pbmc3k_soma:
    pbmc3k_obs_slice = pbmc3k_soma.obs.read(value_filter=obs_filter)

    # The read result is an iterator; concat() merges it into one pyarrow.Table
    print(pbmc3k_obs_slice.concat())

pyarrow.Table
soma_joinid: int64
obs_id: large_string
n_genes: int64
percent_mito: float
n_counts: float
louvain: dictionary<values=string, indices=int32, ordered=0>
----
soma_joinid: [[0,2,8,11,12,...,2617,2621,2626,2631,2637]]
obs_id: [["AAACATACAACCAC-1","AAACATTGATCAGC-1","AAACGCTGTAGCCA-1","AAACTTGATCCAGA-1","AAAGAGACGAGATA-1",...,"TTGTAGCTAGCTCA-1","TTTAGCTGATACCG-1","TTTCACGAGGTTCA-1","TTTCCAGAGGTGAG-1","TTTGCATGCCTCAC-1"]]
n_genes: [[781,1131,533,751,866,...,933,887,721,873,724]]
percent_mito: [[0.030177759,0.008897362,0.011764706,0.010887772,0.010788382,...,0.02224871,0.022875817,0.013261297,0.0068587107,0.008064516]]
n_counts: [[2419,3147,1275,2388,2410,...,2517,2754,2036,2187,1984]]
louvain: [  -- dictionary:
["CD4 T cells","CD14+ Monocytes","B cells","CD8 T cells","NK cells","FCGR3A+ Monocytes","Dendritic cells","Megakaryocytes"]  -- indices:
[0,0,0,0,0,...,0,0,0,0,0]]


# Create from 10x MTX matrix

In [45]:
import tempfile
from shutil import rmtree
import scanpy as sc

In [46]:
# SRA experiment accession of the dataset to ingest. It is defined first so the
# input directory below is derived from it and the two can never disagree.
srx_accession = "SRX21101392"

# Directory holding the filtered gene-count matrix for this accession.
input_path = (
    "/home/nickyoungblut/dev/tmp/tiledb/SCRECOUNTER_2025-01-24_23-55-08"
    f"/STAR/{srx_accession}/Gene/filtered"
)

In [47]:
# Load the 10x-style matrix directory into an AnnData object, using the
# gene IDs (rather than gene symbols) as the variable names.
adata = sc.read_10x_mtx(input_path, var_names="gene_ids", make_unique=True)

In [48]:
# Tag every cell (obs row) with the accession it was read from
adata.obs["SRX_accession"] = [srx_accession] * adata.n_obs

In [49]:
# Stage the AnnData object as "adata.h5ad" inside a freshly created temporary
# directory so it can be ingested from a file path in the next cell.
# NOTE: this rebinds `h5ad_file`, which earlier pointed at the downloaded
# pbmc3k file; re-running the pbmc3k ingest cell afterwards would pick up this
# path instead.
# NOTE(review): nothing visible in this notebook removes `temp_dir` afterwards;
# confirm whether cleanup is wanted once ingestion has finished.
temp_dir = tempfile.mkdtemp()
h5ad_file = os.path.join(temp_dir, "adata.h5ad")
adata.write_h5ad(h5ad_file)

In [50]:
# Location of the SOMA experiment for this accession: <work_dir>/<accession>
db_dir = os.path.join(work_dir, srx_accession)

# Start from a clean slate: remove whatever a previous run left at that path
if os.path.exists(db_dir):
    rmtree(db_dir)

# Ingest the staged .h5ad under the "RNA" measurement; the returned URI is the
# handle used by every later cell that opens this experiment
srx_uri = tiledbsoma.io.from_h5ad(
    db_dir,
    input_path=h5ad_file,
    measurement_name="RNA",
)

In [51]:
# Read back the cell barcodes together with the accession column added above
with tiledbsoma.Experiment.open(srx_uri) as exp:
    obs_table = exp.obs.read(column_names=["obs_id", "SRX_accession"]).concat()
    print(obs_table.to_pandas())

                obs_id SRX_accession
0     AAACCTGCAAGCCTAT   SRX21101392
1     AAACCTGCAGCTGCTG   SRX21101392
2     AAACCTGCAGCTGGCT   SRX21101392
3     AAACCTGGTCTGATCA   SRX21101392
4     AAACCTGGTTCCCGAG   SRX21101392
...                ...           ...
6306  TTTGTCAGTCGTCTTC   SRX21101392
6307  TTTGTCAGTGGCAAAC   SRX21101392
6308  TTTGTCATCACCGTAA   SRX21101392
6309  TTTGTCATCGCACTCT   SRX21101392
6310  TTTGTCATCTAAGCCA   SRX21101392

[6311 rows x 2 columns]


In [53]:
# Read the variable (gene) annotations of the "RNA" measurement:
# join id, gene ID and gene symbol
with tiledbsoma.Experiment.open(srx_uri) as exp:
    var_reader = exp.ms["RNA"].var.read(
        column_names=["soma_joinid", "var_id", "gene_symbols"]
    )
    print(var_reader.concat().to_pandas())

       soma_joinid           var_id gene_symbols
0                0  ENSG00000243485  MIR1302-2HG
1                1  ENSG00000237613      FAM138A
2                2  ENSG00000186092        OR4F5
3                3  ENSG00000238009   AL627309.1
4                4  ENSG00000239945   AL627309.3
...            ...              ...          ...
36596        36596  ENSG00000277836   AC141272.1
36597        36597  ENSG00000278633   AC023491.2
36598        36598  ENSG00000276017   AC007325.1
36599        36599  ENSG00000278817   AC007325.4
36600        36600  ENSG00000277196   AC007325.2

[36601 rows x 3 columns]


## Append data

In [58]:
# SRA experiment accession of the dataset to append. It is defined first so the
# input directory below is derived from it and the two can never disagree.
srx_accession = "SRX24099779"

# Directory holding the filtered gene-count matrix for this accession.
input_path = (
    "/home/nickyoungblut/dev/tmp/tiledb/SCRECOUNTER_2025-01-22_01-10-09"
    f"/STAR/{srx_accession}/Gene/filtered"
)

In [59]:
# Load the second 10x-style matrix directory; this rebinds `adata` to the
# dataset that is going to be appended. Gene IDs are used as variable names.
adata = sc.read_10x_mtx(input_path, var_names="gene_ids", make_unique=True)

In [60]:
# Tag every cell (obs row) of the new dataset with its accession
adata.obs["SRX_accession"] = [srx_accession] * adata.n_obs

In [61]:
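# Not needed when appending: the in-memory `adata` is registered and ingested
# directly in the cells below, so no temporary .h5ad file is written.
# # write adata file to temp location
# temp_dir = tempfile.mkdtemp()
# h5ad_file = os.path.join(temp_dir, "adata.h5ad")
# adata.write_h5ad(h5ad_file)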

In [62]:
# Register the new AnnData against the existing experiment, matching cells on
# "obs_id" and genes on "var_id". The resulting mapping `rd` is used below both
# to resize the experiment and to ingest the data.
rd = tiledbsoma.io.register_anndatas(
    srx_uri, [adata],
    measurement_name="RNA", obs_field_name="obs_id", var_field_name="var_id",
)

In [None]:
# Print the current domain/shape of each array in the experiment (before resizing)
with tiledbsoma.Experiment.open(srx_uri) as exp:
    experiment_uri = exp.uri
    tiledbsoma.io.show_experiment_shapes(experiment_uri)


[DataFrame] obs 
  URI file:///home/nickyoungblut/dev/tmp/tiledb/SRX21101392/obs
  non_empty_domain     ((0, 6310),)
  domain               ((0, 6310),)
  maxdomain            ((0, 9223372036854773758),)
  upgraded             True

[DataFrame] ms/RNA/var 
  URI file:///home/nickyoungblut/dev/tmp/tiledb/SRX21101392/ms/RNA/var
  non_empty_domain     ((0, 36600),)
  domain               ((0, 36600),)
  maxdomain            ((0, 9223372036854773758),)
  upgraded             True

[SparseNDArray] ms/RNA/X/data 
  URI file:///home/nickyoungblut/dev/tmp/tiledb/SRX21101392/ms/RNA/X/data
  used_shape           ((0, 6310), (0, 36599))
  shape                (6311, 36601)
  maxshape             (9223372036854773759, 9223372036854773759)
  upgraded             True


In [69]:
# Grow the experiment to the obs/var sizes reported by the registration mapping
# so that the data ingested next fits
with tiledbsoma.Experiment.open(srx_uri) as exp:
    new_nobs = rd.get_obs_shape()
    new_nvars = rd.get_var_shapes()
    tiledbsoma.io.resize_experiment(exp.uri, nobs=new_nobs, nvars=new_nvars)

In [70]:
# Append the registered AnnData to the existing experiment. The call is left as
# the final bare expression so the returned experiment URI is displayed.
tiledbsoma.io.from_anndata(
    srx_uri, adata, measurement_name="RNA", registration_mapping=rd
)

'file:///home/nickyoungblut/dev/tmp/tiledb/SRX21101392'

In [71]:
# Read obs again to confirm that cells from both accessions are now present
with tiledbsoma.Experiment.open(srx_uri) as exp:
    obs_table = exp.obs.read(column_names=["obs_id", "SRX_accession"]).concat()
    print(obs_table.to_pandas())

                obs_id SRX_accession
0     AAACCTGCAAGCCTAT   SRX21101392
1     AAACCTGCAGCTGCTG   SRX21101392
2     AAACCTGCAGCTGGCT   SRX21101392
3     AAACCTGGTCTGATCA   SRX21101392
4     AAACCTGGTTCCCGAG   SRX21101392
...                ...           ...
9994  TTTGTTGCATTAAGCC   SRX24099779
9995  TTTGTTGGTCGACTGC   SRX24099779
9996  TTTGTTGTCTAGTCAG   SRX24099779
9997  TTTGTTGTCTCTGCCA   SRX24099779
9998  TTTGTTGTCTGGGCGT   SRX24099779

[9999 rows x 2 columns]
