# Notes

* code to check/summarize a tiledb database

In [1]:
import os
import tiledbsoma
import tiledbsoma.io

In [2]:
#db_uri = "/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/tiledb_exp1"
db_uri = "/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/tiledb_prod3"

# Summary

## Obs

In [None]:
# get colnames
with tiledbsoma.open(db_uri) as exp:
    print(exp.obs.schema)

soma_joinid: int64 not null
obs_id: large_string
SRX_accession: large_string
lib_prep: large_string
tech_10x: large_string
organism: large_string
tissue: large_string
disease: large_string
purturbation: large_string
cell_line: large_string
czi_collection_id: large_string
czi_collection_name: large_string


In [22]:
# read in the data
with tiledbsoma.Experiment.open(db_uri) as exp:
    print(
        exp.obs.read(column_names=["obs_id", "SRX_accession"])
        .concat()
        .group_by(["SRX_accession"])
        .aggregate([
            ([], 'count_all'),
        ])
        .to_pandas()
    )

    SRX_accession  count_all
0     SRX23386174      25046
1     SRX25595940      11779
2     SRX19303371       6520
3     SRX24003271       9849
4     SRX26275147       9007
..            ...        ...
188   SRX25989517       6467
189   SRX24911288       3638
190   SRX25408713       4996
191   SRX26517486       2672
192   SRX20347966       3282

[193 rows x 2 columns]


### Metadata

In [None]:
# read in the data
# with tiledbsoma.Experiment.open(db_uri) as exp:
#     print(
#         exp.obs.read(column_names=["obs_id", "SRX_accession", "tech_10x", "organism"])
#         .concat()
#         .to_pandas()
#         .groupby(["tech_10x", "organism"])
#         .size()
#         .reset_index(name="count")
#     )

             tech_10x organism   count
0         3_prime_gex      NaN   20489
1         3_prime_gex    human  338493
2         3_prime_gex    mouse  370714
3         5_prime_gex    human  119967
4         5_prime_gex    mouse   67318
5                 NaN      NaN    6497
6   feature_barcoding    mouse    5398
7            multiome    human    1907
8            multiome    mouse    6596
9      not_applicable    human   17975
10     not_applicable    mouse   25694
11              other    human   36557
12              other    mouse   12511
13                vdj    human    6274


In [None]:
# read in the data and summarize
with tiledbsoma.Experiment.open(db_uri) as exp:
    print(
        exp.obs.read()
        .concat()
        .group_by(["organism", "tech_10x"])
        .aggregate([
            ([], 'count_all'),
        ])
        .to_pandas()
    )

   organism           tech_10x  count_all
0     human        3_prime_gex     394704
1     mouse        3_prime_gex     433913
2     mouse              other      12511
3     human                vdj       6274
4     human        5_prime_gex     137467
5     human           multiome      10504
6     mouse        5_prime_gex      82652
7     mouse     not_applicable      25694
8     mouse           multiome       6596
9       NaN        3_prime_gex      20489
10     None               None       4774
11      NaN                NaN       6497
12    human              other      36557
13    human     not_applicable      17975
14    mouse  feature_barcoding       5398


## Var

In [23]:
# read in the var (gene) level data
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.ms["RNA"]
        .var.read(column_names=["soma_joinid", "var_id", "gene_symbols"])
        .concat()
        .to_pandas()
    )
df

Unnamed: 0,soma_joinid,var_id,gene_symbols
0,0,ENSG00000243485,MIR1302-2HG
1,1,ENSG00000237613,FAM138A
2,2,ENSG00000186092,OR4F5
3,3,ENSG00000238009,AL627309.1
4,4,ENSG00000239945,AL627309.3
...,...,...,...
68881,68881,ENSMUSG00000095523,AC124606.1
68882,68882,ENSMUSG00000095475,AC133095.2
68883,68883,ENSMUSG00000094855,AC133095.1
68884,68884,ENSMUSG00000095019,AC234645.1


### Counts

In [11]:
# slice out a sparse matrix
with tiledbsoma.Experiment.open(db_uri) as exp:
    print(exp.ms["RNA"].X["data"].read([slice(0, 5)]).coos().concat())

<pyarrow.SparseCOOTensor>
type: float
shape: (668678, 68886)


In [12]:
# slice by metadata and return as anndata
obs_query = tiledbsoma.AxisQuery(value_filter='tech_10x in ["vdj"]')

with tiledbsoma.Experiment.open(db_uri) as exp:
    print(exp.axis_query("RNA", obs_query=obs_query).to_anndata(X_name="data"))

AnnData object with n_obs × n_vars = 6274 × 68886
    obs: 'soma_joinid', 'obs_id', 'SRX_accession', 'lib_prep', 'tech_10x', 'organism', 'tissue', 'disease', 'purturbation', 'cell_line', 'czi_collection_id', 'czi_collection_name'
    var: 'soma_joinid', 'var_id', 'gene_symbols', 'feature_types'


In [15]:
# slice to just the first 5 cells
obs_query = tiledbsoma.AxisQuery(coords=[slice(0, 4)]) 

with tiledbsoma.Experiment.open(db_uri) as exp:
    adata = exp.axis_query("RNA", obs_query=obs_query).to_anndata(X_name="data")
    print(adata)

AnnData object with n_obs × n_vars = 5 × 68886
    obs: 'soma_joinid', 'obs_id', 'SRX_accession', 'lib_prep', 'tech_10x', 'organism', 'tissue', 'disease', 'purturbation', 'cell_line', 'czi_collection_id', 'czi_collection_name'
    var: 'soma_joinid', 'var_id', 'gene_symbols', 'feature_types'


# Chris's processed data

In [25]:
db_uri = "/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/tiledb_counted"

In [26]:
# get colnames
with tiledbsoma.open(db_uri) as exp:
    print(exp.obs.schema)

soma_joinid: int64 not null
obs_id: large_string
SRX_accession: large_string
lib_prep: large_string
tech_10x: large_string
organism: large_string
tissue: large_string
disease: large_string
purturbation: large_string
cell_line: large_string
czi_collection_id: large_string
czi_collection_name: large_string


In [27]:
# read in the data
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.obs.read(column_names=["obs_id", "SRX_accession"])
        .concat()
        .to_pandas()
    )
df

Unnamed: 0,obs_id,SRX_accession
0,AAACCCAAGAGATCGC,SRX13549201
1,AAACCCAAGGTCTTTG,SRX13549201
2,AAACCCAAGTTGGCGA,SRX13549201
3,AAACCCACAACCCTAA,SRX13549201
4,AAACCCACAATACCCA,SRX13549201
...,...,...
35122,TTTGTTGGTTCAGCTA,SRX21819799
35123,TTTGTTGTCACTGTCC,SRX21819799
35124,TTTGTTGTCGAGTACT,SRX21819799
35125,TTTGTTGTCTAATTCC,SRX21819799


In [28]:
df["SRX_accession"].value_counts()

SRX_accession
SRX13549201    14392
SRX21819799     7139
SRX17937428     5427
SRX11523721     5288
SRX7117648      2881
Name: count, dtype: int64

### Metadata

In [33]:
# read in the data
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.obs.read(column_names=["obs_id", "SRX_accession", "tech_10x", "organism", "czi_collection_id", "czi_collection_name"])
        .concat()
        .to_pandas()
    )
df

Unnamed: 0,obs_id,SRX_accession,tech_10x,organism,czi_collection_id,czi_collection_name
0,AAACCCAAGAGATCGC,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
1,AAACCCAAGGTCTTTG,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
2,AAACCCAAGTTGGCGA,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
3,AAACCCACAACCCTAA,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
4,AAACCCACAATACCCA,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
...,...,...,...,...,...,...
35122,TTTGTTGGTTCAGCTA,SRX21819799,,,a0c84e3f-a5ca-4481-b3a5-ccfda0a81ecc,Unified comprehensive single-cell atlas of the...
35123,TTTGTTGTCACTGTCC,SRX21819799,,,a0c84e3f-a5ca-4481-b3a5-ccfda0a81ecc,Unified comprehensive single-cell atlas of the...
35124,TTTGTTGTCGAGTACT,SRX21819799,,,a0c84e3f-a5ca-4481-b3a5-ccfda0a81ecc,Unified comprehensive single-cell atlas of the...
35125,TTTGTTGTCTAATTCC,SRX21819799,,,a0c84e3f-a5ca-4481-b3a5-ccfda0a81ecc,Unified comprehensive single-cell atlas of the...


In [30]:
# count by tech_10x and organism
df.groupby(["tech_10x", "organism"]).size().reset_index(name="count")

Unnamed: 0,tech_10x,organism,count
0,3_prime_gex,,22700
1,,,7139
