# Notes

* Code to check and summarize a TileDB-SOMA database

In [3]:
import os
import scanpy as sc
import tiledbsoma
import tiledbsoma.io

In [4]:
# Candidate databases that have been inspected with this notebook:
#db_uri = "/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/tiledb_exp1"
#db_uri = "/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/tiledb_prod3"
#db_uri = "/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/tiledb_TEST"
#db_uri = "/scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_prod_tmp"
#db_uri = "/scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_prod3_tmp"
#db_uri = "/scratch/multiomics/nickyoungblut/tiledb-loader/tiledb-soma_GeneFull_Ex50pAS"

# Active database. `db_uri` must be defined here, otherwise every cell below
# fails with a NameError on a fresh kernel. Set the TILEDB_SOMA_URI environment
# variable to point the notebook at another experiment without editing this cell.
# NOTE(review): the fallback is simply the last candidate listed above -- confirm
# which database the saved outputs in this notebook were generated from.
db_uri = os.environ.get(
    "TILEDB_SOMA_URI",
    "/scratch/multiomics/nickyoungblut/tiledb-loader/tiledb-soma_GeneFull_Ex50pAS",
)

# Summary

## Obs

In [3]:
# show the obs schema (column names and their Arrow types)
with tiledbsoma.open(db_uri) as experiment:
    obs_schema = experiment.obs.schema
    print(obs_schema)

soma_joinid: int64 not null
obs_id: large_string
gene_count: int64
umi_count: float
barcode: large_string
SRX_accession: dictionary<values=string, indices=int32, ordered=0>
lib_prep: dictionary<values=string, indices=int32, ordered=0>
tech_10x: dictionary<values=string, indices=int32, ordered=0>
organism: dictionary<values=string, indices=int32, ordered=0>
tissue: dictionary<values=string, indices=int32, ordered=0>
disease: dictionary<values=string, indices=int32, ordered=0>
purturbation: dictionary<values=string, indices=int32, ordered=0>
cell_line: dictionary<values=string, indices=int32, ordered=0>
czi_collection_id: dictionary<values=string, indices=int32, ordered=0>
czi_collection_name: dictionary<values=string, indices=int32, ordered=0>


In [4]:
# total number of obs rows
with tiledbsoma.open(db_uri) as experiment:
    obs_ids = experiment.obs.read(column_names=["obs_id"]).concat()
    # grouping on no keys collapses the whole table into a single count
    total = obs_ids.group_by([]).aggregate([([], "count_all")])
    print(total.to_pandas())

   count_all
0   44770961


In [5]:
# number of cells in each SRX accession, largest first
with tiledbsoma.Experiment.open(db_uri) as experiment:
    obs_tbl = experiment.obs.read(column_names=["obs_id", "SRX_accession"]).concat()
    per_srx = obs_tbl.group_by(["SRX_accession"]).aggregate([([], "count_all")])
    df = per_srx.sort_by([("count_all", "descending")]).to_pandas()
print(df.shape)
df.head()

(5991, 2)


Unnamed: 0,SRX_accession,count_all
0,SRX17521047,93266
1,SRX18774274,87650
2,SRX18899905,83021
3,SRX17917753,78088
4,SRX15483882,77392


### Metadata

In [12]:
# count cells per (organism, tech_10x) combination, largest group first
# Only the two grouping columns are read: the count does not use any other
# obs column, so loading the full table would just cost time and memory.
with tiledbsoma.Experiment.open(db_uri) as exp:
    print(
        exp.obs.read(column_names=["organism", "tech_10x"])
        .concat()
        .group_by(["organism", "tech_10x"])
        .aggregate([
            ([], 'count_all'),
        ])
        .sort_by([("count_all", "descending")])
        .to_pandas()
    )

   organism           tech_10x  count_all
0     mouse        3_prime_gex   18278121
1     human        3_prime_gex   15413876
2     human        5_prime_gex    5574529
3     mouse        5_prime_gex    2300447
4     mouse           multiome     557445
5     human           multiome     523019
6     human  feature_barcoding     460394
7     human                vdj     438094
8     mouse     not_applicable     275526
9     human              other     219446
10    mouse              other     218551
11    human     not_applicable     206333
12    mouse                vdj     133023
13    mouse           cellplex      77726
14    mouse  feature_barcoding      68287
15    human           cellplex      26144


## Var

In [None]:
# load the var (gene-level) table of the RNA measurement
with tiledbsoma.Experiment.open(db_uri) as experiment:
    rna_var = experiment.ms["RNA"].var
    var_tbl = rna_var.read(column_names=["soma_joinid", "var_id", "gene_symbols"]).concat()
    df = var_tbl.to_pandas()
print(df.shape)
df.head()

(68886, 2)


Unnamed: 0,soma_joinid,var_id
0,0,ENSG00000000003
1,1,ENSG00000000005
2,2,ENSG00000000419
3,3,ENSG00000000457
4,4,ENSG00000000460


### Counts

In [10]:
# restrict the query to the first 5 cells (coordinates 0 through 4)
obs_query = tiledbsoma.AxisQuery(coords=[slice(0, 4)])

with tiledbsoma.Experiment.open(db_uri) as experiment:
    first_cells = experiment.axis_query("RNA", obs_query=obs_query)
    adata = first_cells.to_anndata(X_name="data")
    print(adata)

AnnData object with n_obs × n_vars = 5 × 68886
    obs: 'soma_joinid', 'gene_count', 'umi_count', 'barcode', 'SRX_accession', 'lib_prep', 'tech_10x', 'organism', 'tissue', 'disease', 'purturbation', 'cell_line', 'czi_collection_id', 'czi_collection_name', 'obs_id'
    var: 'soma_joinid', 'var_id'


### Metadata

In [8]:
# obs per tech_10x, largest group first
# Only `tech_10x` is read: the count does not use any other obs column, so
# loading the full table would just cost time and memory.
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.obs.read(column_names=["tech_10x"])
        .concat()
        .group_by(["tech_10x"])
        .aggregate([
            ([], 'count_all'),
        ])
        .sort_by([("count_all", "descending")])
        .to_pandas()
    )
print(df.shape)
df.tail()

(4, 2)


Unnamed: 0,tech_10x,count_all
0,3_prime_gex,263797
1,5_prime_gex,90425
2,not_applicable,18041
3,multiome,11442


### Gene/UMI count

In [9]:
# per-cell gene and UMI counts, ordered by gene count (highest first)
with tiledbsoma.Experiment.open(db_uri) as experiment:
    count_tbl = experiment.obs.read(
        column_names=["SRX_accession", "gene_count", "umi_count"]
    ).concat()
    df = count_tbl.sort_by([("gene_count", "descending")]).to_pandas()
df

Unnamed: 0,SRX_accession,gene_count,umi_count
0,SRX18899936,14987,251264.0
1,SRX18899936,14965,212983.0
2,SRX18899936,14753,190172.0
3,SRX18899936,14548,181595.0
4,SRX18899936,14300,171838.0
...,...,...,...
570763,SRX26111881,1,4.0
570764,SRX26111881,1,5.0
570765,SRX26111881,1,6.0
570766,SRX26111881,1,4.0


# Chris's processed data

In [25]:
# URI of the experiment opened by the cells in this section
db_uri = "/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/tiledb_counted"

In [26]:
# list the obs columns and their types
with tiledbsoma.open(db_uri) as experiment:
    obs_df = experiment.obs
    print(obs_df.schema)

soma_joinid: int64 not null
obs_id: large_string
SRX_accession: large_string
lib_prep: large_string
tech_10x: large_string
organism: large_string
tissue: large_string
disease: large_string
purturbation: large_string
cell_line: large_string
czi_collection_id: large_string
czi_collection_name: large_string


In [27]:
# load obs_id and SRX_accession for every obs row
with tiledbsoma.Experiment.open(db_uri) as experiment:
    obs_tbl = experiment.obs.read(column_names=["obs_id", "SRX_accession"]).concat()
    df = obs_tbl.to_pandas()
df

Unnamed: 0,obs_id,SRX_accession
0,AAACCCAAGAGATCGC,SRX13549201
1,AAACCCAAGGTCTTTG,SRX13549201
2,AAACCCAAGTTGGCGA,SRX13549201
3,AAACCCACAACCCTAA,SRX13549201
4,AAACCCACAATACCCA,SRX13549201
...,...,...
35122,TTTGTTGGTTCAGCTA,SRX21819799
35123,TTTGTTGTCACTGTCC,SRX21819799
35124,TTTGTTGTCGAGTACT,SRX21819799
35125,TTTGTTGTCTAATTCC,SRX21819799


In [28]:
# number of rows per SRX accession
df.SRX_accession.value_counts()

SRX_accession
SRX13549201    14392
SRX21819799     7139
SRX17937428     5427
SRX11523721     5288
SRX7117648      2881
Name: count, dtype: int64

### Metadata

In [33]:
# load the identifier and metadata columns for every obs row
with tiledbsoma.Experiment.open(db_uri) as experiment:
    meta_tbl = experiment.obs.read(
        column_names=[
            "obs_id",
            "SRX_accession",
            "tech_10x",
            "organism",
            "czi_collection_id",
            "czi_collection_name",
        ]
    ).concat()
    df = meta_tbl.to_pandas()
df

Unnamed: 0,obs_id,SRX_accession,tech_10x,organism,czi_collection_id,czi_collection_name
0,AAACCCAAGAGATCGC,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
1,AAACCCAAGGTCTTTG,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
2,AAACCCAAGTTGGCGA,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
3,AAACCCACAACCCTAA,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
4,AAACCCACAATACCCA,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
...,...,...,...,...,...,...
35122,TTTGTTGGTTCAGCTA,SRX21819799,,,a0c84e3f-a5ca-4481-b3a5-ccfda0a81ecc,Unified comprehensive single-cell atlas of the...
35123,TTTGTTGTCACTGTCC,SRX21819799,,,a0c84e3f-a5ca-4481-b3a5-ccfda0a81ecc,Unified comprehensive single-cell atlas of the...
35124,TTTGTTGTCGAGTACT,SRX21819799,,,a0c84e3f-a5ca-4481-b3a5-ccfda0a81ecc,Unified comprehensive single-cell atlas of the...
35125,TTTGTTGTCTAATTCC,SRX21819799,,,a0c84e3f-a5ca-4481-b3a5-ccfda0a81ecc,Unified comprehensive single-cell atlas of the...


In [30]:
# number of rows for each (tech_10x, organism) pair
(
    df.groupby(["tech_10x", "organism"])
    .size()
    .rename("count")
    .reset_index()
)

Unnamed: 0,tech_10x,organism,count
0,3_prime_gex,,22700
1,,,7139


# Edit existing metadata

In [5]:
# URI of the experiment whose metadata is edited in this section
# (the cells below open it in write mode)
db_uri = "/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/tiledb_TEST"

In [6]:
import pyarrow as pa

### Obs metadata

In [21]:
# inspect the obs schema before editing
with tiledbsoma.open(db_uri) as experiment:
    current_schema = experiment.obs.schema
    print(current_schema)

soma_joinid: int64 not null
obs_id: large_string
SRX_accession: large_string
lib_prep: large_string
tech_10x: large_string
organism: large_string
tissue: large_string
disease: large_string
purturbation: large_string
cell_line: large_string
czi_collection_id: large_string
czi_collection_name: large_string


In [27]:
# select obs rows by organism
obs_query = tiledbsoma.AxisQuery(value_filter='organism in ["Homo sapiens"]')
#obs_query = tiledbsoma.AxisQuery(value_filter='organism in ["mouse"]')

# pull the matching obs records into pandas
with tiledbsoma.Experiment.open(db_uri) as experiment:
    matched = experiment.axis_query("RNA", obs_query=obs_query)
    df = matched.obs().concat().to_pandas()
df

Unnamed: 0,soma_joinid,obs_id,SRX_accession,lib_prep,tech_10x,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name
0,0,AAACCCAAGAGAGGTA,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
1,1,AAACCCAAGAGCAAGA,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
2,2,AAACCCAAGCTCTTCC,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
3,3,AAACCCAAGCTGCCAC,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
4,4,AAACCCAAGTCACGCC,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
...,...,...,...,...,...,...,...,...,...,...,...,...
69058,133935,TTTGGTTAGCCTATTG,SRX23508713,10x_Genomics,3_prime_gex,Homo sapiens,DAOY cell line,malignant,in vitro culture,DAOY,,
69059,133936,TTTGTTGAGGTTTGAA,SRX23508713,10x_Genomics,3_prime_gex,Homo sapiens,DAOY cell line,malignant,in vitro culture,DAOY,,
69060,133937,TTTGTTGCAGATCCTA,SRX23508713,10x_Genomics,3_prime_gex,Homo sapiens,DAOY cell line,malignant,in vitro culture,DAOY,,
69061,133938,TTTGTTGGTAGTGATA,SRX23508713,10x_Genomics,3_prime_gex,Homo sapiens,DAOY cell line,malignant,in vitro culture,DAOY,,


In [28]:
# overwrite the organism label on every selected row
df = df.assign(organism="Homo sapiens")
#df = df.assign(organism="Mus musculus")

In [29]:
# fix the dtypes
# Cast the collection columns to pandas' nullable string dtype so the Arrow
# table gets string columns in which missing values stay null.
# `astype(str)` would instead turn every missing value into the literal text
# "None"/"nan", and that text would then be written to the database.
# NOTE(review): presumably the cast is needed because these columns are empty
# for the selected rows -- confirm against the obs schema.
for colname in ["czi_collection_id", "czi_collection_name"]:
    df[colname] = df[colname].astype("string")
# convert to an Arrow table; the write cell below expects it under the name `df`
df = pa.Table.from_pandas(df)

In [30]:
# write the edited records back to obs (experiment opened in write mode)
with tiledbsoma.Experiment.open(db_uri, mode="w") as experiment:
    experiment.obs.write(df)

### Add a column

In [24]:
# Load the complete obs table (all rows, all columns)
with tiledbsoma.Experiment.open(db_uri) as experiment:
    obs_tbl = experiment.obs.read().concat()
    obs = obs_tbl.to_pandas()
obs

Unnamed: 0,soma_joinid,obs_id,SRX_accession,lib_prep,tech_10x,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name
0,0,AAACCCAAGAGAGGTA,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
1,1,AAACCCAAGAGCAAGA,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
2,2,AAACCCAAGCTCTTCC,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
3,3,AAACCCAAGCTGCCAC,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
4,4,AAACCCAAGTCACGCC,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
...,...,...,...,...,...,...,...,...,...,...,...,...
133935,133935,TTTGGTTAGCCTATTG,SRX23508713,10x_Genomics,3_prime_gex,Homo sapiens,DAOY cell line,malignant,in vitro culture,DAOY,,
133936,133936,TTTGTTGAGGTTTGAA,SRX23508713,10x_Genomics,3_prime_gex,Homo sapiens,DAOY cell line,malignant,in vitro culture,DAOY,,
133937,133937,TTTGTTGCAGATCCTA,SRX23508713,10x_Genomics,3_prime_gex,Homo sapiens,DAOY cell line,malignant,in vitro culture,DAOY,,
133938,133938,TTTGTTGGTAGTGATA,SRX23508713,10x_Genomics,3_prime_gex,Homo sapiens,DAOY cell line,malignant,in vitro culture,DAOY,,


In [25]:
# Append a column that holds the same default value on every row
obs = obs.assign(TEST="test_value")
# ...or drop the column instead
#obs = obs.drop(columns="TEST")
obs

Unnamed: 0,soma_joinid,obs_id,SRX_accession,lib_prep,tech_10x,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name,TEST
0,0,AAACCCAAGAGAGGTA,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,,test_value
1,1,AAACCCAAGAGCAAGA,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,,test_value
2,2,AAACCCAAGCTCTTCC,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,,test_value
3,3,AAACCCAAGCTGCCAC,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,,test_value
4,4,AAACCCAAGTCACGCC,SRX23386174,10x_Genomics,3_prime_gex,Homo sapiens,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,,test_value
...,...,...,...,...,...,...,...,...,...,...,...,...,...
133935,133935,TTTGGTTAGCCTATTG,SRX23508713,10x_Genomics,3_prime_gex,Homo sapiens,DAOY cell line,malignant,in vitro culture,DAOY,,,test_value
133936,133936,TTTGTTGAGGTTTGAA,SRX23508713,10x_Genomics,3_prime_gex,Homo sapiens,DAOY cell line,malignant,in vitro culture,DAOY,,,test_value
133937,133937,TTTGTTGCAGATCCTA,SRX23508713,10x_Genomics,3_prime_gex,Homo sapiens,DAOY cell line,malignant,in vitro culture,DAOY,,,test_value
133938,133938,TTTGTTGGTAGTGATA,SRX23508713,10x_Genomics,3_prime_gex,Homo sapiens,DAOY cell line,malignant,in vitro culture,DAOY,,,test_value


In [None]:
# push the modified obs dataframe back to the experiment
with tiledbsoma.Experiment.open(db_uri, mode="w") as experiment:
    tiledbsoma.io.update_obs(experiment, obs)

### Var metadata

In [27]:
# show the var schema of the RNA measurement
with tiledbsoma.open(db_uri) as experiment:
    rna_var = experiment.ms["RNA"].var
    print(rna_var.schema)

soma_joinid: int64 not null
var_id: large_string
gene_symbols: large_string
feature_types: large_string


In [34]:
# load every column of the RNA var (gene-level) table
with tiledbsoma.Experiment.open(db_uri) as experiment:
    rna_var = experiment.ms["RNA"].var
    df = rna_var.read().concat().to_pandas()
print(df.shape)
df.head()

(68886, 5)


Unnamed: 0,soma_joinid,var_id,gene_symbols,feature_types,organism
0,0,ENSG00000243485,MIR1302-2HG,Gene Expression,Homo sapiens
1,1,ENSG00000237613,FAM138A,Gene Expression,Homo sapiens
2,2,ENSG00000186092,OR4F5,Gene Expression,Homo sapiens
3,3,ENSG00000238009,AL627309.1,Gene Expression,Homo sapiens
4,4,ENSG00000239945,AL627309.3,Gene Expression,Homo sapiens


In [30]:
# if ENSG, add `Homo sapiens`
# if ENSMUSG, add `Mus musculus`
def ensemble_to_org(gene_id):
    """Map an Ensembl gene ID to an organism name based on its prefix.

    Args:
        gene_id: Gene identifier, e.g. ``ENSG00000243485``.

    Returns:
        'Homo sapiens' for IDs starting with ``ENSG``, 'Mus musculus' for IDs
        starting with ``ENSMUSG``, and 'Unknown' for anything else, including
        non-string values such as ``None`` or ``NaN``.
    """
    # a missing ID (None/NaN) has no .startswith(); treat it as unrecognised
    if not isinstance(gene_id, str):
        return 'Unknown'
    if gene_id.startswith('ENSG'):
        return 'Homo sapiens'
    if gene_id.startswith('ENSMUSG'):
        return 'Mus musculus'
    return 'Unknown'

# derive the organism of each gene from its var_id prefix
df = df.assign(organism=df["var_id"].apply(ensemble_to_org))
df

Unnamed: 0,soma_joinid,var_id,gene_symbols,feature_types,organism
0,0,ENSG00000243485,MIR1302-2HG,Gene Expression,Homo sapiens
1,1,ENSG00000237613,FAM138A,Gene Expression,Homo sapiens
2,2,ENSG00000186092,OR4F5,Gene Expression,Homo sapiens
3,3,ENSG00000238009,AL627309.1,Gene Expression,Homo sapiens
4,4,ENSG00000239945,AL627309.3,Gene Expression,Homo sapiens
...,...,...,...,...,...
68881,68881,ENSMUSG00000095523,AC124606.1,Gene Expression,Mus musculus
68882,68882,ENSMUSG00000095475,AC133095.2,Gene Expression,Mus musculus
68883,68883,ENSMUSG00000094855,AC133095.1,Gene Expression,Mus musculus
68884,68884,ENSMUSG00000095019,AC234645.1,Gene Expression,Mus musculus


In [32]:
# write the modified var dataframe back to the RNA measurement
with tiledbsoma.Experiment.open(db_uri, mode="w") as experiment:
    tiledbsoma.io.update_var(experiment, df, measurement_name="RNA")

In [37]:
# the same organism filter is applied to both the obs and the var axis
query = tiledbsoma.AxisQuery(value_filter='organism in ["Homo sapiens"]')

# fetch the var records that match
with tiledbsoma.Experiment.open(db_uri) as experiment:
    matched = experiment.axis_query("RNA", obs_query=query, var_query=query)
    df = matched.var().concat().to_pandas()
df

Unnamed: 0,soma_joinid,var_id,gene_symbols,feature_types,organism
0,0,ENSG00000243485,MIR1302-2HG,Gene Expression,Homo sapiens
1,1,ENSG00000237613,FAM138A,Gene Expression,Homo sapiens
2,2,ENSG00000186092,OR4F5,Gene Expression,Homo sapiens
3,3,ENSG00000238009,AL627309.1,Gene Expression,Homo sapiens
4,4,ENSG00000239945,AL627309.3,Gene Expression,Homo sapiens
...,...,...,...,...,...
36596,36596,ENSG00000277836,AC141272.1,Gene Expression,Homo sapiens
36597,36597,ENSG00000278633,AC023491.2,Gene Expression,Homo sapiens
36598,36598,ENSG00000276017,AC007325.1,Gene Expression,Homo sapiens
36599,36599,ENSG00000278817,AC007325.4,Gene Expression,Homo sapiens


# #-- Sandbox --#

In [2]:
import os
import tiledbsoma
import tiledbsoma.io

In [3]:
# URI of the experiment used by the sandbox cells below
# NOTE(review): the path starts with "~" and is passed on unexpanded; the saved
# outputs below suggest it resolved on the author's machine -- confirm.
db_uri = "~/dev/nextflow/scRecounter/tmp/tiledb/srx3/tiledb-soma"

In [4]:
# print the obs column names and types
with tiledbsoma.open(db_uri) as experiment:
    sandbox_schema = experiment.obs.schema
    print(sandbox_schema)

soma_joinid: int64 not null
obs_id: large_string
SRX_accession: dictionary<values=string, indices=int32, ordered=0>


In [5]:
# distinct SRX accessions
with tiledbsoma.Experiment.open(db_uri) as experiment:
    obs_tbl = experiment.obs.read(column_names=["obs_id", "SRX_accession"]).concat()
    # grouping with no aggregations; the saved output shows one row per accession
    distinct_srx = obs_tbl.group_by(["SRX_accession"]).aggregate([])
    print(distinct_srx.to_pandas())

  SRX_accession
0   SRX24099779
1   SRX21101392
2   SRX20585665


In [6]:
# number of cells in each SRX accession, largest first
with tiledbsoma.Experiment.open(db_uri) as experiment:
    obs_tbl = experiment.obs.read(column_names=["obs_id", "SRX_accession"]).concat()
    per_srx = obs_tbl.group_by(["SRX_accession"]).aggregate([([], "count_all")])
    ranked = per_srx.sort_by([("count_all", "descending")])
    print(ranked.to_pandas())

  SRX_accession  count_all
0   SRX21101392       6311
1   SRX24099779       3688
2   SRX20585665        975
