# Notes

* code to check/summarize a tiledb database

In [1]:
import os
import tiledbsoma
import tiledbsoma.io

In [2]:
#db_uri = "/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/tiledb_exp1"
#db_uri = "/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/tiledb_prod3"
db_uri = "/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/tiledb_TEST"

# Summary

## Obs

In [3]:
# get colnames
with tiledbsoma.open(db_uri) as exp:
    print(exp.obs.schema)

soma_joinid: int64 not null
obs_id: large_string
SRX_accession: large_string
lib_prep: large_string
tech_10x: large_string
organism: large_string
tissue: large_string
disease: large_string
purturbation: large_string
cell_line: large_string
czi_collection_id: large_string
czi_collection_name: large_string


In [4]:
# total obs
with tiledbsoma.open(db_uri) as exp:
    print(
        exp.obs.read(column_names=["obs_id"])
        .concat()
        .group_by([])
        .aggregate([
            ([], 'count_all'),
        ])
        .to_pandas()
    )

   count_all
0     133940


In [5]:
# cells per SRX
with tiledbsoma.Experiment.open(db_uri) as exp:
    print(
        exp.obs.read(column_names=["obs_id", "SRX_accession"])
        .concat()
        .group_by(["SRX_accession"])
        .aggregate([
            ([], 'count_all'),
        ])
        .sort_by([("count_all", "descending")])
        .to_pandas()
    )

   SRX_accession  count_all
0    SRX23386174      25046
1    SRX17429793      23943
2    SRX24003748      11639
3    SRX25595940      11566
4    SRX24003271       9871
5    SRX26275147       8867
6    SRX20179995       6893
7    SRX19303371       6508
8    SRX21101392       6176
9    SRX20732941       6137
10   SRX25137075       4637
11   SRX24367499       4158
12   SRX25406990       3784
13   SRX23508713       2660
14   SRX23508714       1301
15   SRX23508715        754


### Metadata

In [6]:
# read in the data and summarize
with tiledbsoma.Experiment.open(db_uri) as exp:
    print(
        exp.obs.read()
        .concat()
        .group_by(["organism", "tech_10x"])
        .aggregate([
            ([], 'count_all'),
        ])
        .sort_by([("count_all", "descending")])
        .to_pandas()
    )

  organism     tech_10x  count_all
0    human  3_prime_gex      59103
1    mouse  3_prime_gex      56082
2    human          vdj       6176
3    mouse  5_prime_gex       4637
4    mouse        other       4158
5    human  5_prime_gex       3784


## Var

In [7]:
# read in the var (gene) level data
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.ms["RNA"]
        .var.read(column_names=["soma_joinid", "var_id", "gene_symbols"])
        .concat()
        .to_pandas()
    )
df

Unnamed: 0,soma_joinid,var_id,gene_symbols
0,0,ENSG00000243485,MIR1302-2HG
1,1,ENSG00000237613,FAM138A
2,2,ENSG00000186092,OR4F5
3,3,ENSG00000238009,AL627309.1
4,4,ENSG00000239945,AL627309.3
...,...,...,...
68881,68881,ENSMUSG00000095523,AC124606.1
68882,68882,ENSMUSG00000095475,AC133095.2
68883,68883,ENSMUSG00000094855,AC133095.1
68884,68884,ENSMUSG00000095019,AC234645.1


### Counts

In [8]:
# slice out a sparse matrix
with tiledbsoma.Experiment.open(db_uri) as exp:
    print(exp.ms["RNA"].X["data"].read([slice(0, 5)]).coos().concat())

<pyarrow.SparseCOOTensor>
type: float
shape: (3782010, 68886)


In [9]:
# slice by metadata and return as anndata
obs_query = tiledbsoma.AxisQuery(value_filter='tech_10x in ["vdj"]')

#with tiledbsoma.Experiment.open(db_uri) as exp:
#    print(exp.axis_query("RNA", obs_query=obs_query).to_anndata(X_name="data"))

: 

In [15]:
# slice to just the first 5 cells
obs_query = tiledbsoma.AxisQuery(coords=[slice(0, 4)]) 

with tiledbsoma.Experiment.open(db_uri) as exp:
    adata = exp.axis_query("RNA", obs_query=obs_query).to_anndata(X_name="data")
    print(adata)

AnnData object with n_obs × n_vars = 5 × 68886
    obs: 'soma_joinid', 'obs_id', 'SRX_accession', 'lib_prep', 'tech_10x', 'organism', 'tissue', 'disease', 'purturbation', 'cell_line', 'czi_collection_id', 'czi_collection_name'
    var: 'soma_joinid', 'var_id', 'gene_symbols', 'feature_types'


# Chris's processed data

In [25]:
db_uri = "/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/tiledb_counted"

In [26]:
# get colnames
with tiledbsoma.open(db_uri) as exp:
    print(exp.obs.schema)

soma_joinid: int64 not null
obs_id: large_string
SRX_accession: large_string
lib_prep: large_string
tech_10x: large_string
organism: large_string
tissue: large_string
disease: large_string
purturbation: large_string
cell_line: large_string
czi_collection_id: large_string
czi_collection_name: large_string


In [27]:
# read in the data
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.obs.read(column_names=["obs_id", "SRX_accession"])
        .concat()
        .to_pandas()
    )
df

Unnamed: 0,obs_id,SRX_accession
0,AAACCCAAGAGATCGC,SRX13549201
1,AAACCCAAGGTCTTTG,SRX13549201
2,AAACCCAAGTTGGCGA,SRX13549201
3,AAACCCACAACCCTAA,SRX13549201
4,AAACCCACAATACCCA,SRX13549201
...,...,...
35122,TTTGTTGGTTCAGCTA,SRX21819799
35123,TTTGTTGTCACTGTCC,SRX21819799
35124,TTTGTTGTCGAGTACT,SRX21819799
35125,TTTGTTGTCTAATTCC,SRX21819799


In [28]:
df["SRX_accession"].value_counts()

SRX_accession
SRX13549201    14392
SRX21819799     7139
SRX17937428     5427
SRX11523721     5288
SRX7117648      2881
Name: count, dtype: int64

### Metadata

In [33]:
# read in the data
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.obs.read(column_names=["obs_id", "SRX_accession", "tech_10x", "organism", "czi_collection_id", "czi_collection_name"])
        .concat()
        .to_pandas()
    )
df

Unnamed: 0,obs_id,SRX_accession,tech_10x,organism,czi_collection_id,czi_collection_name
0,AAACCCAAGAGATCGC,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
1,AAACCCAAGGTCTTTG,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
2,AAACCCAAGTTGGCGA,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
3,AAACCCACAACCCTAA,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
4,AAACCCACAATACCCA,SRX13549201,3_prime_gex,,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c,Spatial proteogenomics reveals distinct and ev...
...,...,...,...,...,...,...
35122,TTTGTTGGTTCAGCTA,SRX21819799,,,a0c84e3f-a5ca-4481-b3a5-ccfda0a81ecc,Unified comprehensive single-cell atlas of the...
35123,TTTGTTGTCACTGTCC,SRX21819799,,,a0c84e3f-a5ca-4481-b3a5-ccfda0a81ecc,Unified comprehensive single-cell atlas of the...
35124,TTTGTTGTCGAGTACT,SRX21819799,,,a0c84e3f-a5ca-4481-b3a5-ccfda0a81ecc,Unified comprehensive single-cell atlas of the...
35125,TTTGTTGTCTAATTCC,SRX21819799,,,a0c84e3f-a5ca-4481-b3a5-ccfda0a81ecc,Unified comprehensive single-cell atlas of the...


In [30]:
# count by tech_10x and organism
df.groupby(["tech_10x", "organism"]).size().reset_index(name="count")

Unnamed: 0,tech_10x,organism,count
0,3_prime_gex,,22700
1,,,7139


# Edit existing metadata

In [3]:
import pyarrow as pa

In [4]:
# get colnames
with tiledbsoma.open(db_uri) as exp:
    print(exp.obs.schema)

soma_joinid: int64 not null
obs_id: large_string
SRX_accession: large_string
lib_prep: large_string
tech_10x: large_string
organism: large_string
tissue: large_string
disease: large_string
purturbation: large_string
cell_line: large_string
czi_collection_id: large_string
czi_collection_name: large_string


In [23]:
# query
obs_query = tiledbsoma.AxisQuery(value_filter='organism in ["Homo sapien"]')
#obs_query = tiledbsoma.AxisQuery(value_filter='tissue in ["bronchial brushing"]')

# get the target records
with tiledbsoma.Experiment.open(db_uri) as exp:
    df = (
        exp.axis_query("RNA", obs_query=obs_query)
        .obs()
        .concat()
        .to_pandas()
    )
df

Unnamed: 0,soma_joinid,obs_id,SRX_accession,lib_prep,tech_10x,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name
0,0,AAACCCAAGAGAGGTA,SRX23386174,10x_Genomics,3_prime_gex,Homo sapien,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
1,1,AAACCCAAGAGCAAGA,SRX23386174,10x_Genomics,3_prime_gex,Homo sapien,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
2,2,AAACCCAAGCTCTTCC,SRX23386174,10x_Genomics,3_prime_gex,Homo sapien,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
3,3,AAACCCAAGCTGCCAC,SRX23386174,10x_Genomics,3_prime_gex,Homo sapien,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
4,4,AAACCCAAGTCACGCC,SRX23386174,10x_Genomics,3_prime_gex,Homo sapien,bronchial brushing,neovascular age-related macular degeneration,bronchial cells dissociated from asthmatic chi...,bronchial airway cells,,
...,...,...,...,...,...,...,...,...,...,...,...,...
69058,133935,TTTGGTTAGCCTATTG,SRX23508713,10x_Genomics,3_prime_gex,Homo sapien,DAOY cell line,malignant,in vitro culture,DAOY,,
69059,133936,TTTGTTGAGGTTTGAA,SRX23508713,10x_Genomics,3_prime_gex,Homo sapien,DAOY cell line,malignant,in vitro culture,DAOY,,
69060,133937,TTTGTTGCAGATCCTA,SRX23508713,10x_Genomics,3_prime_gex,Homo sapien,DAOY cell line,malignant,in vitro culture,DAOY,,
69061,133938,TTTGTTGGTAGTGATA,SRX23508713,10x_Genomics,3_prime_gex,Homo sapien,DAOY cell line,malignant,in vitro culture,DAOY,,


In [19]:
# update metadata
df["organism"] = "Homo sapien"

In [20]:
# fix the dtypes
for colname in ["czi_collection_id", "czi_collection_name"]:
    df[colname] = df[colname].astype(str)
df = pa.Table.from_pandas(df)

In [21]:
# update
with tiledbsoma.Experiment.open(db_uri, "w") as exp:
    exp.obs.write(df)