# Notes

* Code to check and summarize a TileDB-SOMA database

In [3]:
import os
import tiledbsoma
import tiledbsoma.io

In [4]:
# Location of the TileDB-SOMA experiment that this notebook summarizes.
# Set the SOMA_DB_URI environment variable to point the notebook at another
# database without editing this cell, e.g.
#   /home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/tiledb_exp1
# An unset or empty variable falls back to the default path below.
db_uri = (
    os.environ.get("SOMA_DB_URI")
    or "/home/nickyoungblut/dev/nextflow/scRecounter/tmp/tiledb/tiledb_prod3"
)

# Summary

## Obs

In [5]:
# print the obs column names together with their Arrow types
with tiledbsoma.open(db_uri) as exp:
    obs_schema = exp.obs.schema
    print(obs_schema)

soma_joinid: int64 not null
obs_id: large_string
SRX_accession: large_string
lib_prep: large_string
tech_10x: large_string
organism: large_string
tissue: large_string
disease: large_string
purturbation: large_string
cell_line: large_string
czi_collection_id: large_string
czi_collection_name: large_string


In [6]:
# load the obs_id and SRX_accession columns of obs into a pandas DataFrame
obs_cols = ["obs_id", "SRX_accession"]
with tiledbsoma.Experiment.open(db_uri) as exp:
    # read() yields Arrow batches; concat() merges them into a single table
    obs_table = exp.obs.read(column_names=obs_cols).concat()
    df = obs_table.to_pandas()
df

Unnamed: 0,obs_id,SRX_accession
0,AAACCCAAGAGAGGTA,SRX23386174
1,AAACCCAAGAGCAAGA,SRX23386174
2,AAACCCAAGCTCTTCC,SRX23386174
3,AAACCCAAGCTGCCAC,SRX23386174
4,AAACCCAAGTCACGCC,SRX23386174
...,...,...
120433,TTTCACACACCTCGTT,SRX23945305
120434,TTTGACTAGAGTTCGG,SRX23945305
120435,TTTGACTCACAAGCCC,SRX23945305
120436,TTTGATCAGCCGCTTG,SRX23945305


In [7]:
# number of obs rows per SRX accession (value_counts sorts largest first)
srx_counts = df["SRX_accession"].value_counts()
srx_counts

SRX_accession
SRX23386174    25046
SRX25595940    11779
SRX27313889    10618
SRX24003271     9849
SRX21839851     9726
SRX26275147     9007
SRX23780558     8270
SRX19303371     6520
SRX21831069     6510
SRX21101392     6274
SRX24367499     4152
SRX22127252     2804
SRX23508713     2675
SRX22127249     2474
SRX23508714     1303
SRX23261461      889
SRX23508715      741
SRX23945305      706
SRX25310839      606
SRX23930792      489
Name: count, dtype: int64

### Metadata

In [8]:
# load the obs identifiers plus the tech_10x and organism metadata columns
metadata_cols = ["obs_id", "SRX_accession", "tech_10x", "organism"]
with tiledbsoma.Experiment.open(db_uri) as exp:
    # read() yields Arrow batches; concat() merges them into a single table
    metadata_table = exp.obs.read(column_names=metadata_cols).concat()
    df = metadata_table.to_pandas()
df

Unnamed: 0,obs_id,SRX_accession,tech_10x,organism
0,AAACCCAAGAGAGGTA,SRX23386174,3_prime_gex,human
1,AAACCCAAGAGCAAGA,SRX23386174,3_prime_gex,human
2,AAACCCAAGCTCTTCC,SRX23386174,3_prime_gex,human
3,AAACCCAAGCTGCCAC,SRX23386174,3_prime_gex,human
4,AAACCCAAGTCACGCC,SRX23386174,3_prime_gex,human
...,...,...,...,...
120433,TTTCACACACCTCGTT,SRX23945305,3_prime_gex,mouse
120434,TTTGACTAGAGTTCGG,SRX23945305,3_prime_gex,mouse
120435,TTTGACTCACAAGCCC,SRX23945305,3_prime_gex,mouse
120436,TTTGATCAGCCGCTTG,SRX23945305,3_prime_gex,mouse


In [9]:
# number of obs rows for each (tech_10x, organism) combination
group_cols = ["tech_10x", "organism"]
group_sizes = df.groupby(group_cols).size()
group_sizes.reset_index(name="count")

Unnamed: 0,tech_10x,organism,count
0,3_prime_gex,human,65710
1,3_prime_gex,mouse,43696
2,5_prime_gex,human,606
3,other,mouse,4152
4,vdj,human,6274


## Var

In [None]:
# load the var (gene-level) table of the "RNA" measurement into pandas
var_cols = ["soma_joinid", "var_id", "gene_symbols"]
with tiledbsoma.Experiment.open(db_uri) as exp:
    rna_var = exp.ms["RNA"].var
    # read() yields Arrow batches; concat() merges them into a single table
    df = rna_var.read(column_names=var_cols).concat().to_pandas()
df

Unnamed: 0,soma_joinid,var_id,gene_symbols
0,0,ENSG00000243485,MIR1302-2HG
1,1,ENSG00000237613,FAM138A
2,2,ENSG00000186092,OR4F5
3,3,ENSG00000238009,AL627309.1
4,4,ENSG00000239945,AL627309.3
...,...,...,...
68881,68881,ENSMUSG00000095523,AC124606.1
68882,68882,ENSMUSG00000095475,AC133095.2
68883,68883,ENSMUSG00000094855,AC133095.1
68884,68884,ENSMUSG00000095019,AC234645.1
