# Goal

* Compare the metadata diversity in Cell x Gene versus scBaseCamp

# Var

In [1]:
db_uri = "/scratch/multiomics/nickyoungblut/tiledb-loader/tiledb-soma_GeneFull_Ex50pAS"

# Init

In [2]:
import pandas as pd
import plotnine as pn
import tiledbsoma
import tiledbsoma.io
import cellxgene_census

# Load

In [3]:
# load scBaseCamp data
target_cols = ["SRX_accession", "lib_prep", "organism", "tissue", "disease"]

with tiledbsoma.open(db_uri) as exp:
    scbc_metadata = (
        exp.obs.read(column_names=target_cols)
        .concat()
        .group_by(target_cols)
        .aggregate([
            ([], 'count_all'),
        ])
        .to_pandas()
    )
scbc_metadata 

Unnamed: 0,SRX_accession,lib_prep,organism,tissue,disease,count_all
0,SRX10681588,10x_Genomics,human,blood,not specified,7576
1,SRX18700948,10x_Genomics,human,blood,severe dengue,6823
2,SRX18700950,10x_Genomics,human,blood,severe dengue,4385
3,SRX18700949,10x_Genomics,human,blood,severe dengue,1928
4,SRX19713411,10x_Genomics,human,adipose differentiating cells,none,5879
...,...,...,...,...,...,...
21189,SRX10664197,10x_Genomics,,,,12171
21190,SRX5317413,10x_Genomics,,,,5206
21191,SRX11523475,,,,,6567
21192,SRX10188985,10x_Genomics,,,,9704


In [16]:
# load cell x gene obs metadata
target_cols = ["dataset_id", "assay", "tissue", "tissue_general", "suspension_type", "disease", "cell_type"]

cxg_metadata = []
for organism in ["homo_sapiens", "mus_musculus"]:
    with cellxgene_census.open_soma(census_version="2025-01-30") as census:
        df = (
            census["census_data"][organism].obs
            .read(column_names = target_cols).concat()
            .group_by(target_cols).aggregate([([], 'count_all')]).to_pandas()
        )
        df["organism"] = organism
        cxg_metadata.append(df)

cxg_metadata = pd.concat(cxg_metadata).reset_index()
cxg_metadata

Unnamed: 0,index,dataset_id,assay,tissue,tissue_general,suspension_type,disease,cell_type,count_all,organism
0,0,0d2ee4ac-05ee-40b2-afb6-ebb584caa867,Smart-seq2,lung,lung,cell,normal,classical monocyte,198,homo_sapiens
1,1,0d2ee4ac-05ee-40b2-afb6-ebb584caa867,Smart-seq2,lung,lung,cell,normal,respiratory goblet cell,62,homo_sapiens
2,2,0d2ee4ac-05ee-40b2-afb6-ebb584caa867,Smart-seq2,lung,lung,cell,normal,basal cell,181,homo_sapiens
3,3,0d2ee4ac-05ee-40b2-afb6-ebb584caa867,Smart-seq2,lung,lung,cell,normal,pulmonary alveolar type 2 cell,749,homo_sapiens
4,4,0d2ee4ac-05ee-40b2-afb6-ebb584caa867,Smart-seq2,lung,lung,cell,normal,capillary endothelial cell,339,homo_sapiens
...,...,...,...,...,...,...,...,...,...,...
51176,4807,dcfd4feb-18a3-4b30-81d7-1b0c544a8ab3,sci-RNA-seq3,embryo,embryo,nucleus,normal,club cell,6264,mus_musculus
51177,4808,dcfd4feb-18a3-4b30-81d7-1b0c544a8ab3,sci-RNA-seq3,embryo,embryo,nucleus,normal,"activated CD4-negative, CD8-negative type I NK...",6621,mus_musculus
51178,4809,dcfd4feb-18a3-4b30-81d7-1b0c544a8ab3,sci-RNA-seq3,embryo,embryo,nucleus,normal,pulmonary alveolar type 1 cell,14321,mus_musculus
51179,4810,dcfd4feb-18a3-4b30-81d7-1b0c544a8ab3,sci-RNA-seq3,embryo,embryo,nucleus,normal,pulmonary alveolar type 2 cell,21443,mus_musculus


# Summary

# Sandbox

In [11]:
import pandas as pd
import cellxgene_census

In [None]:
with cellxgene_census.open_soma() as census:
    print(census["census_data"]["homo_sapiens"].obs.schema)

In [None]:
with cellxgene_census.open_soma() as census:
    print(census["census_data"]["mus_musculus"].obs.schema)

In [None]:
target_cols = ["dataset_id", "assay", "tissue", "tissue_general", "suspension_type", "disease"]

with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    cxg_metadata = pd.concat([
        census["census_data"]["homo_sapiens"].obs.read(column_names = target_cols).concat().to_pandas(),
        census["census_data"]["mus_musculus"].obs.read(column_names = target_cols).concat().to_pandas(),
    ])

cxg_metadata