# Query the cellxgene census

Stephen Fleming

2025.10.08

Use the conda env `census`

Pull data from cellxgene programmatically.

In [41]:
import cellxgene_census
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Dataset info

In [42]:
with cellxgene_census.open_soma(census_version="2024-07-01") as census:
    info = census["census_info"]["summary"].read().concat().to_pandas()

info

Unnamed: 0,soma_joinid,label,value
0,0,census_schema_version,2.0.1
1,1,census_build_date,2024-05-20
2,2,dataset_schema_version,5.0.0
3,3,total_cell_count,115556140
4,4,unique_cell_count,60597966
5,5,number_donors_homo_sapiens,17651
6,6,number_donors_mus_musculus,4216


In [43]:
with cellxgene_census.open_soma() as census:
    info = census["census_info"]["summary_cell_counts"].read().concat().to_pandas()

info[info['category'] == 'all']

The "stable" release is currently 2024-07-01. Specify 'census_version="2024-07-01"' in future calls to open_soma() to ensure data consistency.


Unnamed: 0,soma_joinid,organism,category,label,ontology_term_id,total_cell_count,unique_cell_count
0,0,Homo sapiens,all,na,na,74322510,44265932
1189,1189,Mus musculus,all,na,na,41233630,16332034


In [44]:
with cellxgene_census.open_soma() as census:
    dataset_df = census["census_info"]["datasets"].read().concat().to_pandas()

dataset_df

The "stable" release is currently 2024-07-01. Specify 'census_version="2024-07-01"' in future calls to open_soma() to ensure data consistency.


Unnamed: 0,soma_joinid,citation,collection_id,collection_name,collection_doi,dataset_id,dataset_version_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count
0,0,Publication: https://doi.org/10.1002/hep4.1854...,44531dd9-1388-4416-a117-af0a99de2294,"Single-Cell, Single-Nucleus, and Spatial RNA S...",10.1002/hep4.1854,0895c838-e550-48a3-a777-dbcd35d30272,fb76c95f-0391-4fac-9fb9-082ce2430b59,Healthy human liver: B cells,0895c838-e550-48a3-a777-dbcd35d30272.h5ad,146
1,1,Publication: https://doi.org/10.1126/sciimmuno...,3a2af25b-2338-4266-aad3-aa8d07473f50,Single-cell analysis of human B cell maturatio...,10.1126/sciimmunol.abe6291,00ff600e-6e2e-4d76-846f-0eec4f0ae417,b6737a5e-9069-4dd6-9a57-92e17a746df9,Human tonsil nonlymphoid cells scRNA,00ff600e-6e2e-4d76-846f-0eec4f0ae417.h5ad,363
2,2,Publication: https://doi.org/10.1038/s41593-02...,180bff9c-c8a5-4539-b13b-ddbc00d643e6,Molecular characterization of selectively vuln...,10.1038/s41593-020-00764-7,bdacc907-7c26-419f-8808-969eab3ca2e8,0e02290f-b992-450b-8a19-554f73cd7f09,Molecular characterization of selectively vuln...,bdacc907-7c26-419f-8808-969eab3ca2e8.h5ad,3799
3,3,Publication: https://doi.org/10.1038/s41467-02...,bf325905-5e8e-42e3-933d-9a9053e9af80,Single-cell Atlas of common variable immunodef...,10.1038/s41467-022-29450-x,a5d95a42-0137-496f-8a60-101e17f263c8,40832710-d7b1-43fb-b2c2-1cd2255bc3ac,Steady-state B cells - scRNA-seq,a5d95a42-0137-496f-8a60-101e17f263c8.h5ad,1324
4,4,Publication: https://doi.org/10.1038/s41590-02...,93eebe82-d8c3-41bc-a906-63b5b5f24a9d,Single-cell proteo-genomic reference maps of t...,10.1038/s41590-021-01059-0,d3566d6a-a455-4a15-980f-45eb29114cab,eb6c070c-ff67-4c1f-8d4d-65f9fe2119ee,blood and bone marrow from a healthy young donor,d3566d6a-a455-4a15-980f-45eb29114cab.h5ad,15502
...,...,...,...,...,...,...,...,...,...,...
807,807,Publication: https://doi.org/10.1038/s41586-02...,45d5d2c3-bc28-4814-aed6-0bb6f0e11c82,A single-cell transcriptional timelapse of mou...,10.1038/s41586-024-07069-w,0bce33ed-455c-4e12-93f8-b7b04a2de4a1,ffeb40f8-d4b9-45c4-95cc-5e2674452ef8,Whole dataset: Normalized subset 2,0bce33ed-455c-4e12-93f8-b7b04a2de4a1.h5ad,2863559
808,808,Publication: https://doi.org/10.1101/2023.05.0...,1ca90a2d-2943-483d-b678-b809bf464c30,SEA-AD: Seattle Alzheimer’s Disease Brain Cell...,10.1101/2023.05.08.539485,c2876b1b-06d8-4d96-a56b-5304f815b99a,77dab54a-f2a8-42fc-8c1b-3fda90622ac7,Whole Taxonomy - MTG: Seattle Alzheimer's Dise...,c2876b1b-06d8-4d96-a56b-5304f815b99a.h5ad,1226855
809,809,Publication: https://doi.org/10.1101/2023.05.0...,1ca90a2d-2943-483d-b678-b809bf464c30,SEA-AD: Seattle Alzheimer’s Disease Brain Cell...,10.1101/2023.05.08.539485,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,b0cbf861-edd3-4add-a09a-c8698ed0cedf,Whole Taxonomy - DLPFC: Seattle Alzheimer's Di...,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3.h5ad,1309414
810,810,Publication: https://doi.org/10.1038/s41586-02...,45d5d2c3-bc28-4814-aed6-0bb6f0e11c82,A single-cell transcriptional timelapse of mou...,10.1038/s41586-024-07069-w,dcfa2614-7ca7-4d82-814c-350626eccb26,4ef3a829-b36e-413f-9a32-56f5a91b1041,Major cell cluster: Mesoderm,dcfa2614-7ca7-4d82-814c-350626eccb26.h5ad,3267338


## Obs

In [45]:
with cellxgene_census.open_soma() as census:

    # Reads SOMADataFrame as a slice
    cell_metadata = census["census_data"]["homo_sapiens"].obs.read(
        value_filter = f"""(is_primary_data == True)""",
        column_names = ["assay", "cell_type", "cell_type_ontology_term_id", 
                        "tissue", "tissue_ontology_term_id", "dataset_id", "donor_id", 
                        "tissue_general", "suspension_type", 
                        "disease", "disease_ontology_term_id", 
                        "development_stage", "development_stage_ontology_term_id", 
                        "raw_sum",
                        "sex", "soma_joinid"]
    )

    # Concatenates results to pyarrow.Table
    cell_metadata = cell_metadata.concat()

    # Converts to pandas.DataFrame
    cell_metadata = cell_metadata.to_pandas()

The "stable" release is currently 2024-07-01. Specify 'census_version="2024-07-01"' in future calls to open_soma() to ensure data consistency.


In [46]:
cell_metadata.columns

Index(['assay', 'cell_type', 'cell_type_ontology_term_id', 'tissue',
       'tissue_ontology_term_id', 'dataset_id', 'donor_id', 'tissue_general',
       'suspension_type', 'disease', 'disease_ontology_term_id',
       'development_stage', 'development_stage_ontology_term_id', 'raw_sum',
       'sex', 'soma_joinid', 'is_primary_data'],
      dtype='object')

In [47]:
sorted(cell_metadata.tissue.unique())

['Brodmann (1909) area 4',
 'adipose tissue',
 'adrenal gland',
 'adrenal tissue',
 'alveolus of lung',
 'ampulla of uterine tube',
 'angular gyrus',
 'anterior cingulate cortex',
 'anterior cingulate gyrus',
 'anterior part of tongue',
 'anterior wall of left ventricle',
 'aorta',
 'apex of heart',
 'arm skin',
 'artery',
 'ascending colon',
 'axilla',
 'basal ganglion',
 'basal zone of heart',
 'bladder organ',
 'blood',
 'body of stomach',
 'bone marrow',
 'bone spine',
 'brain',
 'brain white matter',
 'breast',
 'bronchopulmonary lymph node',
 'bronchus',
 'caecum',
 'caecum epithelium',
 'cardia of stomach',
 'cardiac atrium',
 'cardiac ventricle',
 'caudal ganglionic eminence',
 'caudate lobe of liver',
 'caudate nucleus',
 'cerebellar cortex',
 'cerebellum',
 'cerebellum vermis lobule',
 'cerebral cortex',
 'cerebral nuclei',
 'cervical lymph node',
 'cervical spinal cord white matter',
 'chorionic villus',
 'choroid plexus',
 'ciliary body',
 'cingulate cortex',
 'colon',
 'co

### Cell types per donor

In [48]:
cell_metadata[['cell_type', 'cell_type_ontology_term_id']].drop_duplicates().head()

Unnamed: 0,cell_type,cell_type_ontology_term_id
0,naive B cell,CL:0000788
5,unswitched memory B cell,CL:0000970
14,class switched memory B cell,CL:0000972
1324,plasma cell,CL:0000786
1329,erythroid progenitor cell,CL:0000038


In [49]:
cell_metadata['cell_type_unknown'] = cell_metadata['cell_type'] == 'unknown'

In [50]:
cell_metadata['cell_type_unknown'].sum() / len(cell_metadata)

0.02986755141629007

In [51]:
donor_cell_type_df1 = (
    cell_metadata[['dataset_id', 'donor_id', 'cell_type']]
    .groupby(['dataset_id', 'donor_id'], observed=True)
    .agg(lambda s: s.nunique())
).rename(columns={'cell_type': 'n_unique_cell_types'})


donor_cell_type_df2 = (
    cell_metadata[['dataset_id', 'donor_id', 'cell_type_unknown']]
    .groupby(['dataset_id', 'donor_id'], observed=True)
    .mean()
).rename(columns={'cell_type_unknown': 'cell_type_unknown_fraction'})



donor_cell_type_df = pd.concat([donor_cell_type_df1, donor_cell_type_df2], axis=1)

In [52]:
donor_cell_type_df

Unnamed: 0_level_0,Unnamed: 1_level_0,n_unique_cell_types,cell_type_unknown_fraction
dataset_id,donor_id,Unnamed: 2_level_1,Unnamed: 3_level_1
00476f9f-ebc1-4b72-b541-32f912ce36ea,H19.30.001,11,0.0
01209dce-3575-4bed-b1df-129f57fbc031,Donor1,6,0.0
01209dce-3575-4bed-b1df-129f57fbc031,Donor2,6,0.0
01209dce-3575-4bed-b1df-129f57fbc031,DonorA,4,0.0
01209dce-3575-4bed-b1df-129f57fbc031,DonorB,4,0.0
...,...,...,...
fe4b89d5-461e-440c-a5a8-621b37b122c0,195045,8,0.0
fe4b89d5-461e-440c-a5a8-621b37b122c0,199129,8,0.0
ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded,H18.30.002,9,0.0
ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded,H19.30.001,8,0.0


In [53]:
donor_cell_type_df.sort_values('cell_type_unknown_fraction', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_unique_cell_types,cell_type_unknown_fraction
dataset_id,donor_id,Unnamed: 2_level_1,Unnamed: 3_level_1
7bb64315-9e5a-41b9-9235-59acf9642a3e,NGCII511,1,1.0
7bb64315-9e5a-41b9-9235-59acf9642a3e,NGCII545,1,1.0
7bb64315-9e5a-41b9-9235-59acf9642a3e,NGCII529,1,1.0
7bb64315-9e5a-41b9-9235-59acf9642a3e,NGCII533,1,1.0
7bb64315-9e5a-41b9-9235-59acf9642a3e,NGCII536,1,1.0


In [54]:
# how many (datasets, donors) have zero cell_type annotations?

donor_cell_type_df[donor_cell_type_df['cell_type_unknown_fraction'] == 1]

Unnamed: 0_level_0,Unnamed: 1_level_0,n_unique_cell_types,cell_type_unknown_fraction
dataset_id,donor_id,Unnamed: 2_level_1,Unnamed: 3_level_1
0c9a8cfb-6649-4d52-b418-6d8e56bd7afe,C66,1,1.0
7bb64315-9e5a-41b9-9235-59acf9642a3e,NGCII498,1,1.0
7bb64315-9e5a-41b9-9235-59acf9642a3e,NGCII499,1,1.0
7bb64315-9e5a-41b9-9235-59acf9642a3e,NGCII502,1,1.0
7bb64315-9e5a-41b9-9235-59acf9642a3e,NGCII509,1,1.0
7bb64315-9e5a-41b9-9235-59acf9642a3e,NGCII510,1,1.0
7bb64315-9e5a-41b9-9235-59acf9642a3e,NGCII511,1,1.0
7bb64315-9e5a-41b9-9235-59acf9642a3e,NGCII512,1,1.0
7bb64315-9e5a-41b9-9235-59acf9642a3e,NGCII513,1,1.0
7bb64315-9e5a-41b9-9235-59acf9642a3e,NGCII514,1,1.0


In [55]:
dataset_df[dataset_df['dataset_id'].isin(
    donor_cell_type_df[donor_cell_type_df['cell_type_unknown_fraction'] == 1].reset_index()['dataset_id']
)]

Unnamed: 0,soma_joinid,citation,collection_id,collection_name,collection_doi,dataset_id,dataset_version_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count
11,11,Publication: https://doi.org/10.1016/j.jhep.20...,0c8a364b-97b5-4cc8-a593-23c38c6f0ac5,Single-cell and spatial transcriptomics charac...,10.1016/j.jhep.2023.12.023,0c9a8cfb-6649-4d52-b418-6d8e56bd7afe,6ff309fa-e9f6-405d-b24e-3c35528f154e,Cholangiocytes from human healthy donor liver ...,0c9a8cfb-6649-4d52-b418-6d8e56bd7afe.h5ad,1011
548,548,Publication: https://doi.org/10.1038/s41593-02...,b0f0b447-ac37-45b0-b1bf-5c0b7d871120,Single-cell genomic profiling of human dopamin...,10.1038/s41593-022-01061-1,dea1aa78-c0a2-413f-b375-f91cce49e4d0,51affad7-08ed-43b3-93ad-56640f4b8910,Human Nurr-Negative Nuclei 10x scRNA-seq,dea1aa78-c0a2-413f-b375-f91cce49e4d0.h5ad,104097
711,711,Publication: https://doi.org/10.1158/2159-8290...,a18474f4-ff1e-4864-af69-270b956cee5b,Single-cell RNA sequencing unifies development...,10.1158/2159-8290.cd-22-0824,7bb64315-9e5a-41b9-9235-59acf9642a3e,9ec33c56-14a4-4815-b747-34ee38c850e8,UMAP of Cancer Data integration,7bb64315-9e5a-41b9-9235-59acf9642a3e.h5ad,293823


### Number of cells

In [56]:
num_cell_df = (
    cell_metadata[['dataset_id', 'donor_id', 'cell_type']]
    .groupby(['dataset_id', 'donor_id'], observed=True)
    .count()
).rename(columns={'cell_type': 'n_cells'})
num_cell_df

Unnamed: 0_level_0,Unnamed: 1_level_0,n_cells
dataset_id,donor_id,Unnamed: 2_level_1
00476f9f-ebc1-4b72-b541-32f912ce36ea,H19.30.001,10099
01209dce-3575-4bed-b1df-129f57fbc031,Donor1,16185
01209dce-3575-4bed-b1df-129f57fbc031,Donor2,18066
01209dce-3575-4bed-b1df-129f57fbc031,DonorA,9193
01209dce-3575-4bed-b1df-129f57fbc031,DonorB,8432
...,...,...
fe4b89d5-461e-440c-a5a8-621b37b122c0,195045,1864
fe4b89d5-461e-440c-a5a8-621b37b122c0,199129,1101
ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded,H18.30.002,5728
ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded,H19.30.001,9766


### Tissue

In [58]:
tissue_coarsener = np.load('../data/pseudobulk/tissue_coarsen_map.npy', allow_pickle=True).item()

{k: v for i, (k, v) in enumerate({a: b for a, b in tissue_coarsener.items() if b is not None}.items()) if i < 5}

{'UBERON_0000115': 'UBERON_0002048',
 'UBERON_0001264': 'UBERON_0001264',
 'UBERON_0002129': 'UBERON_0000955',
 'UBERON_0000006': 'UBERON_0001264',
 'UBERON_0002512': 'UBERON_0000992'}

In [59]:
tissue_dist = np.load('../data/pseudobulk/tissue_distances_from_root.npy', allow_pickle=True).item()
{k: v for i, (k, v) in enumerate(tissue_dist.items()) if i < 5}

{'UBERON_0000061': 2,
 'UBERON_0000122': 4,
 'UBERON_0000465': 1,
 'UBERON_0001637': 6,
 'UBERON_0001638': 6}

In [57]:
tissue_coarsener = pd.read_csv("../data/pseudobulk/uberon_ontology_map.tsv",sep="\t",skiprows=1)
tissue_coarsener_dict = dict(zip(tissue_coarsener.iloc[:,1],tissue_coarsener.iloc[:,0]))
tissue_coarsener_dict_reverse = dict(zip(tissue_coarsener.iloc[:,1],tissue_coarsener.iloc[:,0]))

cell_metadata['tissue_coarse_ontology_id'] = cell_metadata['tissue_ontology_term_id'].map(tissue_coarsener_dict)

In [60]:
tissue_name_lookup = np.load('../data/pseudobulk/tissue_name_lookup.npy', allow_pickle=True).item()
print(tissue_name_lookup)
#{k: v for i, (k, v) in enumerate(tissue_name_lookup.items()) if i < 5}

{'UBERON_0000061': 'anatomical structure', 'UBERON_0000122': 'neuron projection bundle', 'UBERON_0000465': 'material anatomical entity', 'UBERON_0001637': 'artery', 'UBERON_0001638': 'vein', 'UBERON_0010000': 'multicellular anatomical structure', 'UBERON_0001062': 'anatomical entity', 'UBERON_0000475': 'organism subdivision', 'UBERON_0010912': 'subdivision of skeleton', 'UBERON_0004111': 'anatomical conduit', 'UBERON_0000463': 'organism substance', 'UBERON_0001017': 'central nervous system', 'UBERON_0002418': 'cartilage tissue', 'UBERON_0005176': 'tooth enamel organ', 'UBERON_0001763': 'odontogenic papilla', 'UBERON_0001752': 'enamel', 'UBERON_0001751': 'dentine', 'UBERON_0001753': 'cementum', 'UBERON_0008883': 'osteoid', 'UBERON_0000483': 'epithelium', 'UBERON_0000058': 'duct', 'UBERON_0004638': 'blood vessel endothelium', 'UBERON_0000179': 'haemolymphatic fluid', 'UBERON_0002048': 'lung', 'UBERON_0000115': 'lung epithelium', 'UBERON_0001264': 'pancreas', 'UBERON_0001281': 'hepatic si

In [61]:
# list all the "coarse" tissues: the organs

[tissue_name_lookup.get(t) for t in np.unique(list({a: b for a, b in tissue_coarsener.items() if b is not None}.values()))]

['nose',
 'lymph node',
 'intestine',
 'blood',
 'breast',
 'stomach',
 'heart',
 'brain',
 'eye',
 'ovary',
 'uterus',
 'adipose tissue',
 'esophagus',
 'pancreas',
 'tongue',
 'placenta',
 'thyroid gland',
 'lung',
 'skin of body',
 'spleen',
 'liver',
 'gallbladder',
 'kidney',
 'spinal cord',
 'thymus',
 'bone marrow',
 'fallopian tube',
 'bladder organ']

In [62]:
cell_metadata['tissue_coarse_ontology_id'] = cell_metadata['tissue_ontology_term_id'].str.replace(':', '_').map(tissue_coarsener)
cell_metadata['tissue_coarse'] = cell_metadata['tissue_coarse_ontology_id'].map(tissue_name_lookup)

cell_metadata['tissue_name_ont_coarsename_coarseont'] = list(zip(
    cell_metadata['tissue'], 
    cell_metadata['tissue_ontology_term_id'],
    cell_metadata['tissue_coarse'],
    cell_metadata['tissue_coarse_ontology_id'],
))

tissue_df = (
    cell_metadata[['dataset_id', 'donor_id', 'tissue_name_ont_coarsename_coarseont']]
    .groupby(['dataset_id', 'donor_id'], observed=True)
    .agg(lambda s: s.unique().tolist())
)
tissue_df['n_tissues'] = tissue_df['tissue_name_ont_coarsename_coarseont'].apply(len)
tissue_df

Unnamed: 0_level_0,Unnamed: 1_level_0,tissue_name_ont_coarsename_coarseont,n_tissues
dataset_id,donor_id,Unnamed: 2_level_1,Unnamed: 3_level_1
00476f9f-ebc1-4b72-b541-32f912ce36ea,H19.30.001,"[(hypothalamus, UBERON:0001898, brain, UBERON_...",1
01209dce-3575-4bed-b1df-129f57fbc031,Donor1,"[(lower lobe of left lung, UBERON:0008953, lun...",3
01209dce-3575-4bed-b1df-129f57fbc031,Donor2,"[(lower lobe of left lung, UBERON:0008953, lun...",3
01209dce-3575-4bed-b1df-129f57fbc031,DonorA,"[(venous blood, UBERON:0013756, blood, UBERON_...",1
01209dce-3575-4bed-b1df-129f57fbc031,DonorB,"[(venous blood, UBERON:0013756, blood, UBERON_...",1
...,...,...,...
fe4b89d5-461e-440c-a5a8-621b37b122c0,195045,"[(ileal epithelium, UBERON:0008345, intestine,...",2
fe4b89d5-461e-440c-a5a8-621b37b122c0,199129,"[(ileum, UBERON:0002116, intestine, UBERON_000...",1
ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded,H18.30.002,"[(cerebral cortex, UBERON:0000956, brain, UBER...",1
ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded,H19.30.001,"[(cerebral cortex, UBERON:0000956, brain, UBER...",1


### Development stage

In [63]:
devstage_coarsener = np.load('../data/pseudobulk/devstage_coarsen_map.npy', allow_pickle=True).item()
{k: v for i, (k, v) in enumerate({a: b for a, b in devstage_coarsener.items() if b is not None}.items()) if i < 5}

{'HsapDv_0000002': 'HsapDv_0000002',
 'HsapDv_0000003': 'HsapDv_0000002',
 'HsapDv_0000005': 'HsapDv_0000002',
 'HsapDv_0000006': 'HsapDv_0000002',
 'HsapDv_0000007': 'HsapDv_0000002'}

In [64]:
devstage_name_lookup = np.load('../data/pseudobulk/devstage_name_lookup.npy', allow_pickle=True).item()
{k: v for i, (k, v) in enumerate(devstage_name_lookup.items()) if i < 5}

{'HsapDv_0000000': 'life cycle stage',
 'HsapDv_0000001': 'life cycle',
 'HsapDv_0000002': 'embryonic stage',
 'HsapDv_0000045': 'prenatal stage',
 'HsapDv_0000003': 'Carnegie stage 01'}

In [65]:
# list all the "coarse" tissues: the organs

[devstage_name_lookup.get(t) for t in np.unique(list({a: b for a, b in devstage_coarsener.items() if b is not None}.values()))]

['embryonic stage',
 'fetal stage',
 '80 year-old and over stage',
 'pediatric stage',
 'young adult stage',
 'middle aged stage',
 '60-79 year-old stage']

In [66]:
cell_metadata.columns

Index(['assay', 'cell_type', 'cell_type_ontology_term_id', 'tissue',
       'tissue_ontology_term_id', 'dataset_id', 'donor_id', 'tissue_general',
       'suspension_type', 'disease', 'disease_ontology_term_id',
       'development_stage', 'development_stage_ontology_term_id', 'raw_sum',
       'sex', 'soma_joinid', 'is_primary_data', 'cell_type_unknown',
       'tissue_coarse_ontology_id', 'tissue_coarse',
       'tissue_name_ont_coarsename_coarseont'],
      dtype='object')

In [67]:
cell_metadata['coarse_development_stage_ontology_id'] = cell_metadata['development_stage_ontology_term_id'].str.replace(':', '_').map(devstage_coarsener)
cell_metadata['coarse_development_stage'] = cell_metadata['coarse_development_stage_ontology_id'].map(devstage_name_lookup)

development_df = (
    cell_metadata[['dataset_id', 'donor_id', 'development_stage', 'development_stage_ontology_term_id', 
                   'coarse_development_stage', 'coarse_development_stage_ontology_id']]
    .groupby(['dataset_id', 'donor_id'], observed=True)
    .agg(lambda s: s.unique().tolist()[0])
)
development_df

Unnamed: 0_level_0,Unnamed: 1_level_0,development_stage,development_stage_ontology_term_id,coarse_development_stage,coarse_development_stage_ontology_id
dataset_id,donor_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00476f9f-ebc1-4b72-b541-32f912ce36ea,H19.30.001,42-year-old human stage,HsapDv:0000136,middle aged stage,HsapDv_0000267
01209dce-3575-4bed-b1df-129f57fbc031,Donor1,65-year-old human stage,HsapDv:0000159,60-79 year-old stage,HsapDv_0000272
01209dce-3575-4bed-b1df-129f57fbc031,Donor2,52-year-old human stage,HsapDv:0000146,middle aged stage,HsapDv_0000267
01209dce-3575-4bed-b1df-129f57fbc031,DonorA,unknown,unknown,,
01209dce-3575-4bed-b1df-129f57fbc031,DonorB,unknown,unknown,,
...,...,...,...,...,...
fe4b89d5-461e-440c-a5a8-621b37b122c0,195045,seventh decade human stage,HsapDv:0000241,60-79 year-old stage,HsapDv_0000272
fe4b89d5-461e-440c-a5a8-621b37b122c0,199129,eighth decade human stage,HsapDv:0000242,60-79 year-old stage,HsapDv_0000272
ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded,H18.30.002,50-year-old human stage,HsapDv:0000144,middle aged stage,HsapDv_0000267
ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded,H19.30.001,42-year-old human stage,HsapDv:0000136,middle aged stage,HsapDv_0000267


In [68]:
vc = development_df[development_df['coarse_development_stage'].isnull()]['development_stage'].value_counts()
vc[vc > 0]

development_stage
unknown              608
human adult stage    310
mature stage         147
human aged stage      16
Name: count, dtype: int64

### Disease

In [69]:
disease_df = (
    cell_metadata[['dataset_id', 'donor_id', 'disease', 'disease_ontology_term_id']]
    .groupby(['dataset_id', 'donor_id'], observed=True)
    .agg(lambda s: s.unique().tolist()[0])
)
disease_df

Unnamed: 0_level_0,Unnamed: 1_level_0,disease,disease_ontology_term_id
dataset_id,donor_id,Unnamed: 2_level_1,Unnamed: 3_level_1
00476f9f-ebc1-4b72-b541-32f912ce36ea,H19.30.001,normal,PATO:0000461
01209dce-3575-4bed-b1df-129f57fbc031,Donor1,normal,PATO:0000461
01209dce-3575-4bed-b1df-129f57fbc031,Donor2,normal,PATO:0000461
01209dce-3575-4bed-b1df-129f57fbc031,DonorA,normal,PATO:0000461
01209dce-3575-4bed-b1df-129f57fbc031,DonorB,normal,PATO:0000461
...,...,...,...
fe4b89d5-461e-440c-a5a8-621b37b122c0,195045,Crohn disease,MONDO:0005011
fe4b89d5-461e-440c-a5a8-621b37b122c0,199129,normal,PATO:0000461
ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded,H18.30.002,normal,PATO:0000461
ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded,H19.30.001,normal,PATO:0000461


## Sex

In [70]:
sex_df = (
    cell_metadata[['dataset_id', 'donor_id', 'sex']]
    .groupby(['dataset_id', 'donor_id'], observed=True)
    .agg(lambda s: s.unique().tolist()[0])
)


## Merged table

In [71]:
cell_metadata.columns

Index(['assay', 'cell_type', 'cell_type_ontology_term_id', 'tissue',
       'tissue_ontology_term_id', 'dataset_id', 'donor_id', 'tissue_general',
       'suspension_type', 'disease', 'disease_ontology_term_id',
       'development_stage', 'development_stage_ontology_term_id', 'raw_sum',
       'sex', 'soma_joinid', 'is_primary_data', 'cell_type_unknown',
       'tissue_coarse_ontology_id', 'tissue_coarse',
       'tissue_name_ont_coarsename_coarseont',
       'coarse_development_stage_ontology_id', 'coarse_development_stage'],
      dtype='object')

In [72]:
dfs = [donor_cell_type_df, num_cell_df, tissue_df, development_df, disease_df, sex_df]
df = pd.concat(dfs, axis=1)
print(df.shape)

df.reset_index()
#df.to_pickle("../data/pseudobulk/cell_metadata_coarsened.pkl")


(7305, 12)


Unnamed: 0,dataset_id,donor_id,n_unique_cell_types,cell_type_unknown_fraction,n_cells,tissue_name_ont_coarsename_coarseont,n_tissues,development_stage,development_stage_ontology_term_id,coarse_development_stage,coarse_development_stage_ontology_id,disease,disease_ontology_term_id,sex
0,00476f9f-ebc1-4b72-b541-32f912ce36ea,H19.30.001,11,0.0,10099,"[(hypothalamus, UBERON:0001898, brain, UBERON_...",1,42-year-old human stage,HsapDv:0000136,middle aged stage,HsapDv_0000267,normal,PATO:0000461,male
1,01209dce-3575-4bed-b1df-129f57fbc031,Donor1,6,0.0,16185,"[(lower lobe of left lung, UBERON:0008953, lun...",3,65-year-old human stage,HsapDv:0000159,60-79 year-old stage,HsapDv_0000272,normal,PATO:0000461,male
2,01209dce-3575-4bed-b1df-129f57fbc031,Donor2,6,0.0,18066,"[(lower lobe of left lung, UBERON:0008953, lun...",3,52-year-old human stage,HsapDv:0000146,middle aged stage,HsapDv_0000267,normal,PATO:0000461,male
3,01209dce-3575-4bed-b1df-129f57fbc031,DonorA,4,0.0,9193,"[(venous blood, UBERON:0013756, blood, UBERON_...",1,unknown,unknown,,,normal,PATO:0000461,male
4,01209dce-3575-4bed-b1df-129f57fbc031,DonorB,4,0.0,8432,"[(venous blood, UBERON:0013756, blood, UBERON_...",1,unknown,unknown,,,normal,PATO:0000461,male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7300,fe4b89d5-461e-440c-a5a8-621b37b122c0,195045,8,0.0,1864,"[(ileal epithelium, UBERON:0008345, intestine,...",2,seventh decade human stage,HsapDv:0000241,60-79 year-old stage,HsapDv_0000272,Crohn disease,MONDO:0005011,female
7301,fe4b89d5-461e-440c-a5a8-621b37b122c0,199129,8,0.0,1101,"[(ileum, UBERON:0002116, intestine, UBERON_000...",1,eighth decade human stage,HsapDv:0000242,60-79 year-old stage,HsapDv_0000272,normal,PATO:0000461,male
7302,ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded,H18.30.002,9,0.0,5728,"[(cerebral cortex, UBERON:0000956, brain, UBER...",1,50-year-old human stage,HsapDv:0000144,middle aged stage,HsapDv_0000267,normal,PATO:0000461,male
7303,ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded,H19.30.001,8,0.0,9766,"[(cerebral cortex, UBERON:0000956, brain, UBER...",1,42-year-old human stage,HsapDv:0000136,middle aged stage,HsapDv_0000267,normal,PATO:0000461,male


In [73]:
df.columns

Index(['n_unique_cell_types', 'cell_type_unknown_fraction', 'n_cells',
       'tissue_name_ont_coarsename_coarseont', 'n_tissues',
       'development_stage', 'development_stage_ontology_term_id',
       'coarse_development_stage', 'coarse_development_stage_ontology_id',
       'disease', 'disease_ontology_term_id', 'sex'],
      dtype='object')