# Genes measured in each cell (dataset presence matrix)

## Opening the Census

In [1]:
import numpy as np
from scipy import sparse
import cellxgene_census

In [2]:
# Pin the Census release so every run of this notebook reads the same data;
# a bare open_soma() follows the moving "stable" tag and emits a warning about it.
# Call census.close() once the analysis is finished to release the handle.
census = cellxgene_census.open_soma(census_version="2023-05-15")

The "stable" release is currently 2023-05-15. Specify 'census_version="2023-05-15"' in future calls to open_soma() to ensure data consistency.


## Fetching the IDs of the Census datasets

In [3]:
# Navigate to the human experiment inside the Census data collection
census_data = census["census_data"]
human = census_data["homo_sapiens"]

In [4]:
# Display the experiment; its repr lists the `ms` and `obs` members
human

<Experiment 's3://cellxgene-data-public/cell-census/2023-05-15/soma/census_data/homo_sapiens' (open for 'r') (2 items)
    'ms': 's3://cellxgene-data-public/cell-census/2023-05-15/soma/census_data/homo_sapiens/ms' (unopened)
    'obs': 's3://cellxgene-data-public/cell-census/2023-05-15/soma/census_data/homo_sapiens/obs' (unopened)>

In [5]:
# Pull the RNA measurement out of the experiment's `ms` collection
measurements = human.ms
human_rna = measurements["RNA"]

In [6]:
# Display the measurement; its repr lists `feature_dataset_presence_matrix`, `var` and `X`
human_rna

<Measurement 's3://cellxgene-data-public/cell-census/2023-05-15/soma/census_data/homo_sapiens/ms/RNA' (open for 'r') (3 items)
    'feature_dataset_presence_matrix': 's3://cellxgene-data-public/cell-census/2023-05-15/soma/census_data/homo_sapiens/ms/RNA/feature_dataset_presence_matrix' (unopened)
    'var': 's3://cellxgene-data-public/cell-census/2023-05-15/soma/census_data/homo_sapiens/ms/RNA/var' (unopened)
    'X': 's3://cellxgene-data-public/cell-census/2023-05-15/soma/census_data/homo_sapiens/ms/RNA/X' (unopened)>

In [7]:
# Load the census-wide datasets table into a pandas DataFrame
datasets_table = census["census_info"]["datasets"]
datasets_df = datasets_table.read().concat().to_pandas()

In [9]:
# Preview the first rows: one row per dataset, keyed by soma_joinid
datasets_df.head()

Unnamed: 0,soma_joinid,collection_id,collection_name,collection_doi,dataset_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count
0,0,6b701826-37bb-4356-9792-ff41fc4c3161,Abdominal White Adipose Tissue,,9d8e5dca-03a3-457d-b7fb-844c75735c83,22 integrated samples,9d8e5dca-03a3-457d-b7fb-844c75735c83.h5ad,72335
1,1,4195ab4c-20bd-4cd3-8b3d-65601277e731,A spatially resolved single cell genomic atlas...,,a6388a6f-6076-401b-9b30-7d4306a20035,scRNA-seq data - myeloid cells,a6388a6f-6076-401b-9b30-7d4306a20035.h5ad,30789
2,2,4195ab4c-20bd-4cd3-8b3d-65601277e731,A spatially resolved single cell genomic atlas...,,842c6f5d-4a94-4eef-8510-8c792d1124bc,scRNA-seq data - all cells,842c6f5d-4a94-4eef-8510-8c792d1124bc.h5ad,714331
3,3,4195ab4c-20bd-4cd3-8b3d-65601277e731,A spatially resolved single cell genomic atlas...,,74520626-b0ba-4ee9-86b5-714649554def,scRNA-seq data - T cells,74520626-b0ba-4ee9-86b5-714649554def.h5ad,76567
4,4,4195ab4c-20bd-4cd3-8b3d-65601277e731,A spatially resolved single cell genomic atlas...,,396a9124-fb20-4822-bf9c-e93fdf7c999a,scRNA-seq data - B cells,396a9124-fb20-4822-bf9c-e93fdf7c999a.h5ad,12510


## Fetching the dataset presence matrix

In [10]:
# Fetch the dataset-by-feature presence matrix for the human RNA measurement
presence_matrix = cellxgene_census.get_presence_matrix(
    census,
    organism="Homo sapiens",
    measurement_name="RNA",
)

In [11]:
# Display the sparse-matrix summary (shape, dtype, number of stored elements, format)
presence_matrix

<562x60664 sparse matrix of type '<class 'numpy.uint8'>'
	with 14829450 stored elements in Compressed Sparse Row format>

In [12]:
# Read the feature (var) table of the RNA measurement into a pandas DataFrame
var_table = human_rna.var
var_df = var_table.read().concat().to_pandas()

In [13]:
# Preview the first rows: one row per feature, keyed by soma_joinid
var_df.head()

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_length
0,0,ENSG00000243485,MIR1302-2HG,1021
1,1,ENSG00000237613,FAM138A,1219
2,2,ENSG00000186092,OR4F5,2618
3,3,ENSG00000238009,RP11-34P13.7,3726
4,4,ENSG00000239945,RP11-34P13.8,1319


## Identifying genes measured in a specific dataset

In [14]:
# Look up the soma_joinid of the gene of interest by its feature ID
var_joinid = var_df.loc[var_df["feature_id"] == "ENSG00000286096", "soma_joinid"]

In [15]:
# Show the matching feature joinid (a pandas Series)
var_joinid

60395    60395
Name: soma_joinid, dtype: int64

In [16]:
# Look up the soma_joinid of the dataset of interest by its dataset ID
dataset_mask = datasets_df["dataset_id"] == "97a17473-e2b1-4f31-a544-44a60773e2dd"
dataset_joinid = datasets_df.loc[dataset_mask, "soma_joinid"]

In [17]:
# Show the matching dataset joinid (a pandas Series)
dataset_joinid

448    448
Name: soma_joinid, dtype: int64

In [18]:
# Index the presence matrix by (dataset joinid, feature joinid). Both indexers are
# Series rather than scalars, so the result is still 2-D; take its [0, 0] entry
# to get the presence flag itself.
presence_cell = presence_matrix[dataset_joinid, var_joinid]
is_present = presence_cell[0, 0]

In [20]:
# Report the flag in words
presence_label = "present" if is_present else "not present"
print(f"Feature is {presence_label}.")

Feature is present.


## Identifying datasets that measured specific genes

In [22]:
# Find the gene's row in the var dataframe and keep its soma_joinid
gene_mask = var_df["feature_id"] == "ENSG00000286096"
var_joinid = var_df.loc[gene_mask, "soma_joinid"]

In [23]:
# Show the matching feature joinid (a pandas Series)
var_joinid

60395    60395
Name: soma_joinid, dtype: int64

In [24]:
# Rows of the presence matrix follow dataset joinids and columns follow var joinids.
# Take the column for the feature of interest; in COO form, the row coordinates of
# its stored entries are the joinids of the datasets that measured the feature.
feature_column = presence_matrix[:, var_joinid]
dataset_joinids = feature_column.tocoo().row

In [25]:
# Show the joinids of the datasets found for this feature
dataset_joinids

array([105, 109, 126, 131, 141, 142, 143, 144, 149, 162, 163, 172, 174,
       186, 209, 211, 215, 236, 239, 444, 445, 446, 447, 448], dtype=int32)

In [27]:
# Keep only the rows of the datasets dataframe whose joinid is in the list, and preview them
measured_mask = datasets_df["soma_joinid"].isin(dataset_joinids)
datasets_df[measured_mask].head()

Unnamed: 0,soma_joinid,collection_id,collection_name,collection_doi,dataset_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count
105,105,283d65eb-dd53-496d-adb7-7570c7caa443,Transcriptomic diversity of cell types across ...,10.1101/2022.10.12.511898,fe1a73ab-a203-45fd-84e9-0f7fd19efcbd,Dissection: Amygdaloid complex (AMY) - basolat...,fe1a73ab-a203-45fd-84e9-0f7fd19efcbd.h5ad,35285
109,109,283d65eb-dd53-496d-adb7-7570c7caa443,Transcriptomic diversity of cell types across ...,10.1101/2022.10.12.511898,f8dda921-5fb4-4c94-a654-c6fc346bfd6d,Dissection: Cerebral cortex (Cx) - Occipitotem...,f8dda921-5fb4-4c94-a654-c6fc346bfd6d.h5ad,31899
126,126,283d65eb-dd53-496d-adb7-7570c7caa443,Transcriptomic diversity of cell types across ...,10.1101/2022.10.12.511898,dd03ce70-3243-4c96-9561-330cc461e4d7,Dissection: Cerebral cortex (Cx) - Perirhinal ...,dd03ce70-3243-4c96-9561-330cc461e4d7.h5ad,23732
131,131,283d65eb-dd53-496d-adb7-7570c7caa443,Transcriptomic diversity of cell types across ...,10.1101/2022.10.12.511898,d2b5efc1-14c6-4b5f-bd98-40f9084872d7,Dissection: Tail of Hippocampus (HiT) - Caudal...,d2b5efc1-14c6-4b5f-bd98-40f9084872d7.h5ad,36886
141,141,283d65eb-dd53-496d-adb7-7570c7caa443,Transcriptomic diversity of cell types across ...,10.1101/2022.10.12.511898,c4b03352-af8d-492a-8d6b-40f304e0a122,Supercluster: Medium spiny neuron,c4b03352-af8d-492a-8d6b-40f304e0a122.h5ad,152189


## Identifying all genes measured in a dataset

In [28]:
# Select every dataset belonging to the collection of interest and keep the joinid(s)
collection_mask = datasets_df["collection_id"] == "17481d16-ee44-49e5-bcf0-28c0780d8c4a"
dataset_joinids = datasets_df.loc[collection_mask, "soma_joinid"]

In [29]:
# Show the joinid(s) of the selected dataset(s)
dataset_joinids

311    311
312    312
Name: soma_joinid, dtype: int64

In [30]:
# Slice the presence matrix by the first dimension, i.e., by dataset.
# When several datasets are selected, a gene measured by more than one of them
# appears once per dataset among the COO column coordinates, so de-duplicate to
# get each feature joinid exactly once (np.unique also sorts them ascending).
var_joinids = np.unique(presence_matrix[dataset_joinids, :].tocoo().col)

In [31]:
# Show the joinids of the features found for the selected dataset(s)
var_joinids

array([    3,     4,    13, ..., 58043, 58688, 58928], dtype=int32)

In [33]:
# Keep only the rows of the feature (var) dataframe whose joinid is in the list, and preview them
measured_features = var_df["soma_joinid"].isin(var_joinids)
var_df[measured_features].head()

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_length
3,3,ENSG00000238009,RP11-34P13.7,3726
4,4,ENSG00000239945,RP11-34P13.8,1319
13,13,ENSG00000229905,RP11-206L10.4,456
14,14,ENSG00000237491,LINC01409,8413
15,15,ENSG00000177757,FAM87B,1947
