# Fetch the datasets table

In [None]:
import cellxgene_census

# Open the Census pinned to an explicit release so that re-running the notebook
# reads the same data even after a newer "stable" build is published.
census = cellxgene_census.open_soma(census_version="2025-01-30")

# Load the census-wide datasets table. For convenience, index it on soma_joinid,
# which links this table to other census data.
census_datasets = (
    census["census_info"]["datasets"]
    .read()
    .concat()
    .to_pandas()
    .set_index("soma_joinid")
)

# All datasets in the "Human Brain Cell Atlas v1.0" collection
# (collection id == 283d65eb-dd53-496d-adb7-7570c7caa443).
# Bound to a name because a bare expression in the middle of a cell is
# evaluated and then discarded; only the cell's last expression is displayed.
hbca_datasets = census_datasets[census_datasets.collection_id == "283d65eb-dd53-496d-adb7-7570c7caa443"]

# Filter to only include the Microglia supercluster
# (dataset id == 700aed19-c16e-4ba8-9191-07da098a8626).
# This is the cell's last expression, so it is what gets rendered as output.
census_datasets[census_datasets.dataset_id == "700aed19-c16e-4ba8-9191-07da098a8626"]

The "stable" release is currently 2025-01-30. Specify 'census_version="2025-01-30"' in future calls to open_soma() to ensure data consistency.


Unnamed: 0_level_0,citation,collection_id,collection_name,collection_doi,collection_doi_label,dataset_id,dataset_version_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count
soma_joinid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
946,Publication: https://doi.org/10.1126/science.a...,283d65eb-dd53-496d-adb7-7570c7caa443,Human Brain Cell Atlas v1.0,10.1126/science.add7046,Siletti et al. (2023) Science,700aed19-c16e-4ba8-9191-07da098a8626,5ba4ebb8-93b4-4204-b2d0-eb0a1704fd7e,Supercluster: Microglia,700aed19-c16e-4ba8-9191-07da098a8626.h5ad,91838


# Identify all the genes measured in the dataset

In [17]:
# Grab the experiment containing human data, and the RNA measurement within it.
human = census["census_data"]["homo_sapiens"]
human_rna = human.ms["RNA"]

# The census-wide datasets table. soma_joinid is left as a regular column here
# because it is read as a column when selecting the datasets below.
datasets_df = census["census_info"]["datasets"].read().concat().to_pandas()

# Presence matrix for the human RNA measurement. It is sliced below with
# dataset joinids on the first dimension, and its column indices are then
# matched against the feature (var) soma_joinid values.
presence_matrix = cellxgene_census.get_presence_matrix(census, organism="Homo sapiens", measurement_name="RNA")

# Feature (var) metadata for the RNA measurement.
var_df = human_rna.var.read().concat().to_pandas()

# Slice the dataset(s) of interest, and get the joinid(s).
# NOTE(review): this filter matches on collection_id, so it selects every
# dataset in the collection rather than one specific dataset. If only a single
# dataset is wanted, filter on datasets_df.dataset_id instead -- confirm intent.
dataset_joinids = datasets_df.loc[datasets_df.collection_id == "283d65eb-dd53-496d-adb7-7570c7caa443"].soma_joinid

# Slice the presence matrix by the first dimension, i.e., by dataset, and take
# the column indices of the stored entries. A column may appear more than once
# when several datasets are selected; isin() below is unaffected by duplicates.
var_joinids = presence_matrix[dataset_joinids, :].tocoo().col

# From the feature (var) dataframe, slice out features which have a joinid in the list.
# This is the cell's only displayed result (last expression of the cell).
var_df.loc[var_df.soma_joinid.isin(var_joinids)]

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_type,feature_length,nnz,n_measured_obs
0,0,ENSG00000237491,LINC01409,lncRNA,1059,7958785,92252850
1,1,ENSG00000188976,NOC2L,protein_coding,1244,18685092,105784525
2,2,ENSG00000187642,PERM1,protein_coding,2765,664016,95688802
3,3,ENSG00000272512,ENSG00000272512.1,lncRNA,2086,927207,91649589
4,4,ENSG00000188290,HES4,protein_coding,961,19206715,105542421
...,...,...,...,...,...,...,...
60383,60383,ENSG00000285470,TUSC2P1,processed_pseudogene,1237,2150,23175703
60384,60384,ENSG00000286120,ENSG00000286120.1,transcribed_unprocessed_pseudogene,457,6,19614003
60385,60385,ENSG00000286173,ENSG00000286173.1,transcribed_unprocessed_pseudogene,457,3,19614003
60386,60386,ENSG00000286744,ENSG00000286744.1,unprocessed_pseudogene,102,74,19229873
