# Querying data using the gget cellxgene module

gget is a free, open-source command-line tool and Python package that enables efficient querying of genomic databases. gget consists of a collection of separate but interoperable modules, each designed to facilitate one type of database querying in a single line of code.

In [75]:
import gget
# One-time setup for the cellxgene module: per the log output of this cell,
# this installs the cellxgene-census package via pip.
gget.setup("cellxgene")

# Display all options of the cellxgene gget module
help(gget.cellxgene)

20:35:20 - INFO - Installing cellxgene-census package (requires pip).
20:35:21 - INFO - cellxgene_census installed succesfully.


Help on function cellxgene in module gget.gget_cellxgene:

cellxgene(species='homo_sapiens', gene=None, ensembl=False, column_names=['dataset_id', 'assay', 'suspension_type', 'sex', 'tissue_general', 'tissue', 'cell_type'], meta_only=False, tissue=None, cell_type=None, development_stage=None, disease=None, sex=None, is_primary_data=True, dataset_id=None, tissue_general_ontology_term_id=None, tissue_general=None, assay_ontology_term_id=None, assay=None, cell_type_ontology_term_id=None, development_stage_ontology_term_id=None, disease_ontology_term_id=None, donor_id=None, self_reported_ethnicity_ontology_term_id=None, self_reported_ethnicity=None, sex_ontology_term_id=None, suspension_type=None, tissue_ontology_term_id=None, census_version='stable', verbose=True, out=None)
    Query data from CZ CELLxGENE Discover (https://cellxgene.cziscience.com/) using the
    CZ CELLxGENE Discover Census (https://github.com/chanzuckerberg/cellxgene-census).

    NOTE: Querying large datasets requires

# Fetch an AnnData object by selecting gene(s), tissue(s) and cell type(s)

In [76]:
# Fetch expression of three complement-pathway genes in healthy human brain
# microglia as an AnnData object (cells x genes).
# Further filters (e.g. dataset_id, assay) are listed in the help() output above.
adata_homo = gget.cellxgene(
    species="homo_sapiens",
    gene=["CFH", "C5AR1", "C3AR1"],
    # Genes are passed as symbols, so ensembl stays False; set ensembl=True
    # only when passing Ensembl IDs instead (e.g. "ENSG00000000971" for CFH).
    ensembl=False,
    meta_only=False,  # return the expression matrix, not just cell metadata
    tissue="brain",
    cell_type="microglial cell",
    disease="normal",
    suspension_type="cell",
    # Pin the Census release rather than the floating "stable" alias so the
    # result stays reproducible once a newer release becomes "stable".
    census_version="2025-01-30",
)

adata_homo

20:35:21 - INFO - Fetching AnnData object from CZ CELLxGENE Discover. This might take a few minutes...
The "stable" release is currently 2025-01-30. Specify 'census_version="2025-01-30"' in future calls to open_soma() to ensure data consistency.
  adata = cellxgene_census.get_anndata(


AnnData object with n_obs × n_vars = 128 × 3
    obs: 'dataset_id', 'assay', 'suspension_type', 'sex', 'tissue_general', 'tissue', 'cell_type', 'is_primary_data', 'disease'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_type', 'feature_length', 'nnz', 'n_measured_obs'

In [131]:
# NOTE(review): the saved output of this cell contains a 'species' column,
# which is only added to adata_homo.var in a later cell (In [127]); this cell
# was executed afterwards (In [131]). On a fresh top-to-bottom run the column
# will not be present at this point.
adata_homo.var

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_type,feature_length,nnz,n_measured_obs,species
0,1062,ENSG00000000971,CFH,protein_coding,3057,6810172,105893241,homo_sapiens
1,7759,ENSG00000171860,C3AR1,protein_coding,2000,4295412,105910908,homo_sapiens
2,11952,ENSG00000197405,C5AR1,protein_coding,781,8688032,105817790,homo_sapiens


In [78]:
# Same query as for human, but against the mouse Census: expression of the
# three complement-pathway genes in healthy mouse brain microglia.
# Further filters (e.g. dataset_id, assay) are listed in the help() output above.
adata_mus = gget.cellxgene(
    species="mus_musculus",  # Let's switch up the species
    gene=["Cfh", "C5ar1", "C3ar1"],  # mouse gene symbols
    # Genes are passed as symbols, so ensembl stays False; set ensembl=True
    # only when passing Ensembl IDs instead (e.g. "ENSMUSG00000026365" for Cfh).
    ensembl=False,
    meta_only=False,  # return the expression matrix, not just cell metadata
    tissue="brain",
    cell_type="microglial cell",
    disease="normal",
    suspension_type="cell",
    # Pin the Census release rather than the floating "stable" alias so the
    # result stays reproducible once a newer release becomes "stable".
    census_version="2025-01-30",
)

adata_mus

20:36:25 - INFO - Fetching AnnData object from CZ CELLxGENE Discover. This might take a few minutes...
The "stable" release is currently 2025-01-30. Specify 'census_version="2025-01-30"' in future calls to open_soma() to ensure data consistency.
  adata = cellxgene_census.get_anndata(


AnnData object with n_obs × n_vars = 13268 × 3
    obs: 'dataset_id', 'assay', 'suspension_type', 'sex', 'tissue_general', 'tissue', 'cell_type', 'is_primary_data', 'disease'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_type', 'feature_length', 'nnz', 'n_measured_obs'

# Convert mus_musculus 'feature_name' entries to match homo_sapiens

In [126]:
import anndata as ad
import pandas as pd  # not needed by this cell any more; kept in case later cells rely on `pd`

# The human and mouse symbols queried above differ only by letter case
# (CFH/Cfh, C5AR1/C5ar1, C3AR1/C3ar1), so upper-casing the mouse symbols makes
# 'feature_name' comparable between the two objects.
# NOTE: this is a shortcut for these three genes, not a general orthology
# mapping -- many mouse/human orthologs do not share a symbol.
mus_var = adata_mus.var.copy()
mus_var['feature_name'] = mus_var['feature_name'].str.upper()

# Densify the expression matrix only while it is still sparse. After the first
# run adata_mus.X is already a dense array without .toarray(), so this guard
# keeps the cell re-runnable.
mus_X = adata_mus.X.toarray() if hasattr(adata_mus.X, "toarray") else adata_mus.X

# Rebuild the AnnData object with the renamed features (same obs, same matrix values)
adata_mus = ad.AnnData(X=mus_X, var=mus_var, obs=adata_mus.obs.copy())

# Verify the changes
adata_mus.var

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_type,feature_length,nnz,n_measured_obs
0,1029,ENSMUSG00000040552,C3AR1,protein_coding,4305,317588,42636109
1,13238,ENSMUSG00000049130,C5AR1,protein_coding,2481,422091,42757016
2,15169,ENSMUSG00000026365,CFH,protein_coding,3923,2085471,42776863


# Add column for species

In [127]:
# Tag every gene (var) and every cell (obs) with its species of origin so the
# source of each row stays identifiable in both objects.
adata_homo.var['species'] = adata_homo.obs['species'] = "homo_sapiens"
adata_mus.var['species'] = adata_mus.obs['species'] = "mus_musculus"

# Combine adata_homo and adata_mus

adata_homo.var:
   soma_joinid       feature_id feature_name    feature_type  feature_length  \
0         1062  ENSG00000000971          CFH  protein_coding            3057   
1         7759  ENSG00000171860        C3AR1  protein_coding            2000   
2        11952  ENSG00000197405        C5AR1  protein_coding             781   

       nnz  n_measured_obs       species  
0  6810172       105893241  homo_sapiens  
1  4295412       105910908  homo_sapiens  
2  8688032       105817790  homo_sapiens  
adata_mus.var:
   soma_joinid          feature_id feature_name    feature_type  \
0         1029  ENSMUSG00000040552        C3AR1  protein_coding   
1        13238  ENSMUSG00000049130        C5AR1  protein_coding   
2        15169  ENSMUSG00000026365          CFH  protein_coding   

   feature_length      nnz  n_measured_obs       species  
0            4305   317588        42636109  mus_musculus  
1            2481   422091        42757016  mus_musculus  
2            3923  2085471    

0
1
2
