# Learning about the CZ CELLxGENE Census

## Opening the Census

In [1]:
import cellxgene_census

In [2]:
# Pin the Census release so that every re-run of this notebook reads the same data.
# Without `census_version`, open_soma() falls back to the floating "stable" release
# (2023-05-15 when this notebook was executed) and emits a warning asking for a pin.
census = cellxgene_census.open_soma(census_version="2023-05-15")

The "stable" release is currently 2023-05-15. Specify 'census_version="2023-05-15"' in future calls to open_soma() to ensure data consistency.


## Cell metadata

In [3]:
# Collect the column names of the human cell-metadata (obs) dataframe.
keys = [*census["census_data"]["homo_sapiens"].obs.keys()]

In [4]:
keys

['soma_joinid',
 'dataset_id',
 'assay',
 'assay_ontology_term_id',
 'cell_type',
 'cell_type_ontology_term_id',
 'development_stage',
 'development_stage_ontology_term_id',
 'disease',
 'disease_ontology_term_id',
 'donor_id',
 'is_primary_data',
 'self_reported_ethnicity',
 'self_reported_ethnicity_ontology_term_id',
 'sex',
 'sex_ontology_term_id',
 'suspension_type',
 'tissue',
 'tissue_ontology_term_id',
 'tissue_general',
 'tissue_general_ontology_term_id']

## Gene metadata

In [5]:
# Collect the column names of the gene-metadata (var) dataframe of the human RNA measurement.
keys = [*census["census_data"]["homo_sapiens"].ms["RNA"].var.keys()]

In [6]:
keys

['soma_joinid', 'feature_id', 'feature_name', 'feature_length']

In [7]:
# Load the Census-wide summary table (schema version, build date, cell and donor totals)
# into a pandas DataFrame.
census_info = (
    census["census_info"]["summary"]
    .read()
    .concat()
    .to_pandas()
)

In [8]:
census_info

Unnamed: 0,soma_joinid,label,value
0,0,census_schema_version,1.0.0
1,1,census_build_date,2023-05-15
2,2,dataset_schema_version,3.0.0
3,3,total_cell_count,57880760
4,4,unique_cell_count,36673205
5,5,number_donors_homo_sapiens,12493
6,6,number_donors_mus_musculus,1362


## Census summary content tables

### Cell counts by cell metadata

In [9]:
# Load the per-category cell-count summary table into a pandas DataFrame.
census_counts = (
    census["census_info"]["summary_cell_counts"]
    .read()
    .concat()
    .to_pandas()
)

In [10]:
census_counts

Unnamed: 0,soma_joinid,organism,category,ontology_term_id,unique_cell_count,total_cell_count,label
0,0,Homo sapiens,all,na,33758887,53794728,na
1,1,Homo sapiens,assay,EFO:0008722,264166,279635,Drop-seq
2,2,Homo sapiens,assay,EFO:0008780,25652,51304,inDrop
3,3,Homo sapiens,assay,EFO:0008913,133511,133511,single-cell RNA sequencing
4,4,Homo sapiens,assay,EFO:0008919,89477,206754,Seq-Well
...,...,...,...,...,...,...,...
1330,1330,Mus musculus,tissue_general,UBERON:0002113,179684,208324,kidney
1331,1331,Mus musculus,tissue_general,UBERON:0002365,15577,31154,exocrine gland
1332,1332,Mus musculus,tissue_general,UBERON:0002367,37715,130135,prostate gland
1333,1333,Mus musculus,tissue_general,UBERON:0002368,13322,26644,endocrine gland


### Example: cell metadata included in the summary counts table

In [11]:
# Count how many summary rows exist for each (organism, category) pair,
# without re-ordering the result by frequency.
census_counts.value_counts(subset=["organism", "category"], sort=False)

organism      category               
Homo sapiens  all                          1
              assay                       20
              cell_type                  604
              disease                     68
              self_reported_ethnicity     26
              sex                          3
              suspension_type              1
              tissue                     227
              tissue_general              61
Mus musculus  all                          1
              assay                        9
              cell_type                  226
              disease                      5
              self_reported_ethnicity      1
              sex                          3
              suspension_type              1
              tissue                      51
              tissue_general              27
dtype: int64

### Example: cell counts for each sequencing assay in human data

In [12]:
# Keep only the summary rows that describe sequencing assays in human data.
census_human_assays = census_counts.query(
    expr="organism == 'Homo sapiens' & category == 'assay'"
)

In [13]:
# Rank the human assays from the largest to the smallest total cell count.
census_human_assays.sort_values(by="total_cell_count", ascending=False)

Unnamed: 0,soma_joinid,organism,category,ontology_term_id,unique_cell_count,total_cell_count,label
11,11,Homo sapiens,assay,EFO:0009922,12559550,23296808,10x 3' v3
8,8,Homo sapiens,assay,EFO:0009899,7439891,12412039,10x 3' v2
15,15,Homo sapiens,assay,EFO:0011025,3904000,6292436,10x 5' v1
14,14,Homo sapiens,assay,EFO:0010550,4062980,5064268,sci-RNA-seq
9,9,Homo sapiens,assay,EFO:0009900,2835156,3116133,10x 5' v2
17,17,Homo sapiens,assay,EFO:0030003,744798,811422,10x 3' transcription profiling
18,18,Homo sapiens,assay,EFO:0030004,644182,770127,10x 5' transcription profiling
16,16,Homo sapiens,assay,EFO:0030002,625175,642559,microwell-seq
1,1,Homo sapiens,assay,EFO:0008722,264166,279635,Drop-seq
4,4,Homo sapiens,assay,EFO:0008919,89477,206754,Seq-Well


### Example: number of microglial cells in the Census

In [14]:
# Look up the summary rows labelled "microglial cell" (one per organism that has them).
census_counts.query(expr="label == 'microglial cell'")

Unnamed: 0,soma_joinid,organism,category,ontology_term_id,unique_cell_count,total_cell_count,label
70,70,Homo sapiens,cell_type,CL:0000129,264424,363182,microglial cell
1048,1048,Mus musculus,cell_type,CL:0000129,48998,62617,microglial cell


## Understanding Census contents beyond the summary tables

### Example: all cell types available in human

In [15]:
# Read the cell type and the primary-data flag of every human cell into pandas.
# No value filter is applied here, so duplicated (non-primary) cells are included.
human_cell_types = (
    census["census_data"]["homo_sapiens"]
    .obs.read(column_names=["cell_type", "is_primary_data"])
    .concat()
    .to_pandas()
)

In [16]:
human_cell_types

Unnamed: 0,cell_type,is_primary_data
0,myeloid cell,True
1,myeloid cell,True
2,fat cell,True
3,myeloid cell,True
4,fat cell,True
...,...,...
53794723,pericyte,True
53794724,pericyte,True
53794725,pericyte,True
53794726,pericyte,True


In [17]:
# Re-read the human cell types, this time letting the Census drop non-primary
# cells on the server side through `value_filter`.
human_cell_types = (
    census["census_data"]["homo_sapiens"]
    .obs.read(
        column_names=["cell_type"],
        value_filter="is_primary_data == True",
    )
    .concat()
    .to_pandas()
)

In [18]:
human_cell_types

Unnamed: 0,cell_type,is_primary_data
0,myeloid cell,True
1,myeloid cell,True
2,fat cell,True
3,myeloid cell,True
4,fat cell,True
...,...,...
33758882,pericyte,True
33758883,pericyte,True
33758884,pericyte,True
33758885,pericyte,True


In [19]:
# Keep only the cell_type column; the list selector keeps the result a one-column DataFrame.
human_cell_types = human_cell_types.loc[:, ["cell_type"]]

In [20]:
human_cell_types

Unnamed: 0,cell_type
0,myeloid cell
1,myeloid cell
2,fat cell
3,myeloid cell
4,fat cell
...,...
33758882,pericyte
33758883,pericyte
33758884,pericyte
33758885,pericyte


In [21]:
human_cell_types.shape

(33758887, 1)

In [22]:
# Tally the cells per distinct row of `human_cell_types`, most frequent first.
human_cell_type_counts = human_cell_types.value_counts(sort=True, ascending=False)

In [23]:
human_cell_type_counts

cell_type                                      
neuron                                             2682728
glutamatergic neuron                               1539105
CD4-positive, alpha-beta T cell                    1274999
CD8-positive, alpha-beta T cell                    1211278
classical monocyte                                  998008
                                                    ...   
microfold cell of epithelium of small intestine         19
mature conventional dendritic cell                      17
serous cell of epithelium of bronchus                   15
sperm                                                   11
type N enteroendocrine cell                             10
Length: 588, dtype: int64

In [24]:
human_cell_type_counts.shape

(588,)

### Example: cell types available in human liver

In [25]:
# Read the cell type of every primary human cell whose general tissue is liver.
human_liver_cell_types = (
    census["census_data"]["homo_sapiens"]
    .obs.read(
        column_names=["cell_type"],
        value_filter="is_primary_data == True and tissue_general == 'liver'",
    )
    .concat()
    .to_pandas()
)

In [26]:
human_liver_cell_types

Unnamed: 0,cell_type,is_primary_data,tissue_general
0,monocyte,True,liver
1,monocyte,True,liver
2,monocyte,True,liver
3,monocyte,True,liver
4,monocyte,True,liver
...,...,...,...
567322,dendritic cell,True,liver
567323,dendritic cell,True,liver
567324,dendritic cell,True,liver
567325,erythroid progenitor cell,True,liver


In [27]:
# Tally the liver cells per cell type, most frequent first.
human_liver_cell_types.loc[:, "cell_type"].value_counts()

T cell                                  86780
hepatoblast                             58447
neoplastic cell                         52431
erythroblast                            45605
monocyte                                34585
                                        ...  
epithelial cell of exocrine pancreas        1
enteroendocrine cell                        1
type I pneumocyte                           1
endocrine cell                              1
myelocyte                                   1
Name: cell_type, Length: 127, dtype: int64

### Example: diseased T cells in human tissues

In [28]:
# Cell-type labels of the T-cell populations used in the disease query.
t_cells_list = [
    "CD8-positive, alpha-beta T cell",
    "CD4-positive, alpha-beta T cell",
]

In [29]:
t_cells_list

['CD8-positive, alpha-beta T cell', 'CD4-positive, alpha-beta T cell']

In [30]:
# Read disease and general tissue for primary human cells whose cell type is in
# `t_cells_list` and whose disease is anything other than 'normal'.
# The Python list is interpolated into the filter text through its repr.
t_cells_diseased = (
    census["census_data"]["homo_sapiens"].obs
    .read(
        value_filter=f"is_primary_data == True and cell_type in {t_cells_list} and disease != 'normal'",
        column_names=["disease", "tissue_general"],
    )
    .concat()
    .to_pandas()
)

In [31]:
t_cells_diseased

Unnamed: 0,disease,tissue_general,is_primary_data,cell_type
0,breast cancer,breast,True,"CD4-positive, alpha-beta T cell"
1,breast cancer,breast,True,"CD4-positive, alpha-beta T cell"
2,breast cancer,breast,True,"CD4-positive, alpha-beta T cell"
3,breast cancer,breast,True,"CD4-positive, alpha-beta T cell"
4,breast cancer,breast,True,"CD4-positive, alpha-beta T cell"
...,...,...,...,...
1766292,COVID-19,blood,True,"CD4-positive, alpha-beta T cell"
1766293,COVID-19,blood,True,"CD4-positive, alpha-beta T cell"
1766294,COVID-19,blood,True,"CD4-positive, alpha-beta T cell"
1766295,COVID-19,blood,True,"CD4-positive, alpha-beta T cell"


In [32]:
# Count cells per (disease, tissue_general) pair without re-ordering by frequency.
# NOTE: this rebinds `t_cells_diseased` from the per-cell DataFrame to a Series of
# counts, so the query cell that builds the DataFrame must be re-run before this
# cell can be executed a second time.
t_cells_diseased = (
    t_cells_diseased[["disease", "tissue_general"]]
    .value_counts(sort=False)
)

In [33]:
t_cells_diseased

disease                                tissue_general    
B-cell non-Hodgkin lymphoma            blood                  62499
COVID-19                               blood                 834850
                                       lung                   30578
                                       nose                      13
                                       respiratory system         4
                                       saliva                    41
Crohn disease                          colon                  17490
                                       small intestine        52029
Down syndrome                          bone marrow              181
breast cancer                          breast                  1850
chronic obstructive pulmonary disease  lung                    9382
chronic rhinitis                       nose                     909
cystic fibrosis                        lung                       7
follicular lymphoma                    lymph node         

In [34]:
# Close the Census handle now that all queries are done.
census.close()
# Remove the name so that later cells cannot accidentally use the closed handle.
del census