In [None]:
import os

# Project root directory used to build every data path in this notebook.
# Defaults to the original author's local checkout; set the
# COMPLEMENT_RECEPTOR_BLOCKADE_DIR environment variable to run the notebook
# on another machine without editing this cell.
# The default keeps its trailing slash unchanged; the cells that build paths
# as f"{PDIR}/..." therefore produce a harmless double slash.
PDIR = os.environ.get(
    "COMPLEMENT_RECEPTOR_BLOCKADE_DIR",
    '/Users/aumchampaneri/VSCode Projects/complement-receptor-blockade/',
)

# Download data from cellxgene census

> This section only needs to be run once, for the initial data download. Skip it on all subsequent runs to avoid import incompatibility errors.

In [None]:
# Install cellxgene-census if it is not already installed (run in a terminal or in this notebook).
# In a notebook, prefer the %pip magic so the package lands in the running kernel's environment:
# %pip install -U cellxgene-census  # Uncomment if needed

In [None]:
import cellxgene_census

# Open a handle to the Census, requesting the release tagged "latest".
# NOTE(review): "latest" is a moving target; pin an explicit census_version
# if the exact metadata shown here must be reproducible — confirm with the
# cellxgene-census documentation.
census = cellxgene_census.open_soma(census_version="latest")

# Get all dataset citations from census. The table is materialized as a
# pandas DataFrame, so it stays usable after the census handle is closed.
datasets = census["census_info"]["datasets"].read().concat().to_pandas()

# Census summary table. Kept as the LAST expression of the cell so that the
# notebook actually renders it; in the middle of the cell its value was
# computed and silently discarded.
census["census_info"]["summary"].read().concat().to_pandas()

## Fetch AnnData from Census

In [None]:
import os

# Folder that will receive the raw download; creating it is a no-op when it
# already exists.
raw_data_dir = f"{PDIR}/lupus-pbmc/raw-data"
os.makedirs(raw_data_dir, exist_ok=True)

In [None]:
# Identifier of the dataset to fetch. Source file:
# https://datasets.cellxgene.cziscience.com/218acb0f-9f2f-4f76-b90b-15a4b7c7f629.h5ad
dataset_id = "218acb0f-9f2f-4f76-b90b-15a4b7c7f629"

# Download the dataset's source H5AD into the raw-data folder, with a
# progress bar.
cellxgene_census.download_source_h5ad(
    dataset_id,
    to_path=f"{PDIR}/lupus-pbmc/raw-data/cxg_lupus-pbmc.h5ad",
    progress_bar=True,
)

# Release the census handle now that the download call has returned.
census.close()

## Fetch citation for dataset

In [None]:
# Show the first row of the census `datasets` table whose dataset_id matches
# the downloaded dataset (the full metadata record, not just one string).
datasets.loc[datasets["dataset_id"].eq(dataset_id)].iloc[0]

# Prepare data for Geneformer

#### The input data for Geneformer is single-cell RNA-seq (scRNA-seq) data: a raw count matrix stored in .loom or .h5ad format. If the data is in another format (e.g., .csv, .mtx), it should be converted to .loom or .h5ad using Scanpy or AnnData tools.

#### No normalization, transformation, or feature selection should be performed. Removal of cells per standard quality controls (e.g., removal of empty droplets, damaged cells, doublets, etc.) is recommended.

In [None]:
import scanpy as sc

# Read the raw H5AD written by the download step into `adata`.
raw_h5ad_path = f"{PDIR}/lupus-pbmc/raw-data/cxg_lupus-pbmc.h5ad"
adata = sc.read_h5ad(raw_h5ad_path)

In [None]:
# Display the summary repr of the object loaded by sc.read_h5ad.
adata

In [None]:
# Display the .obs table; the cells below read its "disease", "cell_type"
# and "development_stage" columns.
adata.obs

In [None]:
# Display the .var table; its index is what a later cell copies into the
# "ensembl_id" column.
adata.var

#### The genes must be labeled as Ensembl IDs. If your dataset includes only gene symbols instead, you can use external tools like Ensembl Biomart or MyGeneInfo to convert them.

In [None]:
# Expose the .var index as an explicit "ensembl_id" column.
# NOTE(review): assumes the index already holds Ensembl gene IDs — confirm
# against the .var table displayed above.
adata.var["ensembl_id"] = adata.var_names

#### The data may contain additional feature labels that one would like to retain in the tokenized dataset, for example to use as labels for embeddings or fine-tuning, or to confirm balancing of attributes across data splits.

In [None]:
# Distinct disease labels in the data. Expected values:
#   - systemic lupus erythematosus
#   - normal (control)
{label for label in adata.obs["disease"]}

In [None]:
# Distinct cell-type annotations, as provided by the original authors.
{label for label in adata.obs["cell_type"]}

In [None]:
# Reformat the developmental stage / age attribute.

# Values BEFORE reformatting. Printed explicitly: only the last expression of
# a cell is rendered automatically, so a bare `set(...)` here would be
# computed and then discarded.
print(set(adata.obs["development_stage"]))

# Keep only the first two characters of every label (e.g., "Adult" -> "Ad"),
# using the vectorized string accessor instead of a Python-level loop.
# NOTE(review): if the labels start with an age (e.g. "25-year-old ..."), this
# keeps a two-digit age, but one- or three-digit ages would be truncated
# incorrectly — confirm against the values printed above.
adata.obs["development_stage"] = adata.obs["development_stage"].str[:2]

# Sanity check: distinct values AFTER reformatting (rendered as cell output).
set(adata.obs["development_stage"])

## Save processed AnnData object

In [None]:
import os

# Folder that will receive the prepared AnnData file; creating it is a no-op
# when it already exists.
input_data_dir = f"{PDIR}/lupus-pbmc/input-data"
os.makedirs(input_data_dir, exist_ok=True)

In [None]:
# Save prepared AnnData object for Geneformer tokenization.
# The path previously read f"{PDIR}}lupus-pbmc/...": the stray "}" is a
# SyntaxError inside an f-string, so this cell could never run. The separator
# now matches the f"{PDIR}/lupus-pbmc/input-data" folder created just before.
adata.write_h5ad(f"{PDIR}/lupus-pbmc/input-data/cxg_lupus-pbmc_prepared.h5ad")