# CellAssign Cell Type Assignment

Load the data

In [None]:
import os

def _require_env(name):
    """Return the value of the environment variable ``name``.

    Raises:
        ValueError: If the variable is unset or empty. Failing here gives a
            clear message instead of the ``TypeError`` that
            ``os.path.exists(None)`` would raise further down.
    """
    value = os.getenv(name)
    if not value:
        raise ValueError(f"Required environment variable is not set: {name}")
    return value


# All inputs are read from SNAKEMAKE_* environment variables.
marker_list = _require_env("SNAKEMAKE_MARKER_LIST")
if not os.path.exists(marker_list):
    raise FileNotFoundError(f"Marker list file does not exist: {marker_list}")
data = _require_env("SNAKEMAKE_H5AD_INPUT")
if not os.path.exists(data):
    raise FileNotFoundError(f"Input data file does not exist: {data}")
# Output csv mapping barcodes to cell type data.
# Validated up front: DataFrame.to_csv(None) returns the CSV as a string and
# writes no file, so a missing value would otherwise go unnoticed until the end.
output_file = _require_env("SNAKEMAKE_OUTPUT_FILE")
n_gpus = int(os.getenv("SNAKEMAKE_NUM_GPUS", "0"))  # Number of GPUs to use, default is 0 (CPU only)
if n_gpus < 0:
    raise ValueError(f"SNAKEMAKE_NUM_GPUS must be >= 0, got {n_gpus}")

print(f"Marker list file: {marker_list}")
print(f"Input data file: {data}")
print(f"Output file: {output_file}")
print(f"Number of GPUs: {n_gpus}")

In [None]:
import scanpy as sc
# Read the input .h5ad file (path taken from `data`) into `adata`.
adata = sc.read_h5ad(data)
# Bare expression: shows the object's summary as the cell output.
adata

In [None]:
import pandas as pd

# Read the marker table using the first CSV column as the row index, then
# append an all-zero "Unassigned" column as an extra cell type.
marker_df = pd.read_csv(marker_list, index_col=0).assign(Unassigned=0)
marker_df

Perform cell type assignment

In [None]:
import scvi
from scvi.external import CellAssign
# Fix the scvi-tools global seed so repeated runs use the same seed value.
scvi.settings.seed = 0
# Record the library version in the notebook output.
print("Last run with scvi-tools version:", scvi.__version__)

In [None]:
import numpy as np

# Per-cell library size, computed over all genes before subsetting to markers.
# A scipy sparse matrix returns an (n_cells, 1) np.matrix from .sum(axis=1);
# np.asarray(...).ravel() turns that (and the dense case) into a 1-D array so
# it can be stored as a single obs column.
lib_size = np.asarray(adata.X.sum(axis=1)).ravel()
adata.obs["size_factor"] = lib_size / np.mean(lib_size)
# Strip genes not in the marker list.
# marker_df was read with index_col=0, so the first CSV column is the index and
# is no longer reachable as marker_df['gene'] (that lookup raises KeyError).
# NOTE(review): assumes the first CSV column holds the gene names - confirm
# against the marker list format.
bdata = adata[:, adata.var_names.isin(marker_df.index)].copy()
bdata

In [None]:
# Register bdata with CellAssign, pointing it at the obs column "size_factor"
# for the per-cell size factors.
CellAssign.setup_anndata(bdata, size_factor_key="size_factor")
bdata

In [None]:
# Build the CellAssign model from the marker-restricted data and the marker table.
model = CellAssign(bdata, marker_df)
model.train(
    # `use_gpu` treats an int as a device *index*, not a device count, so
    # passing n_gpus=1 would select "cuda:1" and fail on a single-GPU machine.
    # A boolean selects the default GPU (True) or the CPU (False).
    # NOTE(review): newer scvi-tools releases replace `use_gpu` with
    # `accelerator`/`devices` - confirm against the pinned scvi-tools version.
    use_gpu=n_gpus > 0,
)

In [None]:
# Plot the validation ELBO stored in the training history.
validation_elbo = model.history['elbo_validation']
validation_elbo.plot()

Extract assignments

In [None]:
# One row per cell, one column per cell type.
predictions = model.predict()
# predict() returns a frame with a plain positional index; label the rows with
# the cell barcodes so the saved CSV maps barcodes to cell types rather than
# row numbers.
# NOTE(review): assumes rows come back in bdata's cell order - confirm against
# the scvi-tools version in use.
predictions.index = bdata.obs_names
predictions.head()

In [None]:
import seaborn as sns
# Clustered heatmap of the prediction table (rows and columns of `predictions`).
sns.clustermap(predictions, cmap="viridis")

In [None]:
# Hard-assign each cell to the column with the highest value.
# Restricting to numeric columns keeps this cell re-runnable once the string
# 'cell_type' column has been added.
predictions['cell_type'] = predictions.select_dtypes(include="number").idxmax(axis=1)
# `predictions` has no "size_factor" column, so build a small plotting frame
# that pairs each cell's assigned type with its size factor from bdata.obs.
# .to_numpy() aligns by position, independent of either frame's index.
size_factor_by_type = pd.DataFrame(
    {
        "cell_type": predictions["cell_type"].to_numpy(),
        "size_factor": bdata.obs["size_factor"].to_numpy(),
    }
)
sns.barplot(x="cell_type", y="size_factor", data=size_factor_by_type)

In [None]:
# Number of cells assigned to each cell type.
cell_type_counts = predictions['cell_type'].value_counts()
cell_type_counts

Save the results

In [None]:
# Write the prediction table, including its index, to the configured output path.
predictions.to_csv(path_or_buf=output_file, index=True)