In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix,ConfusionMatrixDisplay
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import umap
import pandas as pd
import numpy as np
import logging
import torch
from helical.utils import get_anndata_from_hf_dataset
from datasets import load_dataset

# Keep library log chatter out of the cell outputs: only ERROR and above
# get through the root logger.
logging.root.setLevel(logging.ERROR)

# Silence every Python warning for the rest of the session.
warnings.simplefilter("ignore")

# Import Geneformer & UCE from the Helical package
from helical.models.geneformer import Geneformer,GeneformerConfig
from helical.models.uce import UCE, UCEConfig

INFO:datasets:PyTorch version 2.6.0 available.


In [2]:
import anndata as ad

# Location of the dataset file, relative to the notebook's working directory.
h5ad_path = "../fb338c4d-e63a-4b17-abd6-1032a66c8886.h5ad"

# Read the .h5ad file into an AnnData object.
ann_data = ad.read_h5ad(h5ad_path)

In [3]:
# Display the AnnData summary: matrix dimensions plus the obs/var/uns/obsm keys.
ann_data

AnnData object with n_obs × n_vars = 422220 × 33105
    obs: 'Ethnicity', 'BMI', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'Age_group', 'COVID_severity', 'COVID_status', 'Group', 'Smoker', 'sample_id', 'sequencing_library', 'Protein_modality_weight', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'donor_id', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'name', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'antibody_X', 'antibody_features', 'citation', 'default_embedding', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_umap_aft

In [4]:
# Restore the matrix stored in .raw, shuffle the cells, and split them into
# training and test sets (both kept as AnnData objects).
from sklearn.model_selection import train_test_split

SEED = 42            # shared by the shuffle and the split, for reproducibility
TEST_FRACTION = 0.2  # 80% train / 20% test

# Replace .X with the matrix kept in .raw.
# NOTE(review): presumably .raw holds the unnormalised counts the model expects —
# confirm for this dataset.
if ann_data.raw is None:
    raise ValueError(
        "ann_data.raw is not set; cannot restore the raw matrix into ann_data.X"
    )
ann_data.X = ann_data.raw.X.copy()

# Shuffle the AnnData object by its observations (cells).
shuffled_obs_names = ann_data.obs.sample(frac=1, random_state=SEED).index
ann_data = ann_data[shuffled_obs_names]

# train_test_split slices the AnnData directly, so both parts come back as views.
# Materialise them so that later in-place edits (e.g. adding .var columns) act on
# independent objects instead of triggering an implicit copy of a view.
X_train, X_test = (
    split.copy()
    for split in train_test_split(
        ann_data, test_size=TEST_FRACTION, random_state=SEED
    )
)

# Sanity check: the two splits together account for every cell.
assert X_train.n_obs + X_test.n_obs == ann_data.n_obs

# X_train / X_test keep the full AnnData structure (obs, var, uns, obsm).
X_train

View of AnnData object with n_obs × n_vars = 337776 × 33105
    obs: 'Ethnicity', 'BMI', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'Age_group', 'COVID_severity', 'COVID_status', 'Group', 'Smoker', 'sample_id', 'sequencing_library', 'Protein_modality_weight', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'donor_id', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'name', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'antibody_X', 'antibody_features', 'citation', 'default_embedding', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_

In [5]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the Geneformer wrapper from its configuration object.
model_config = GeneformerConfig(device=device, batch_size=10)
geneformer = Geneformer(configurer=model_config)

In [6]:
# Helical's `process_data` pre-processes an AnnData object for the model.
# It is told which `.var` column holds the gene identifiers, so that column is
# created from the var index on both splits first.
# NOTE(review): assumes var_names are Ensembl gene IDs — confirm for this dataset.
gene_id_column = "ensembl_id"

for adata_split in (X_train, X_test):
    adata_split.var[gene_id_column] = adata_split.var_names

# Pre-process each split; the training split goes first.
train_dataset = geneformer.process_data(X_train, gene_names=gene_id_column)
test_dataset = geneformer.process_data(X_test, gene_names=gene_id_column)

100%|█████████████████████████████████████████| 660/660 [03:48<00:00,  2.89it/s]


Map:   0%|          | 0/337776 [00:00<?, ? examples/s]

100%|█████████████████████████████████████████| 165/165 [00:57<00:00,  2.86it/s]


Map:   0%|          | 0/84444 [00:00<?, ? examples/s]

In [None]:
# Embed both processed splits with the same model, training split first.
# The training-split embeddings are kept as the reference set.
ref_embeddings, test_embeddings = (
    geneformer.get_embeddings(dataset)
    for dataset in (train_dataset, test_dataset)
)

  0%|          | 0/33778 [00:00<?, ?it/s]

In [None]:
# Project the reference (training) embeddings to 2-D with UMAP and plot them,
# once without and once with the cell-type labels.
UMAP_SEED = 42  # fixed so the layout is the same on every run

# NOTE: umap-learn runs single-threaded when random_state is set; drop the seed
# if speed matters more than a reproducible layout.
reducer = umap.UMAP(
    min_dist=0.2,
    n_components=2,
    n_epochs=None,
    n_neighbors=3,
    random_state=UMAP_SEED,
)
mapper = reducer.fit(ref_embeddings)

plot_df = pd.DataFrame(mapper.embedding_, columns=["px", "py"])
labels = X_train.obs["annotation_detailed"]
# Labels are attached by position, so there must be exactly one per embedded cell.
assert len(labels) == len(plot_df), "embedding rows and cell labels are misaligned"
plot_df["Cell Type"] = labels.values

# Two side-by-side panels sharing the same coordinates.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: unlabelled points. `palette` only takes effect together with `hue`
# and `sizes` only together with `size`, so neither is passed here.
sns.scatterplot(data=plot_df, x="px", y="py", ax=axs[0])
axs[0].set_title("UMAP of Reference Data without labels")

# Right panel: the same points coloured by cell type.
# NOTE(review): the "pastel" palette has 10 distinct colours; if
# 'annotation_detailed' has more levels, colours will repeat — consider a larger palette.
sns.scatterplot(
    data=plot_df,
    x="px",
    y="py",
    hue="Cell Type",
    palette="pastel",
    ax=axs[1],
)
# Trailing semicolon keeps the Text(...) repr out of the cell output.
axs[1].set_title("UMAP of Reference Data with labels");