In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix,ConfusionMatrixDisplay
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import umap
import pandas as pd
import numpy as np
import logging
import torch
from helical.utils import get_anndata_from_hf_dataset
from datasets import load_dataset

# Quiet the notebook output: the root logger only reports errors, and every
# Python warning is ignored for the rest of the session.
root_logger = logging.getLogger()
root_logger.setLevel(logging.ERROR)

warnings.simplefilter("ignore")

# Import Geneformer & UCE from the Helical package
from helical.models.geneformer import Geneformer,GeneformerConfig
from helical.models.uce import UCE, UCEConfig

In [None]:
import anndata as ad

# Input .h5ad file, given relative to the notebook's working directory.
H5AD_PATH = "../fb338c4d-e63a-4b17-abd6-1032a66c8886.h5ad"

ann_data = ad.read_h5ad(H5AD_PATH)

In [None]:
# Bare last expression: the notebook renders the loaded object's repr as the cell output.
ann_data

In [None]:
from sklearn.model_selection import train_test_split

# One seed for both the shuffle and the split so the partition is reproducible.
SPLIT_SEED = 42
# Fraction of cells held out for testing (80% train, 20% test).
TEST_FRACTION = 0.2

# `.raw` is optional on an AnnData object; fail with a clear message instead of
# an AttributeError on None.
if ann_data.raw is None:
    raise ValueError("ann_data.raw is not set; cannot restore X from the raw matrix.")

# Replace X with the matrix stored in `.raw`.
# NOTE(review): presumably these are the unnormalised counts the model expects — confirm.
ann_data.X = ann_data.raw.X.copy()

# Shuffle the cells (observations). The result is bound to a NEW name on
# purpose: rebinding `ann_data` to its own shuffled view made this cell
# non-idempotent, because re-running it re-shuffled the already-shuffled data
# and produced a different split each time.
shuffled_adata = ann_data[ann_data.obs.sample(frac=1, random_state=SPLIT_SEED).index]

# Split row positions rather than the AnnData object itself. With the same
# number of rows, test size and seed this selects the same cells as splitting
# the object directly, but it does not depend on scikit-learn being able to
# index an AnnData, and the explicit copies give two independent objects
# instead of views of a view (which are copied implicitly on first write).
train_idx, test_idx = train_test_split(
    np.arange(shuffled_adata.n_obs),
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)
X_train = shuffled_adata[train_idx].copy()
X_test = shuffled_adata[test_idx].copy()

# X_train and X_test are full AnnData objects (obs/var annotations included).
X_train

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the Geneformer wrapper from a config carrying the batch size and device.
model_config = GeneformerConfig(device=device, batch_size=10)
geneformer = Geneformer(configurer=model_config)

In [None]:
# Pre-process both splits with Geneformer's `process_data`, which takes an
# AnnData object plus the name of the `.var` column holding the gene
# identifiers. The variable names are exposed under that column first.
# NOTE(review): assumes var_names already are Ensembl IDs — confirm.
gene_id_column = "ensembl_id"
for split_adata in (X_train, X_test):
    split_adata.var[gene_id_column] = split_adata.var_names

train_dataset = geneformer.process_data(X_train, gene_names=gene_id_column)
test_dataset = geneformer.process_data(X_test, gene_names=gene_id_column)

In [None]:
# Embed both processed splits with the same Geneformer instance. The training
# embeddings are kept as the reference set (they are what UMAP is fitted on
# later in the notebook); the test embeddings are kept separately.
ref_embeddings = geneformer.get_embeddings(train_dataset)
test_embeddings = geneformer.get_embeddings(test_dataset)

In [None]:
# Fit a 2-D UMAP on the reference (training) embeddings. UMAP is stochastic,
# so a fixed random_state is passed to make the layout reproducible across
# kernel restarts.
UMAP_SEED = 42
reducer = umap.UMAP(
    min_dist=0.2,
    n_components=2,
    n_epochs=None,
    n_neighbors=3,
    random_state=UMAP_SEED,
)
mapper = reducer.fit(ref_embeddings)

# Labels are attached to the embedding purely by position, so the two must have
# the same number of rows; check that explicitly instead of failing (or
# silently misaligning) inside pandas.
labels = X_train.obs['annotation_detailed']
if len(labels) != len(mapper.embedding_):
    raise ValueError(
        f"Got {len(mapper.embedding_)} embeddings but {len(labels)} cell labels; "
        "they must match row for row."
    )

plot_df = pd.DataFrame(mapper.embedding_, columns=['px', 'py'])
plot_df['Cell Type'] = labels.values

# Two panels side by side: the same layout without and with cell-type colouring.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel has no hue variable, so no palette is passed (seaborn ignores it
# without `hue`). `sizes=` was dropped from both calls because it only takes
# effect together with a `size=` variable.
sns.scatterplot(data=plot_df, x='px', y='py', ax=axs[0])
axs[0].set_title('UMAP of Reference Data without labels')

sns.scatterplot(data=plot_df, x='px', y='py', hue='Cell Type', ax=axs[1], palette="pastel")
axs[1].set_title('UMAP of Reference Data with labels')

# Readable axis labels instead of the raw column names.
for panel in axs:
    panel.set(xlabel='UMAP 1', ylabel='UMAP 2')

fig.tight_layout()
plt.show()