# Reference mapping using cell embeddings from a pretrained scGPT model

In [None]:
# Imports
import sys
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import mode
import scanpy as sc
import sklearn
import warnings
sys.path.insert(0, "../")
import scgpt as scg

# Optional dependency: faiss provides the similarity search used further down.
# `faiss_imported` records whether the import succeeded.
try:
    import faiss
except ImportError:
    faiss_imported = False
    print(
        "faiss not installed! We highly recommend installing it for fast similarity search."
    )
    print("To install it, see https://github.com/facebookresearch/faiss/wiki/Installing-Faiss")
else:
    faiss_imported = True

# Silence ResourceWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=ResourceWarning)

In [None]:
# User configuration: edit the locations below to match your own setup.

# Directory handed to scg.tasks.embed_data as the scGPT model location.
model_dir = Path("/data1/projects/pi-drukkerm/herbermannc/practicedata/scGPT_human")

# Directory handed to load_index when the faiss index is loaded.
index_dir = "/data1/projects/pi-drukkerm/herbermannc/practicedata/faiss_index/"

# Value passed as `gene_col` to scg.tasks.embed_data.
gene_col = "index"

# Query dataset, read from an .h5ad file.
adata = sc.read_h5ad(
    "/data1/projects/pi-drukkerm/herbermannc/practicedata/CFS_all_days_rawcount.h5ad"
)

In [None]:
# Embed the query cells with the pretrained scGPT model found in `model_dir`.
# The result is bound to `adata_embed`; the similarity-search cell reads its `.X`.
adata_embed = scg.tasks.embed_data(
    adata, model_dir, gene_col=gene_col, batch_size=64, return_new_adata=True
)

In [None]:
# Load the reference faiss index and its per-cell labels from `index_dir`.
# `vote` is imported here and used by the similarity-search cell.
from build_atlas_index_faiss import load_index
from build_atlas_index_faiss import vote

index, meta_labels = load_index(index_dir, use_config_file=True, use_gpu=True)
print(f"Loaded index with {index.ntotal} cells")

In [None]:
# Apply similarity search: look up the nearest reference cells for every query
# cell and derive one label per query cell from the labels of those neighbours.
import time

from tqdm import tqdm

# Keep `adata_embed` bound to the object returned by embed_data and put the
# embedding matrix in its own name, so this cell can be re-run without failing
# on a second `.X` lookup. faiss expects a C-contiguous float32 matrix.
query_embed = np.ascontiguousarray(adata_embed.X, dtype=np.float32)

k = 50  # number of neighbours requested from the index per query cell

# Timed with the standard library: an IPython cell magic such as `%%time` is
# only recognised on the very first line of a cell.
search_start = time.perf_counter()
distances, idx = index.search(query_embed, k)

# Labels of the retrieved neighbours, one row per query cell.
predict_labels = meta_labels[idx]

# One entry per query cell: element 0 of what `vote` returns for that row.
voting = np.array(
    [vote(preds, return_prob=False)[0] for preds in tqdm(predict_labels)]
)
print(f"Similarity search and voting took {time.perf_counter() - search_start:.2f} s")

In [None]:
# Store the voted label of every cell in the `celltype` column of adata.obs.
adata.obs["celltype"] = voting

In [None]:
# Dimensionality reduction and plotting
# Build the neighbour graph on the scGPT embedding rather than on adata.X,
# which is the matrix loaded from the input file (its name suggests raw counts).
# `adata_embed` may be the AnnData returned by embed_data or the bare embedding
# matrix, so fall back to the object itself when it has no `.X`.
scgpt_embedding = getattr(adata_embed, "X", adata_embed)
# Assumes the embedding rows are in the same order as the cells in adata,
# as the celltype assignment already does - TODO confirm.
adata.obsm["X_scGPT"] = np.asarray(scgpt_embedding)

sc.pp.neighbors(adata, use_rep="X_scGPT")
sc.tl.umap(adata)
sc.pl.umap(adata, color="celltype", frameon=False, wspace=0.4)