# load and import

In [2]:
import pandas as pd
import numpy as np
import scanpy as sc
from scib_metrics.benchmark import Benchmarker


In [None]:
# NOTE(review): `scPrint` is not imported in the import cell shown in this
# notebook, and `model` is not used by any later cell visible here. The
# recorded output of this cell reports a RuntimeError. Presumably a leftover
# from another notebook — confirm, then either drop the cell or add the import.
model = scPrint.load_from_checkpoint('../data/temp/o2uniqsx/epoch=18-step=133000.ckpt', precpt_gene_emb=None)

RuntimeError caught: scPrint is not attached to a `Trainer`.


## Loading the data

In [2]:
# Pancreas atlas; `backup_url` is the download location used when the local
# file is not present.
# NOTE(review): the following cell rebinds `adata` to the lung atlas, so on a
# top-to-bottom run this load is discarded — keep only the dataset you want.
adata = sc.read(
    "data/pancreas_atlas.h5ad",
    backup_url="https://figshare.com/ndownloader/files/24539828",
)


In [3]:
# Lung atlas; `backup_url` is the download location used when the local file
# is not present.
# NOTE(review): this rebinds `adata`, replacing whatever an earlier cell
# loaded under the same name.
adata = sc.read(
    "data/lung_atlas.h5ad",
    backup_url="https://figshare.com/ndownloader/files/24539942",
)

In [4]:
# Put the 'counts' layer into `.X` and drop the now-redundant layer.
# Guarded so that re-running this cell after the layer has already been
# removed is a no-op instead of a KeyError.
if 'counts' in adata.layers:
    adata.X = adata.layers['counts']
    del adata.layers['counts']

In [6]:
# Keep the 5000 most variable genes. `subset=True` filters `adata` in place,
# so every later cell sees the reduced gene set.
# NOTE(review): flavor="seurat_v3" expects raw counts in `.X` — confirm that
# the 'counts' layer really holds unnormalized counts.
sc.pp.highly_variable_genes(
    adata, flavor="seurat_v3", n_top_genes=5000, subset=True
)



# Convert to the scFoundation input format


In [7]:
def main_gene_selection(X_df, gene_list):
    """
    Describe:
        Align an expression matrix to a target gene list. Genes from
        `gene_list` that are missing in `X_df` are added as all-zero columns,
        and the result is reordered / restricted to `gene_list`.
    Parameters:
        X_df->`pandas.DataFrame`: cells in rows, genes in columns
            (column labels are gene names)
        gene_list->list: wanted target genes, in the desired column order
    Returns:
        X_df->`pandas.DataFrame`: same index as the input, columns == gene_list
        to_fill_columns->list: zero-padded genes, in `gene_list` order
        var->`pandas.DataFrame`: indexed by the output columns, with a `mask`
            column that is 1 for zero-padded genes and 0 otherwise
    """
    present_genes = set(X_df.columns)
    # dict.fromkeys de-duplicates while keeping `gene_list` order, so the
    # returned padding list is deterministic (a plain set difference is not).
    to_fill_columns = [
        gene for gene in dict.fromkeys(gene_list) if gene not in present_genes
    ]

    padding_df = pd.DataFrame(
        np.zeros((X_df.shape[0], len(to_fill_columns))),
        columns=to_fill_columns,
        index=X_df.index,
    )
    X_df = pd.DataFrame(
        np.concatenate([X_df.values, padding_df.values], axis=1),
        index=X_df.index,
        columns=list(X_df.columns) + list(padding_df.columns),
    )
    # Reorder to the target gene order; genes not in `gene_list` are dropped.
    X_df = X_df[gene_list]

    # Set membership keeps the mask construction linear in the number of genes.
    padded_genes = set(to_fill_columns)
    var = pd.DataFrame(index=X_df.columns)
    var['mask'] = [1 if gene in padded_genes else 0 for gene in var.index]
    return X_df, to_fill_columns, var

In [23]:
# (n_cells, n_genes) of the currently loaded atlas.
adata.shape

(16382, 5000)

In [8]:
# Read the target gene list (`gene_name` column of the scFoundation index
# file) and align the expression matrix to it: cells in rows, genes in
# columns, genes absent from `adata` zero-filled.
# NOTE(review): `adata` was reduced to 5000 highly variable genes earlier, so
# most of the columns requested here will be zero padding — confirm intended.
gene_list_df = pd.read_csv('../tools/scFoundation/OS_scRNA_gene_index.19264.tsv', header=0, delimiter='\t')
gene_list = list(gene_list_df['gene_name'])
X_df, to_fill_columns, var = main_gene_selection(adata.to_df(), gene_list)
# Write the aligned matrix to disk as CSV; this path is what the
# get_embedding.py call in this notebook receives as --data_path.
X_df.to_csv('/tmp/scfoundation_data.csv')

In [17]:
# Download the model checkpoint (models.ckpt) first from:
# https://hopebio2020-my.sharepoint.com/:f:/g/personal/dongsheng_biomap_com/Eh22AX78_AVDv6k6v4TZDikBXt33gaWXaz27U9b1SldgbA
# then move it into scFoundation's model/models folder.
# NOTE(review): `mv` fails on a re-run once the file has already been moved.
! mv ../tools/models.ckpt ../tools/scFoundation/model/models/

In [None]:
# %pip installs into the environment of the running kernel (a bare `!pip` may
# target a different Python). `argparse` is part of the standard library, so
# it is not installed from PyPI.
%pip install local_attention

In [27]:
# Create the output directory that get_embedding.py receives as --save_path.
! mkdir -p /tmp/mapping

In [9]:
# Run scFoundation's get_embedding.py on /tmp/scfoundation_data.csv and write
# the cell embeddings under /tmp/mapping/.
# NOTE(review): `--pre_normalized F` presumably tells the script the input is
# not yet normalized — verify the flag semantics against the scFoundation README.
! cd ../tools/scFoundation/model && python get_embedding.py --task_name mapping --input_type singlecell --output_type cell --pool_type all --tgthighres t4.5 --data_path /tmp/scfoundation_data.csv --save_path /tmp/mapping/ --pre_normalized F --version rde

(32472, 19264)
{'mask_gene_name': False, 'gene_num': 19266, 'seq_len': 19266, 'encoder': {'hidden_dim': 768, 'depth': 12, 'heads': 12, 'dim_head': 64, 'seq_len': 19266, 'module_type': 'transformer', 'norm_first': False}, 'decoder': {'hidden_dim': 512, 'depth': 6, 'heads': 8, 'dim_head': 64, 'module_type': 'performer', 'seq_len': 19266, 'norm_first': False}, 'n_class': 104, 'pad_token_id': 103, 'mask_token_id': 102, 'bin_num': 100, 'bin_alpha': 1.0, 'rawcount': True, 'model': 'mae_autobin', 'test_valid_train_idx_dict': '/nfs_beijing/minsheng/data/os10000w-new/global_shuffle/meta.csv.train_set_idx_dict.pt', 'valid_data_path': '/nfs_beijing/minsheng/data/valid_count_10w.npz', 'num_tokens': 13, 'train_data_path': None, 'isPanA': False, 'isPlanA1': False, 'max_files_to_load': 5, 'bin_type': 'auto_bin', 'value_mask_prob': 0.3, 'zero_mask_prob': 0.03, 'replace_prob': 0.8, 'random_token_prob': 0.1, 'mask_ignore_token_ids': [0], 'decoder_add_zero': True, 'mae_encoder_max_seq_len': 15000, 'isPla

In [29]:
# List the file(s) written to the --save_path directory.
ls /tmp/mapping/

mapping_01B-resolution_singlecell_cell_embedding_t4.5_resolution.npy


In [5]:
# Load the embedding matrix saved under /tmp/mapping/ and display its shape.
res = np.load('/tmp/mapping/mapping_01B-resolution_singlecell_cell_embedding_t4.5_resolution.npy')
res.shape


(32472, 3072)

In [6]:
# Attach the embedding to the cells.
# assumes the rows of `res` are in the same order as `adata.obs` (the input
# CSV was written from adata.to_df()) — TODO confirm get_embedding.py keeps row order
adata.obsm['scFoundation'] = res

In [7]:
# Reduce the embedding to 512 principal components before benchmarking.
# Computed from `res` rather than from adata.obsm['scFoundation'], so the cell
# is idempotent: reading and overwriting the same key would run PCA on the
# already-reduced matrix if the cell were executed a second time.
adata.obsm['scFoundation'] = sc.pp.pca(res, n_comps=512, svd_solver='arpack', random_state=0)


In [8]:
# Score the 'scFoundation' embedding with the scib-metrics Benchmarker.
# The inline comments list alternative obs column names — presumably for a
# different atlas; verify against adata.obs before switching datasets.
# NOTE(review): n_jobs=20 is hard-coded; adjust to the machine's core count.
bm = Benchmarker(
    adata,
    batch_key="batch",  # alternatives: batch, tech
    label_key="cell_type",  # alternative: celltype
    embedding_obsm_keys=["scFoundation"],
    n_jobs=20,
)
bm.benchmark()

Computing neighbors: 100%|██████████| 1/1 [00:33<00:00, 33.35s/it]


In [7]:
# Show the metric table with min-max scaling turned off.
bm.get_results(min_max_scale=False)

Unnamed: 0_level_0,Isolated labels,KMeans NMI,KMeans ARI,Silhouette label,cLISI,Silhouette batch,iLISI,KBET,Graph connectivity,PCR comparison,Batch correction,Bio conservation,Total
Embedding,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
scFoundation,0.60987,0.493447,0.26954,0.541061,0.998018,0.853688,0.009817,0.178943,0.825811,0.402781,0.454208,0.582387,0.531116
Metric Type,Bio conservation,Bio conservation,Bio conservation,Bio conservation,Bio conservation,Batch correction,Batch correction,Batch correction,Batch correction,Batch correction,Aggregate score,Aggregate score,Aggregate score
