In [1]:
import TOSICA
import scanpy as sc
import numpy as np
import pandas as pd
import pickle
from time import time
from scipy.stats import spearmanr, gamma, poisson
from anndata import AnnData, read_h5ad
from scanpy import read
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import tensor
from torch.cuda import is_available
from scMMT.scMMT_API import scMMT_API
from sklearn.metrics import f1_score, accuracy_score
import datetime
import warnings 
warnings.filterwarnings ("ignore")



Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [None]:
# Model keys dispatched on by train_model().
models = ["tosica", "scmmt"]
# Dataset keys dispatched on by get_dataset().
datasets = ["tos", "pmbc"]

In [None]:
# One shared seed for every random number generator the notebook relies on.
seed = 5

# NumPy's global generator.
np.random.seed(seed)

# PyTorch: the default generator, the current CUDA device, then all CUDA
# devices (the last call matters when more than one GPU is in use).
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [2]:
def get_dataset(dataset_name, max_cells=1000, train_donors=('P1', 'P3', 'P4', 'P7')):
    """Load the train/test split of one benchmark dataset.

    Parameters
    ----------
    dataset_name : str
        ``"tos"`` reads ``demo_train.h5ad`` / ``demo_test.h5ad`` from
        ``data/tosica/``. ``"pmbc"`` (key spelling kept as callers use it)
        reads ``pbmc_gene.h5ad`` / ``pbmc_protein.h5ad`` from ``data/pbmc/``.
    max_cells : int or None, optional
        ``"pmbc"`` only: keep just the first ``max_cells`` cells of both
        modalities. ``None`` keeps every cell. Defaults to 1000.
    train_donors : sequence of str, optional
        ``"pmbc"`` only: values of ``obs['donor']`` that go into the training
        split; all remaining donors form the test split.

    Returns
    -------
    train, test, data_path
        For ``"tos"``, ``train`` and ``test`` are AnnData objects. For
        ``"pmbc"``, each is a two-element list ``[gene, protein]``.
        ``data_path`` is the directory the files were read from, with a
        trailing slash.

    Raises
    ------
    ValueError
        If ``dataset_name`` is not one of the supported keys.
    """
    data_root = "data/"

    if dataset_name == "tos":
        data_path = data_root + "tosica/"
        train = sc.read(data_path + 'demo_train.h5ad')
        test = sc.read(data_path + 'demo_test.h5ad')
        # Give the query set the reference set's genes, in the same order.
        test = test[:, train.var_names]

    elif dataset_name == "pmbc":
        data_path = data_root + "pbmc/"
        adata_gene = sc.read(data_path + "pbmc_gene.h5ad")
        adata_protein = sc.read(data_path + "pbmc_protein.h5ad")

        # Explicit copies: assigning .X on a sliced view would otherwise
        # trigger an implicit copy.
        adata_gene = adata_gene[:max_cells].copy()
        adata_protein = adata_protein[:max_cells].copy()

        # Densify only when the matrix is actually sparse.
        for modality in (adata_gene, adata_protein):
            if hasattr(modality.X, "toarray"):
                modality.X = modality.X.toarray()

        sc.pp.normalize_total(adata_protein)
        sc.pp.log1p(adata_protein)

        # Scale the protein data separately within each donor.
        donors = adata_protein.obs['donor']
        for donor in np.unique(donors.values):
            donor_mask = (donors == donor).values
            donor_cells = adata_protein[donor_mask].copy()
            sc.pp.scale(donor_cells)
            adata_protein[donor_mask] = donor_cells.X

        # NOTE(review): the mask comes from the protein table and is applied
        # to the gene table too; this assumes both files list the same cells
        # in the same order - confirm.
        train_mask = donors.isin(list(train_donors)).values
        test_mask = ~train_mask
        train = [adata_gene[train_mask].copy(), adata_protein[train_mask].copy()]
        test = [adata_gene[test_mask].copy(), adata_protein[test_mask].copy()]

    else:
        raise ValueError(
            f"Unknown dataset_name {dataset_name!r}; expected 'tos' or 'pmbc'."
        )

    return train, test, data_path

In [None]:
def train_model(model_name, train, test, data_path, num_epoch):
    """Train one model, predict on the test split and score the predictions.

    Parameters
    ----------
    model_name : str
        ``"tosica"`` or ``"scmmt"``.
    train, test
        The splits returned by ``get_dataset``. For ``"tosica"`` each is a
        single AnnData; for ``"scmmt"`` each is a ``[gene, protein]`` pair.
    data_path : str
        Dataset directory (with trailing slash); the scMMT branch writes its
        preprocessed data to ``data_path + "preprocess_data_l3.pkl"``.
    num_epoch : int
        Number of training epochs.

    Returns
    -------
    acc, f1_avg
        Fraction of test cells whose predicted label equals the true label,
        and the median of the per-class F1 scores.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported keys.
    """
    if model_name not in ("tosica", "scmmt"):
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected 'tosica' or 'scmmt'."
        )

    # The run tag must be a string to be concatenated into a path; this
    # format avoids spaces and ':' in directory names.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    saved_models_path = "saved_models/"

    if model_name == "tosica":
        model_path = saved_models_path + "toscia_" + timestamp
        TOSICA.train(train, gmt_path='human_gobp', label_name='Celltype',
                     epochs=num_epoch, project=model_path)
        # NOTE(review): 'model-0.pth' is kept from the original code. If
        # TOSICA writes one checkpoint per epoch, this is the first epoch's
        # weights rather than the last - confirm which is intended.
        model_weight_path = model_path + '/model-0.pth'
        predicted_test = TOSICA.pre(test, model_weight_path=model_weight_path,
                                    project=model_path)
        # Assumes TOSICA.pre keeps the query's obs columns (including the
        # 'Celltype' ground truth) and adds 'Prediction' - TODO confirm.
        predicted_labels = predicted_test.obs['Prediction']
        true_labels = predicted_test.obs['Celltype']

    else:  # model_name == "scmmt"
        model_path = saved_models_path + "scmmt_" + timestamp
        adata_gene_train, adata_protein_train = train
        # Only the gene modality of the test split is passed to scMMT.
        adata_gene_test, _adata_protein_test = test
        scMMT = scMMT_API(
            gene_trainsets=[adata_gene_train],
            protein_trainsets=[adata_protein_train],
            gene_test=adata_gene_test,
            train_batchkeys=['donor'], test_batchkey='donor',
            log_normalize=True,            # Is scRNA seq standardized for log
            type_key='celltype.l3',        # Keywords representing cell types (in protein dataset)
            data_dir=data_path + "preprocess_data_l3.pkl",  # Save path for processed data
            data_load=False,               # Do you want to import existing processed data
            dataset_batch=True,            # Is there a batch effect in the training set and testing machine
            log_weight=3,                  # Log weights for different cell types
            val_split=None,                # Do you need to divide the validation set according to the distribution of the test set
            min_cells=0,                   # Minimum cell count filtering
            min_genes=0,                   # Minimum number of genes filtering
            n_svd=300,                     # Dimension obtained using Tsvd dimensionality reduction
            n_fa=180,                      # Dimension obtained by using FA dimensionality reduction
            n_hvg=550,                     # Number of high variants obtained through screening
        )
        scMMT.train(n_epochs=num_epoch, ES_max=12, decay_max=6, decay_step=0.1,
                    lr=10**(-3), label_smoothing=0.4,
                    h_size=600, drop_rate=0.15, n_layer=4,
                    weights_dir=model_path, load=False)
        predicted_test = scMMT.predict()
        predicted_labels = predicted_test.obs['transfered cell labels']
        true_labels = predicted_test.obs['celltype.l3']

    # Compare as plain string arrays so categorical columns with different
    # category sets can still be compared element-wise.
    y_true = np.asarray(true_labels).astype(str)
    y_pred = np.asarray(predicted_labels).astype(str)

    acc = (y_true == y_pred).mean()
    # sklearn's signature is f1_score(y_true, y_pred).
    f1 = f1_score(y_true, y_pred, average=None)
    # Despite the name, this is the median of the per-class scores.
    f1_avg = np.median(f1)
    return acc, f1_avg