#### Setting up conda env for Scanorama, scVI


    conda create -n scvi
    conda activate scvi

    conda install python=3.8
    conda install -c anaconda ipykernel
    conda install seaborn scikit-learn statsmodels numba pytables
    conda install -c conda-forge python-igraph leidenalg
    python -m pip install scanpy
    conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch
    conda install scvi-tools -c bioconda -c conda-forge
    python -m ipykernel install --user --name=scvi

    python -m pip install desc
    python -m pip install dca
    python -m pip install scanorama

In [30]:
from scipy.sparse import coo_matrix
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData

In [31]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import adjusted_mutual_info_score, normalized_mutual_info_score
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn import preprocessing

In [32]:
import seaborn as sb
import matplotlib.pyplot as plt
sb.set(font_scale=1.25)

In [33]:
datasets = ['10Xmouse','humanpancreas','10Xpbmc']

#### Data Format

Each dataset contains the pre-processed merged counts from various batches with the cell types labels (actual) and batch labels (batch)

In [34]:
import os
import random
import numpy as np
import torch

def set_all_seeds(seed):
      random.seed(seed)
      np.random.seed(seed)
      torch.manual_seed(seed)
      torch.cuda.manual_seed(seed)
      torch.backends.cudnn.deterministic = True

In [35]:
RANDOM_SEED = 0 
set_all_seeds(RANDOM_SEED)

In [36]:
all_results = dict()
for name in ['batchAMI','cellAMI','batchNMI','cellNMI','batchARI','cellARI', 'SC',]:
    all_results[name] = dict()
    for dataset in ['10Xmouse','humanpancreas','10Xpbmc']:
        all_results[name][dataset] = dict()

In [None]:
## Scanorama

In [37]:
import scanpy as sc
from scanpy.external.pp import scanorama_integrate

In [38]:
batch_labels = 'batch'
cell_labels = 'actual' 

In [39]:
for dataset in datasets:
    adata = sc.read('../Data/{}_v2.h5ad'.format(dataset))
    adata.layers["counts"] = adata.X.copy()
    if dataset == '10Xmouse':
        sc.pp.recipe_zheng17(adata, n_top_genes=19268)
    elif dataset == 'humanpancreas':
        sc.pp.recipe_zheng17(adata, n_top_genes=15360)
    elif dataset == '10Xpbmc':
        sc.pp.recipe_zheng17(adata, n_top_genes=19815)
    sc.tl.pca(adata)
    no_cell_types = len(adata.obs[cell_labels].unique())
    no_batches = len(adata.obs[batch_labels].unique())
    scanorama_integrate(adata, batch_labels)
    kmeans_cell = KMeans(n_clusters=no_cell_types, random_state=0).fit(adata.obsm['X_scanorama'])
    adata.obs['Scanorama_kmeans_normalised'] = kmeans_cell.labels_
    print()
    print(dataset)
    all_results['cellAMI'][dataset]['Scanorama'] = adjusted_mutual_info_score(adata.obs[cell_labels],adata.obs['Scanorama_kmeans_normalised'])
    all_results['batchAMI'][dataset]['Scanorama'] = adjusted_mutual_info_score(adata.obs[batch_labels],adata.obs['Scanorama_kmeans_normalised'])
    all_results['cellNMI'][dataset]['Scanorama'] = normalized_mutual_info_score(adata.obs[cell_labels],adata.obs['Scanorama_kmeans_normalised'])
    all_results['batchNMI'][dataset]['Scanorama'] = normalized_mutual_info_score(adata.obs[batch_labels],adata.obs['Scanorama_kmeans_normalised'])
    all_results['cellARI'][dataset]['Scanorama'] = adjusted_rand_score(adata.obs[cell_labels],adata.obs['Scanorama_kmeans_normalised'])
    all_results['batchARI'][dataset]['Scanorama'] = adjusted_rand_score(adata.obs[cell_labels],adata.obs['Scanorama_kmeans_normalised'])
    all_results['SC'][dataset]['Scanorama'] = silhouette_score(adata.obsm['X_scanorama'], adata.obs['Scanorama_kmeans_normalised'])
    print()

[[0.         0.00450607 0.84818024]
 [0.         0.         0.65888855]
 [0.         0.         0.        ]]
Processing datasets 293t <=> jurkat_293t_50_50
Processing datasets jurkat <=> jurkat_293t_50_50

10Xmouse

[[0.         0.08329931 0.19592476 0.00940439 0.21278019]
 [0.         0.         0.63244514 0.18495298 0.24989792]
 [0.         0.         0.         0.12539185 0.45689655]
 [0.         0.         0.         0.         0.32445141]
 [0.         0.         0.         0.         0.        ]]
Processing datasets pancreas_multi_celseq2_expression_matrix <=> pancreas_multi_celseq_expression_matrix
Processing datasets pancreas_multi_celseq_expression_matrix <=> pancreas_multi_smartseq2_expression_matrix
Processing datasets pancreas_multi_fluidigmc1_expression_matrix <=> pancreas_multi_smartseq2_expression_matrix
Processing datasets pancreas_multi_celseq2_expression_matrix <=> pancreas_multi_smartseq2_expression_matrix
Processing datasets pancreas_inDrop <=> pancreas_multi_smartse

### scVI 

In [None]:
import scvi

In [None]:
datasets = ['10Xmouse','humanpancreas','10Xpbmc']

for dataset in datasets:
    adata = sc.read('../Data/{}_v2.h5ad'.format(dataset))
    adata.layers["counts"] = adata.X.copy()
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    adata.raw = adata 
    scvi.data.setup_anndata(adata, layer="counts", batch_key="batch")
    model = scvi.model.SCVI(adata)
    model.train()
    latent = model.get_latent_representation()
    adata.obsm["X_scVI"] = latent
    kmeans_cell = KMeans(n_clusters=no_cell_types, random_state=0).fit(adata.obsm['X_scVI'])
    adata.obs['scVI_kmeans_normalised'] = kmeans_cell.labels_


    all_results['cellAMI'][dataset]['scVI'] = adjusted_mutual_info_score(adata.obs[cell_labels],adata.obs['scVI_kmeans_normalised'])
    all_results['batchAMI'][dataset]['scVI'] = adjusted_mutual_info_score(adata.obs[batch_labels],adata.obs['scVI_kmeans_normalised'])
    all_results['cellNMI'][dataset]['scVI'] = normalized_mutual_info_score(adata.obs[cell_labels],adata.obs['scVI_kmeans_normalised'])
    all_results['batchNMI'][dataset]['scVI'] = normalized_mutual_info_score(adata.obs[batch_labels],adata.obs['scVI_kmeans_normalised'])
    all_results['cellARI'][dataset]['scVI'] = adjusted_rand_score(adata.obs[cell_labels],adata.obs['scVI_kmeans_normalised'])
    all_results['batchARI'][dataset]['scVI'] = adjusted_rand_score(adata.obs[cell_labels],adata.obs['scVI_kmeans_normalised'])
    all_results['SC'][dataset]['scVI'] = silhouette_score(adata.obsm['X_scVI'], adata.obs['scVI_kmeans_normalised'])


[34mINFO    [0m Using batches from adata.obs[1m[[0m[32m"batch"[0m[1m][0m                                               
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"counts"[0m[1m][0m                                              
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m9530[0m cells, [1;36m32643[0m vars, [1;36m3[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


GPU available: False, used: False
TPU available: False, using: 0 TPU cores


Epoch 400/400: 100%|██████████| 400/400 [2:18:32<00:00, 20.78s/it, loss=8.51e+03, v_num=1]  

10Xmouse
Cell AMI 0.5229180048468935
Batch AMI 0.29242013401535916
silhouette score 0.1267619

[34mINFO    [0m Using batches from adata.obs[1m[[0m[32m"batch"[0m[1m][0m                                               
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"counts"[0m[1m][0m                                              
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m15921[0m cells, [1;36m15369[0m vars, [1;36m5[0m        
         batches, [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates   
         and [1;36m0[0m extra continuous covariates.                                

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


Epoch 1/400:   0%|          | 0/400 [00:00<?, ?it/s]



Epoch 400/400: 100%|██████████| 400/400 [2:35:32<00:00, 23.33s/it, loss=1.12e+04, v_num=1]  





humanpancreas
Cell AMI 0.5971294422176757
Batch AMI 0.08716942328630199
silhouette score 0.2010023

[34mINFO    [0m Using batches from adata.obs[1m[[0m[32m"batch"[0m[1m][0m                                               
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"counts"[0m[1m][0m                                              
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m26202[0m cells, [1;36m32643[0m vars, [1;36m8[0m        
         batches, [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates   
         and [1;36m0[0m extra continuous covariates.                                                  
[34mINFO    [0m Please do not further modify adata until model is t

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


Epoch 305/305: 100%|██████████| 305/305 [4:44:48<00:00, 56.03s/it, loss=2.42e+03, v_num=1]  

10Xpbmc
Cell AMI 0.23583526034125102
Batch AMI 0.2177952251678045
silhouette score 0.08262363



#### BBKNN

In [None]:
import bbknn

In [None]:
for dataset in datasets:
    adata = sc.read('../Data/{}_v2.h5ad'.format(dataset))
    adata.layers["counts"] = adata.X.copy()
    if dataset == '10Xmouse':
        sc.pp.recipe_zheng17(adata, n_top_genes=19268)
    elif dataset == 'humanpancreas':
        sc.pp.recipe_zheng17(adata, n_top_genes=15360)
    elif dataset == '10Xpbmc':
        sc.pp.recipe_zheng17(adata, n_top_genes=19815)
    sc.tl.pca(adata)
    no_cell_types = len(adata.obs[cell_labels].unique())
    no_batches = len(adata.obs[batch_labels].unique())
    bbknn.bbknn(adata) 
    kmeans_cell = KMeans(n_clusters=no_cell_types, random_state=0).fit(adata.obsp['distances'])
    adata.obs['BBKNN_kmeans_normalised'] = kmeans_cell.labels_

    all_results['cellAMI'][dataset]['BBKNN'] = adjusted_mutual_info_score(adata.obs[cell_labels],adata.obs['BBKNN_kmeans_normalised'])
    all_results['batchAMI'][dataset]['BBKNN'] = adjusted_mutual_info_score(adata.obs[batch_labels],adata.obs['BBKNN_kmeans_normalised'])
    all_results['cellNMI'][dataset]['BBKNN'] = normalized_mutual_info_score(adata.obs[cell_labels],adata.obs['BBKNN_kmeans_normalised'])
    all_results['batchNMI'][dataset]['BBKNN'] = normalized_mutual_info_score(adata.obs[batch_labels],adata.obs['BBKNN_kmeans_normalised'])
    all_results['cellARI'][dataset]['BBKNN'] = adjusted_rand_score(adata.obs[cell_labels],adata.obs['BBKNN_kmeans_normalised'])
    all_results['batchARI'][dataset]['BBKNN'] = adjusted_rand_score(adata.obs[cell_labels],adata.obs['BBKNN_kmeans_normalised'])
    all_results['SC'][dataset]['BBKNN'] = silhouette_score(adata.obsp['distances'], adata.obs['BBKNN_kmeans_normalised'])



10Xmouse
Cell AMI 0.15883345296878473
Batch AMI 0.13019300086994023
silhouette score 0.004325918163490555


humanpancreas
Cell AMI 0.4049020420526615
Batch AMI 0.2738395508449958
silhouette score 0.020782733257333035


10Xpbmc
Cell AMI 0.4882876692508661
Batch AMI 0.4825287851837442
silhouette score 0.0214443581162231



### PCA with normalisation 

In [None]:
from scipy.sparse import vstack

In [None]:
class scPCA:
    def __init__(self, k):
        from sklearn.decomposition import TruncatedSVD
        np.random.seed(0)
        self.n_components = k
        self.method = TruncatedSVD(n_components=self.n_components)

    def fit(self, X):
        self.data = X
        self.method.fit(self.data)
        self.cell_score = self.method.transform(self.data)
        self.gene_score = self.method.components_.transpose()

In [None]:
for dataset in datasets:
    ## Replace this with the location of your h5ad files
    adata = sc.read('../Data/{}_v2.h5ad'.format(dataset))
    adata.layers["counts"] = adata.X.copy()
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    adata.raw = adata 
    no_cell_types = len(adata.obs[cell_labels].unique())
    no_batches = len(adata.obs[batch_labels].unique())
    # Split cell matrix into batches preserving order 
    Xlist = list()
    split_idx = list()
    for i,df in adata.obs.groupby(batch_labels):
        df_ints = [int(x) for x in df.index]
        split_idx.append(min(df_ints))
    split_idx.append(adata.obs.shape[0])
    split_idx = sorted(split_idx)
    split_starts = split_idx[:-1]
    split_ends = split_idx[1:]
    for i in range(0,no_batches):
        Xlist.append(coo_matrix(adata.X[split_starts[i]:split_ends[i],:]))
    model = scPCA(no_cell_types)
    model.fit(vstack(Xlist))
    adata.obsm['PCA'] = model.cell_score
    adata.varm['PCA'] = model.gene_score
    kmeans_cell = KMeans(n_clusters=no_cell_types, random_state=0).fit(normalize(adata.obsm['PCA']))
    adata.obs['PCA_kmeans_normalised'] = kmeans_cell.labels_

    all_results['cellAMI'][dataset]['PCA'] = adjusted_mutual_info_score(adata.obs[cell_labels],adata.obs['PCA_kmeans_normalised'])
    all_results['batchAMI'][dataset]['PCA'] = adjusted_mutual_info_score(adata.obs[batch_labels],adata.obs['PCA_kmeans_normalised'])
    all_results['cellNMI'][dataset]['PCA'] = normalized_mutual_info_score(adata.obs[cell_labels],adata.obs['PCA_kmeans_normalised'])
    all_results['batchNMI'][dataset]['PCA'] = normalized_mutual_info_score(adata.obs[batch_labels],adata.obs['PCA_kmeans_normalised'])
    all_results['cellARI'][dataset]['PCA'] = adjusted_rand_score(adata.obs[cell_labels],adata.obs['PCA_kmeans_normalised'])
    all_results['batchARI'][dataset]['PCA'] = adjusted_rand_score(adata.obs[cell_labels],adata.obs['PCA_kmeans_normalised'])
    all_results['SC'][dataset]['PCA'] = silhouette_score(adata.obsm['PCA'], adata.obs['PCA_kmeans_normalised'])

In [None]:
import joblib
joblib.dump(all_results,'single_cell_benchmark_scores.joblib')

In [6]:
import joblib 
import pandas as pd
benchmark_results = joblib.load('single_cell_benchmark_scores.joblib')

In [7]:
pd.DataFrame(benchmark_results['cellAMI'])

Unnamed: 0,10Xmouse,humanpancreas,10Xpbmc
Scanorama,0.966656,0.354211,0.612671
scVI,0.522918,0.597129,0.235835
BBKNN,0.158833,0.404902,0.488288
PCA,0.970991,0.445463,0.615637


In [8]:
pd.DataFrame(benchmark_results['cellARI'])

Unnamed: 0,10Xmouse,humanpancreas,10Xpbmc
Scanorama,0.986195,0.19437,0.543325
scVI,0.359171,0.579321,0.216429
BBKNN,0.032689,0.198368,0.382472
PCA,0.988281,0.314651,0.535658


In [10]:
pd.DataFrame(benchmark_results['cellNMI'])

Unnamed: 0,10Xmouse,humanpancreas,10Xpbmc
Scanorama,0.966659,0.355077,0.612796
scVI,0.523021,0.597459,0.236066
BBKNN,0.158917,0.405661,0.488436
PCA,0.970993,0.446167,0.615756
