# Batch correction benchmark

In [1]:
import numpy as np
import pandas as pd

import scanpy as sc

from harmony import harmonize
#from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score
from tqdm.auto import tqdm

import scIB

In [2]:
from numba import config, set_num_threads

# Cap numba's worker pool at 16 threads. set_num_threads() raises ValueError when asked
# for more threads than numba was launched with (config.NUMBA_NUM_THREADS), so the request
# is clamped to keep this cell runnable on machines with fewer than 16 available threads.
set_num_threads(min(16, config.NUMBA_NUM_THREADS))

In [3]:
import sys
import gc
from pathlib import Path

# Make the local sctoolkit checkout importable
# (downloaded from https://github.com/gokceneraslan/sctoolkit/).
sctoolkit_dir = Path.home().joinpath('Code/sctoolkit/')
sys.path.append(str(sctoolkit_dir))

In [4]:
from sctoolkit.integrate import fit_single_cell

In [5]:
# Configure scanpy's figure defaults with dpi=100.
sc.set_figure_params(dpi=100)

In [6]:
# Load the benchmark dataset into `ad_orig`; every later section starts from this object.
# The bare name on the last line displays the AnnData summary as the cell output.
ad_orig = sc.read('../scGTEx/alltissue_v5__myocytes_20210204-compessed.h5ad')
ad_orig

AnnData object with n_obs × n_vars = 209126 × 17695
    obs: 'n_genes', 'fpr', 'tissue', 'prep', 'individual', 'nGenes', 'nUMIs', 'PercentMito', 'PercentRibo', 'Age_bin', 'Sex', 'Sample ID', 'Participant ID', 'Container', 'Sample ID short', 'RIN score from PAXgene tissue Aliquot', 'RIN score from Frozen tissue Aliquot', 'Age', 'BMI', 'Race/Ethnicity', 'Autolysis Score', 'Sample Ischemic Time (mins)', 'Tissue Site Detail', 'scrublet', 'scrublet_score', 'barcode', 'batch', 'n_counts', 'tissue-individual-prep', 'Broad cell type', 'Granular cell type', 'batch_triplet', 'introns', 'junctions', 'exons', 'sense', 'antisense', 'intergenic', 'batch-barcode', 'exon_ratio', 'intron_ratio', 'junction_ratio', 'log10_nUMIs', 'leiden', 'leiden_tissue', 'Tissue composition', 'Cell types level 2', 'Cell types level 3', 'Broad cell type numbers', 'Broad cell type (numbers)', 'Tissue', 'channel', 'ischemic_time', 'Participant_ID', 'Sample_ID'
    var: 'gene_ids', 'Chromosome', 'Source', 'Start', 'End', '

In [7]:
# Keep only the channels that contribute more than 30 cells.
cells_per_channel = ad_orig.obs.channel.value_counts()
ch = cells_per_channel[cells_per_channel > 30].index.values.astype(str)

# Subset to cells from the retained channels; .copy() turns the slice into a standalone object.
channel_mask = ad_orig.obs.channel.isin(ch)
ad_orig = ad_orig[channel_mask].copy()
ad_orig



AnnData object with n_obs × n_vars = 209068 × 17695
    obs: 'n_genes', 'fpr', 'tissue', 'prep', 'individual', 'nGenes', 'nUMIs', 'PercentMito', 'PercentRibo', 'Age_bin', 'Sex', 'Sample ID', 'Participant ID', 'Container', 'Sample ID short', 'RIN score from PAXgene tissue Aliquot', 'RIN score from Frozen tissue Aliquot', 'Age', 'BMI', 'Race/Ethnicity', 'Autolysis Score', 'Sample Ischemic Time (mins)', 'Tissue Site Detail', 'scrublet', 'scrublet_score', 'barcode', 'batch', 'n_counts', 'tissue-individual-prep', 'Broad cell type', 'Granular cell type', 'batch_triplet', 'introns', 'junctions', 'exons', 'sense', 'antisense', 'intergenic', 'batch-barcode', 'exon_ratio', 'intron_ratio', 'junction_ratio', 'log10_nUMIs', 'leiden', 'leiden_tissue', 'Tissue composition', 'Cell types level 2', 'Cell types level 3', 'Broad cell type numbers', 'Broad cell type (numbers)', 'Tissue', 'channel', 'ischemic_time', 'Participant_ID', 'Sample_ID'
    var: 'gene_ids', 'Chromosome', 'Source', 'Start', 'End', '

In [8]:
# Filter genes in place with min_cells=10, then display the summary so the remaining
# gene count is visible in the cell output.
sc.pp.filter_genes(ad_orig, min_cells=10)
ad_orig

The history saving thread hit an unexpected error (OperationalError('database is locked')).History will not be written to the database.


AnnData object with n_obs × n_vars = 209068 × 17603
    obs: 'n_genes', 'fpr', 'tissue', 'prep', 'individual', 'nGenes', 'nUMIs', 'PercentMito', 'PercentRibo', 'Age_bin', 'Sex', 'Sample ID', 'Participant ID', 'Container', 'Sample ID short', 'RIN score from PAXgene tissue Aliquot', 'RIN score from Frozen tissue Aliquot', 'Age', 'BMI', 'Race/Ethnicity', 'Autolysis Score', 'Sample Ischemic Time (mins)', 'Tissue Site Detail', 'scrublet', 'scrublet_score', 'barcode', 'batch', 'n_counts', 'tissue-individual-prep', 'Broad cell type', 'Granular cell type', 'batch_triplet', 'introns', 'junctions', 'exons', 'sense', 'antisense', 'intergenic', 'batch-barcode', 'exon_ratio', 'intron_ratio', 'junction_ratio', 'log10_nUMIs', 'leiden', 'leiden_tissue', 'Tissue composition', 'Cell types level 2', 'Cell types level 3', 'Broad cell type numbers', 'Broad cell type (numbers)', 'Tissue', 'channel', 'ischemic_time', 'Participant_ID', 'Sample_ID'
    var: 'gene_ids', 'Chromosome', 'Source', 'Start', 'End', '

In [9]:
# Accumulator of (method, metric, variable, value) records; each method section appends to it.
metrics = list()

# obs columns that the Leiden clusterings are scored against.
metric_keys = [
    'Broad cell type',
    'channel',
    'tissue',
    'Participant ID',
    'prep',
]

## Uncorrected

In [10]:
# Uncorrected baseline, computed directly on ad_orig: select 2000 highly variable genes,
# then PCA -> neighbour graph -> Leiden clustering. Later sections work on copies of
# ad_orig (e.g. Harmony reads obsm['X_pca'] from its copy), so this cell must run first.
sc.pp.highly_variable_genes(ad_orig, n_top_genes=2000)
sc.pp.pca(ad_orig)
sc.pp.neighbors(ad_orig)
sc.tl.leiden(ad_orig)

In [11]:
import random
import torch

# Draw a fresh seed for this run and print it so the value is recorded in the cell output.
seed = random.randint(0, 1_000_000)
print(seed)

# Apply the same seed to Python's `random`, PyTorch and NumPy, in that order.
for seed_rng in (random.seed, torch.manual_seed, np.random.seed):
    seed_rng(seed)

664122


## Harmony

In [12]:
# Harmony works on its own copy so ad_orig is left untouched.
ad_harmony = ad_orig.copy()

# Run Harmony on the PCA embedding, passing obs and the 'channel' column name, seeded with
# `seed`; the result is stored in obsm['X_harmony']. %time reports how long the call takes.
%time ad_harmony.obsm['X_harmony'] = harmonize(ad_harmony.obsm['X_pca'], ad_harmony.obs, 'channel', random_state=seed)

# Rebuild the neighbour graph from the Harmony embedding instead of the default representation.
sc.pp.neighbors(ad_harmony, use_rep='X_harmony')



	Initialization is completed.
	Completed 1 / 10 iteration(s).
	Completed 2 / 10 iteration(s).
Reach convergence after 2 iteration(s).
CPU times: user 2h 19min 26s, sys: 16min 46s, total: 2h 36min 13s
Wall time: 18min 40s


In [13]:
# Leiden clustering on the Harmony neighbour graph; the metrics cell scores the 'leiden' labels.
sc.tl.leiden(ad_harmony)

In [14]:
ad = ad_harmony
method_name = 'Harmony'
metrics = [x for x in metrics if x[0] != method_name]

for key in tqdm(metric_keys):
    metrics.append((method_name, 'NMI', key, scIB.metrics.nmi(ad, key, 'leiden')))
    metrics.append((method_name, 'ARI', key, scIB.metrics.ari(ad, key, 'leiden')))
    
%time kbet_score = scIB.metrics.kBET(ad, 'channel', 'Broad cell type', embed='X_harmony')
metrics.append((method_name, 'kBET', 'Broad cell type', kbet_score))        

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))






Adding diffusion to step 4




Adding diffusion to step 4
Adding diffusion to step 5




Adding diffusion to step 4




Adding diffusion to step 4
Adding diffusion to step 5




Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6




Adding diffusion to step 4




Adding diffusion to step 4




Adding diffusion to step 4




Adding diffusion to step 4




Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6




Adding diffusion to step 4
Adding diffusion to step 5




Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6




Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7
Adding diffusion to step 8




Adding diffusion to step 4




Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6




Adding diffusion to step 4




Adding diffusion to step 4




Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7
Adding diffusion to step 8
Adding diffusion to step 9




Adding diffusion to step 4




Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6




Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7
Adding diffusion to step 8
Adding diffusion to step 9
Adding diffusion to step 10
Adding diffusion to step 11
Adding diffusion to step 12
Adding diffusion to step 13
Adding diffusion to step 14
Adding diffusion to step 15




Adding diffusion to step 4




Adding diffusion to step 4
Adding diffusion to step 5




Adding diffusion to step 4
Adding diffusion to step 5




Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7
Adding diffusion to step 8
Adding diffusion to step 9




Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7
Adding diffusion to step 8




Adding diffusion to step 4
Adding diffusion to step 5




Adding diffusion to step 4




Adding diffusion to step 4
Adding diffusion to step 5




Adding diffusion to step 4
Adding diffusion to step 5




Adding diffusion to step 4




Adding diffusion to step 4
Adding diffusion to step 5




Adding diffusion to step 4




Adding diffusion to step 4
Adding diffusion to step 5
CPU times: user 1h 10min 42s, sys: 2min 2s, total: 1h 12min 45s
Wall time: 1h 8min 20s


## BBKNN

In [11]:
# BBKNN works on its own copy of ad_orig, subset to the 2000 highly variable genes.
ad_bbknn = ad_orig.copy()
sc.pp.highly_variable_genes(ad_bbknn, n_top_genes=2000, subset=True)

# Run BBKNN through scIB with 'channel' as the second argument. `seed` comes from the
# seeding cell, so that cell must have been executed before this one.
%time ad_bbknn = scIB.integration.runBBKNN(ad_bbknn, 'channel', pynndescent_random_state=seed)

# Leiden is run directly on the object returned by runBBKNN, with no separate
# sc.pp.neighbors call in this cell.
sc.tl.leiden(ad_bbknn)

  if not is_categorical(df_full[k]):


CPU times: user 35min 16s, sys: 6min 25s, total: 41min 42s
Wall time: 32min 46s


In [12]:
ad = ad_bbknn
method_name = 'BBKNN'
metrics = [x for x in metrics if x[0] != method_name]

for key in tqdm(metric_keys):
    metrics.append((method_name, 'NMI', key, scIB.metrics.nmi(ad, key, 'leiden')))
    metrics.append((method_name, 'ARI', key, scIB.metrics.ari(ad, key, 'leiden')))
    
%time kbet_score = scIB.metrics.kBET(ad, 'channel', 'Broad cell type', type_='knn')
metrics.append((method_name, 'kBET', 'Broad cell type', kbet_score))    

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




  self._set_arrayXarray(i, j, x)

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(
  if not is_categorical(df_full[k]):

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(
  if not is_categorical(df_full[k]):

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(
  if not is_categorical(df_full[k]):

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(
  if not is_categorical(df_full[k]):

These matrices should now be stored in the .obsp attribute.
This slicing behavio

Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7
Adding diffusion to step 8
Adding diffusion to step 9



These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(
  if not is_categorical(df_full[k]):

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(
  if not is_categorical(df_full[k]):

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(
  if not is_categorical(df_full[k]):


Adding diffusion to step 4



These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(
  if not is_categorical(df_full[k]):

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5



These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(
  if not is_categorical(df_full[k]):


Adding diffusion to step 4



These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(
  if not is_categorical(df_full[k]):

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(
  if not is_categorical(df_full[k]):

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(
  if not is_categorical(df_full[k]):

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(

These matrices should now b

CPU times: user 6h 26min 5s, sys: 37.4 s, total: 6h 26min 42s
Wall time: 6h 26min 44s


In [13]:
#ad_bbknn.write('adata_bbknn_full.h5ad')

## MNN

In [17]:
ad_mnn = ad_orig.copy()
sc.pp.highly_variable_genes(ad_mnn, n_top_genes=2000, subset=True)
ad_mnn.obs.channel.cat.reorder_categories(sorted(ad_mnn.obs.channel.cat.categories, key=len, reverse=True), inplace=True)

%time ad_mnn = scIB.integration.runMNN(ad_mnn, 'channel')

sc.pp.neighbors(ad_mnn, use_rep='X')
sc.tl.leiden(ad_mnn)

Compilation is falling back to object mode WITH looplifting enabled because Function "l2_norm" failed type inference due to: [1m[1m[1mNo implementation of function Function(<function norm at 0x7f31c4115e50>) found for signature:
 
 >>> norm(x=array(float32, 2d, A), axis=Literal[int](1))
 
There are 2 candidate implementations:
[1m  - Of which 2 did not match due to:
  Overload in function 'norm_impl': File: numba/np/linalg.py: Line 2351.
    With argument(s): '(x=array(float32, 2d, A), axis=int64)':[0m
[1m   Rejected as the implementation raised a specific error:
     TypeError: norm_impl() got an unexpected keyword argument 'x'[0m
  raised from /home/gokcen/.miniconda3/lib/python3.8/site-packages/numba/core/typing/templates.py:775
[0m
[0m[1mDuring: resolving callee type: Function(<function norm at 0x7f31c4115e50>)[0m
[0m[1mDuring: typing of call at /home/gokcen/.miniconda3/lib/python3.8/site-packages/mnnpy/utils.py (16)
[0m
[1m
File "../../.miniconda3/lib/python3.8/site

Performing cosine normalization...


Compilation is falling back to object mode WITH looplifting enabled because Function "l2_norm" failed type inference due to: [1m[1m[1mNo implementation of function Function(<function norm at 0x7f31c4115e50>) found for signature:
 
 >>> norm(x=array(float32, 2d, A), axis=Literal[int](1))
 
There are 2 candidate implementations:
[1m    - Of which 2 did not match due to:
    Overload in function 'norm_impl': File: numba/np/linalg.py: Line 2351.
      With argument(s): '(x=array(float32, 2d, A), axis=int64)':[0m
[1m     Rejected as the implementation raised a specific error:
       TypeError: norm_impl() got an unexpected keyword argument 'x'[0m
  raised from /home/gokcen/.miniconda3/lib/python3.8/site-packages/numba/core/typing/templates.py:775
[0m
[0m[1mDuring: resolving callee type: Function(<function norm at 0x7f31c4115e50>)[0m
[0m[1mDuring: typing of call at /home/gokcen/.miniconda3/lib/python3.8/site-packages/mnnpy/utils.py (16)
[0m
[1m
File "../../.miniconda3/lib/pyth

Starting MNN correct iteration. Reference batch: 0
Step 1 of 89: processing batch 1
  Looking for MNNs...


Compilation is falling back to object mode WITHOUT looplifting enabled because Function "find_mutual_nn" failed type inference due to: [1m[1mnon-precise type pyobject[0m
[0m[1mDuring: typing of argument at /home/gokcen/.miniconda3/lib/python3.8/site-packages/mnnpy/utils.py (94)[0m
[1m
File "../../.miniconda3/lib/python3.8/site-packages/mnnpy/utils.py", line 94:[0m
[1mdef find_mutual_nn(data1, data2, k1, k2, n_jobs):
    <source elided>
    mutual_2 = []
[1m    for index_2 in range(data2.shape[0]):
[0m    [1m^[0m[0m
[0m
  @jit((float32[:, :], float32[:, :], int8, int8, int8))
[1m
File "../../.miniconda3/lib/python3.8/site-packages/mnnpy/utils.py", line 94:[0m
[1mdef find_mutual_nn(data1, data2, k1, k2, n_jobs):
    <source elided>
    mutual_2 = []
[1m    for index_2 in range(data2.shape[0]):
[0m    [1m^[0m[0m
[0m
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information

  Computing correction vectors...


Compilation is falling back to object mode WITHOUT looplifting enabled because Function "compute_correction" failed type inference due to: [1m[1mnon-precise type pyobject[0m
[0m[1mDuring: typing of argument at /home/gokcen/.miniconda3/lib/python3.8/site-packages/mnnpy/utils.py (107)[0m
[1m
File "../../.miniconda3/lib/python3.8/site-packages/mnnpy/utils.py", line 107:[0m
[1mdef compute_correction(data1, data2, mnn1, mnn2, data2_or_raw2, sigma):
    <source elided>
    vect_reduced = np.zeros((data2.shape[0], vect.shape[1]), dtype=np.float32)
[1m    for index, ve in zip(mnn2, vect):
[0m    [1m^[0m[0m
[0m
  @jit(float32[:, :](float32[:, :], float32[:, :], int32[:], int32[:], float32[:, :], float32))
[1m
File "../../.miniconda3/lib/python3.8/site-packages/mnnpy/utils.py", line 107:[0m
[1mdef compute_correction(data1, data2, mnn1, mnn2, data2_or_raw2, sigma):
    <source elided>
    vect_reduced = np.zeros((data2.shape[0], vect.shape[1]), dtype=np.float32)
[1m    for inde

  Adjusting variance...
  Applying correction...
Step 2 of 89: processing batch 2
  Looking for MNNs...
  Computing correction vectors...
  Adjusting variance...
  Applying correction...
Step 3 of 89: processing batch 3
  Looking for MNNs...
  Computing correction vectors...
  Adjusting variance...
  Applying correction...
Step 4 of 89: processing batch 4
  Looking for MNNs...
  Computing correction vectors...
  Adjusting variance...
  Applying correction...
Step 5 of 89: processing batch 5
  Looking for MNNs...
  Computing correction vectors...
  Adjusting variance...
  Applying correction...
Step 6 of 89: processing batch 6
  Looking for MNNs...
  Computing correction vectors...
  Adjusting variance...
  Applying correction...
Step 7 of 89: processing batch 7
  Looking for MNNs...
  Computing correction vectors...
  Adjusting variance...
  Applying correction...
Step 8 of 89: processing batch 8
  Looking for MNNs...
  Computing correction vectors...
  Adjusting variance...
  Applying

In [18]:
#ad_mnn.write('adata_mnn_full.h5ad')

In [19]:
ad = ad_mnn
method_name = 'MNN'
metrics = [x for x in metrics if x[0] != method_name]

for key in tqdm(metric_keys):
    metrics.append((method_name, 'NMI', key, scIB.metrics.nmi(ad, key, 'leiden')))
    metrics.append((method_name, 'ARI', key, scIB.metrics.ari(ad, key, 'leiden')))
    
%time kbet_score = scIB.metrics.kBET(ad, 'channel', 'Broad cell type', embed='X')
metrics.append((method_name, 'kBET', 'Broad cell type', kbet_score))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7
Adding diffusion to step 8


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


CPU times: user 1h 4min 25s, sys: 43.2 s, total: 1h 5min 8s
Wall time: 54min 19s


## VAE

In [20]:
# The VAE section starts from its own copy of ad_orig and flags 5000 highly variable genes
# (no subset=True here, so the genes are only flagged in var['highly_variable'], which the
# next cell reads).
ad_vae = ad_orig.copy()
sc.pp.highly_variable_genes(ad_vae, n_top_genes=5000)

In [21]:
# Names of the genes flagged as highly variable; used to slice ad_vae.raw in the next cell.
hvg = ad_vae.var_names[ad_vae.var['highly_variable']]

In [22]:
# Rebuild the VAE input from the values stored in `.raw`, restricted to the selected HVGs,
# carrying over the existing obs table and obsm embeddings.
# NOTE(review): the new object is created without `raw=`, so this cell presumably cannot be
# re-run on its own - re-run the cell that creates ad_vae first.
raw_hvg = ad_vae.raw[:, hvg]
ad_vae = sc.AnnData(raw_hvg.X, obs=ad_vae.obs, var=raw_hvg.var, obsm=ad_vae.obsm)
del raw_hvg

# Drop cells with no counts left, normalise each cell to a total of 10000, then log1p.
sc.pp.filter_cells(ad_vae, min_counts=1)
sc.pp.normalize_total(ad_vae, target_sum=10000)
sc.pp.log1p(ad_vae)
ad_vae

AnnData object with n_obs × n_vars = 209068 × 4999
    obs: 'n_genes', 'fpr', 'tissue', 'prep', 'individual', 'nGenes', 'nUMIs', 'PercentMito', 'PercentRibo', 'Age_bin', 'Sex', 'Sample ID', 'Participant ID', 'Container', 'Sample ID short', 'RIN score from PAXgene tissue Aliquot', 'RIN score from Frozen tissue Aliquot', 'Age', 'BMI', 'Race/Ethnicity', 'Autolysis Score', 'Sample Ischemic Time (mins)', 'Tissue Site Detail', 'scrublet', 'scrublet_score', 'barcode', 'batch', 'n_counts', 'tissue-individual-prep', 'Broad cell type', 'Granular cell type', 'batch_triplet', 'introns', 'junctions', 'exons', 'sense', 'antisense', 'intergenic', 'batch-barcode', 'exon_ratio', 'intron_ratio', 'junction_ratio', 'log10_nUMIs', 'leiden', 'leiden_tissue', 'Tissue composition', 'Cell types level 2', 'Cell types level 3', 'Broad cell type numbers', 'Broad cell type (numbers)', 'Tissue', 'channel', 'ischemic_time', 'Participant_ID', 'Sample_ID'
    var: 'gene_ids', 'Chromosome', 'Source', 'Start', 'End', 'S

In [23]:
# Train one model per latent size (8, 64 and 256) and keep the resulting AnnData of each
# run in `experiments`, keyed by experiment name ('benchmark-<latent size>d').
experiments = {}
for bottleneck in tqdm((8, 64, 256)):
    # Explicit garbage collection at the start of each iteration (presumably to release
    # the previous run's objects before the next copy is made).
    gc.collect()
    
    # Each run gets its own copy of the prepared input.
    ad = ad_vae.copy()
    exp_name = f'benchmark-{bottleneck}d'

    # Fit with 'channel' as the only categorical variable and latent_dim=bottleneck, seeded
    # with the notebook-wide `seed`. Only the returned `ad` is used below.
    %time ad, model, trainer, train_loader = fit_single_cell(ad, exp_name, categorical_vars=['channel'], concat_all_dec_layers=True, output_activation='softplus', latent_dim=bottleneck, btcvae_B=2., epochs=50, progress_bar=False, seed=seed)
    
    # Neighbour graph and Leiden clustering on the 'X_vae_mean' representation.
    sc.pp.neighbors(ad, use_rep='X_vae_mean')
    sc.tl.leiden(ad)
    
    experiments[exp_name] = ad.copy()

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Directory single_cell_results/benchmark-8d already exists. Archiving it to single_cell_results/benchmark-8d.zip
  if is_string_dtype(df[key]) and not is_categorical(df[key])


CPU times: user 13min 42s, sys: 4min 20s, total: 18min 2s
Wall time: 14min 25s


Directory single_cell_results/benchmark-64d already exists. Archiving it to single_cell_results/benchmark-64d.zip
  if is_string_dtype(df[key]) and not is_categorical(df[key])


CPU times: user 13min 46s, sys: 5min 2s, total: 18min 49s
Wall time: 15min 7s


Directory single_cell_results/benchmark-256d already exists. Archiving it to single_cell_results/benchmark-256d.zip
  if is_string_dtype(df[key]) and not is_categorical(df[key])


CPU times: user 15min 17s, sys: 5min 10s, total: 20min 27s
Wall time: 16min 19s



In [24]:
for bottleneck in (8, 64, 256):
    ad = experiments[f'benchmark-{bottleneck}d']
    method_name = f'VAE ({bottleneck}D)'
    metrics = [x for x in metrics if x[0] != method_name]

    for key in tqdm(metric_keys):
        metrics.append((method_name, 'NMI', key, scIB.metrics.nmi(ad, key, 'leiden')))
        metrics.append((method_name, 'ARI', key, scIB.metrics.ari(ad, key, 'leiden')))
        
    %time kbet_score = scIB.metrics.kBET(ad, 'channel', 'Broad cell type', embed='X_vae_mean')
    metrics.append((method_name, 'kBET', 'Broad cell type', kbet_score))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7
Adding diffusion to step 8
Adding diffusion to step 9
Adding diffusion to step 10


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7
Adding diffusion to step 8


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7
Adding diffusion to step 8
Adding diffusion to step 9
Adding diffusion to step 10
Adding diffusion to step 11
Adding diffusion to step 12
Adding diffusion to step 13
Adding diffusion to step 14
Adding diffusion to step 15
Adding diffusion to step 16
Adding diffusion to step 17
Adding diffusion to step 18


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7
Adding diffusion to step 8


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


CPU times: user 40min 59s, sys: 1min 25s, total: 42min 24s
Wall time: 39min 26s


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7
Adding diffusion to step 8
Adding diffusion to step 9


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


CPU times: user 37min 41s, sys: 1min 30s, total: 39min 12s
Wall time: 35min 37s


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7
Adding diffusion to step 8


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6
Adding diffusion to step 7


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5


  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


Adding diffusion to step 4
Adding diffusion to step 5
Adding diffusion to step 6


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):


Adding diffusion to step 4


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


CPU times: user 35min 57s, sys: 1min 19s, total: 37min 16s
Wall time: 33min 34s


## Compile metrics

In [None]:
# Collect all (method, metric, variable, value) records into one long-format table and
# display it as the cell output.
metric_df = pd.DataFrame(metrics, columns=['Method', 'Metric', 'Variable', 'Value'])
metric_df

In [26]:
# Persist the results table; the file name embeds this run's seed.
metric_df.to_pickle(f'results_all_{seed}.pkl')