# Tutorial for scVI with PBMC3K dataset


## configuration

In [None]:
import json
with open('tests/notebooks/basic_tutorial.config.json') as f:
    config = json.load(f)
print(config)

n_epochs_all = config['n_epochs'] if 'n_epochs' in config else None
save_path = config['save_path'] if 'save_path' in config else 'data/'
n_samples_tsne = config['n_samples_tsne'] if 'n_samples_tsne' in config else None
n_samples_posterior_density = config['n_samples_posterior_density'] if 'n_samples_posterior_density' in config else None
train_size = config['train_size'] if 'train_size' in config else None
M_sampling_all = config['M_sampling'] if 'M_sampling' in config else None
M_permutation_all = config['M_permutation'] if 'M_permutation' in config else None
rate = config['rate'] if 'rate' in config else None

{'save_path': 'data/'}


In [2]:
import os

import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


from scvi.dataset import Pbmc3kDataset

from scvi.models import *
from scvi.inference import UnsupervisedTrainer

In [7]:
gene_dataset = Pbmc3kDataset(save_path=save_path)

File data/filtered_gene_bc_matrices.tar.gz already downloaded
Pbmc3K Dataset preprocess
Extracting tar file and read data
there exists: True


end reading data
start preprocessing it
Warn: 1. this is only for mito gene


end preprocessing it
Pbmc3k Dataset preprocessing ends


In [4]:

n_epochs = 1 if n_epochs_all is None else n_epochs_all
lr = 1e-3
use_batches = False
use_cuda = False

vae = VAE(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches * use_batches)
trainer = UnsupervisedTrainer(vae, gene_dataset, train_size=0.75, use_cuda=use_cuda, frequency=5)
trainer.train(n_epochs=n_epochs, lr=lr)

ll_train_set = trainer.history["ll_train_set"]
ll_test_set = trainer.history["ll_test_set"]
x = np.linspace(0, 500, (len(ll_train_set)))
plt.plot(x, ll_train_set)
plt.plot(x, ll_test_set)
plt.ylim(1150, 1600)
plt.show()
plt.savefig("test.png", color="CST3")


training:   0%|          | 0/1 [00:00<?, ?it/s]

training: 100%|██████████| 1/1 [00:17<00:00, 17.51s/it]




In [6]:
import os
os.getcwd()
print(gene_dataset.gene_name_index)

AttributeError: 'Pbmc3kDataset' object has no attribute 'gene_name_index'

## PCA


In [16]:
from sklearn.decomposition.pca import PCA
latent, batch_indices, labels = trainer.test_set.get_latent()
pca = PCA(n_components=2)

compressed_latent = pca.fit_transform(latent)
print(compressed_latent)
plt.figure(figsize=(10, 10))

plt.scatter(compressed_latent[:, 0], compressed_latent[:, 1], marker=".", cmap='RdBu_r')
# plt.show()
plt.title("Hello")
plt.savefig("pca.png")

[[-0.08139734 -0.52871233]
 [ 1.8646802  -0.20577204]
 [-0.51875556 -0.48275948]
 ...
 [-0.16295454 -0.35133126]
 [-2.2334762   0.7503805 ]
 [-0.5208154  -0.5061717 ]]


## TSNE


In [None]:
print(type(trainer))
latent, batch_indices, labels = trainer.test_set.get_latent()
n_samples = 1000
print(latent)
latent, idx_t_sne = trainer.test_set.apply_t_sne(latent, n_samples)
plt.figure(figsize=(10, 10))

plt.scatter(latent[:, 0], latent[:, 1], cmap='RdBu_r')
plt.show()
plt.savefig("tsne.png")

<class 'scvi.inference.inference.UnsupervisedTrainer'>
[[-0.77338713 -0.38158295  0.12276298 ... -0.71149576 -1.2822084
  -0.04172951]
 [ 0.98246276 -0.5394774   1.248472   ... -0.36474323  1.0687717
   0.09033996]
 [-0.50152344  0.36511883  0.05584943 ... -0.6257056  -0.1659959
   1.078433  ]
 ...
 [ 0.89670557  0.42668155 -0.37404084 ...  0.39146912 -0.82999045
  -1.248309  ]
 [-0.05436607  0.05620524  0.08537936 ... -1.4820328  -0.00945437
   0.6122958 ]
 [-0.89089394 -1.3243413  -0.15957141 ... -0.9393456  -0.33437467
   0.12578028]]


# Cluster


In [None]:
import louvain
random_state = 0
louvain.set_rng_seed(random_state)


## Finding marker genes

## Saving or exporting the results