### Notebook for the exploratory analysis of hepatocyte multiome data with `MultiVI`

- **Developed by**: Carlos Talavera-López
- **Institute of Computational Biology - Computational Health Department - Helmholtz Munich**
- v220829 

### Import required packages 

In [1]:
import os

import numpy as np
import scanpy as sc
import scvi

# Set the scvi-tools global seed; the cell output confirms it with
# "Global seed set to 1712".
scvi.settings.seed = 1712

Global seed set to 0
Global seed set to 1712


In [2]:
# Inline figure settings: pass facecolor "w" to the figure-printing call and
# request the 'retina' figure format for inline output.
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

### Read in 10X multiome samples

In [3]:
# Folder holding the 10x multiome `filtered_feature_bc_matrix` output.
# The location can be overridden with the MULTIOME_DIR environment variable so the
# notebook is not tied to a single machine; the original path remains the default.
MULTIOME_DIR = os.environ.get(
    "MULTIOME_DIR",
    "/Users/carlos.lopez/Downloads/celia/filtered_feature_bc_matrix",
)

# Read both modalities (gene expression + peaks) into a single AnnData object.
adata = scvi.data.read_10x_multiome(MULTIOME_DIR)

# Loading emits a duplicate-`var`-names warning, so de-duplicate the feature names.
adata.var_names_make_unique()
adata

  return AnnData(data.tocsr(), var=features, obs=cell_annot)
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 5020 × 146307
    obs: 'batch_id'
    var: 'ID', 'modality', 'chr', 'start', 'end'

### Split into three datasets by modality (RNA, ATAC, multiome), deliberately removing one modality from two of the subsets to create single-modality data

In [10]:
# Size of each of the first two cell groups: one third of the cells, rounded down.
# Derived from the data instead of hard-coded (5020 // 3 == 1673 for this dataset),
# so the split still works if the input matrix changes.
n = adata.n_obs // 3

# Boolean masks over features, one per modality.
is_gene = adata.var["modality"] == "Gene Expression"
is_peak = adata.var["modality"] == "Peaks"

# First third of the cells: keep only gene-expression features (simulated RNA-only data).
adata_rna = adata[:n, is_gene].copy()
# Second third: keep every feature (truly paired multiome data).
adata_paired = adata[n:2 * n].copy()
# Remaining cells: keep only peak features (simulated ATAC-only data).
adata_atac = adata[2 * n:, is_peak].copy()

In [11]:
# Show the summary of the RNA-only subset (cells x gene-expression features).
adata_rna

AnnData object with n_obs × n_vars = 1673 × 22582
    obs: 'batch_id'
    var: 'ID', 'modality', 'chr', 'start', 'end'

In [12]:
# Show the summary of the ATAC-only subset (cells x peak features).
adata_atac

AnnData object with n_obs × n_vars = 1674 × 123725
    obs: 'batch_id'
    var: 'ID', 'modality', 'chr', 'start', 'end'

### Concatenate the three AnnData objects with `scvi.data.organize_multiome_anndatas`

In [13]:
# Combine the paired, RNA-only and ATAC-only subsets into one AnnData object.
# The result gains a 'modality' column in `.obs` (see the summary printed below).
adata_mvi = scvi.data.organize_multiome_anndatas(
    multi_anndata=adata_paired,
    rna_anndata=adata_rna,
    atac_anndata=adata_atac,
)
adata_mvi

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


AnnData object with n_obs × n_vars = 5020 × 146307
    obs: 'batch_id', 'modality'
    var: 'ID', 'modality', 'chr', 'start', 'end'

In [14]:
# Inspect the per-cell annotations ('batch_id' and 'modality') of the combined object.
adata_mvi.obs

Unnamed: 0,batch_id,modality
CCGTTACTCGCAATCG_paired,1,paired
CCGTTACTCTTTGAGA_paired,1,paired
CCGTTATGTTGCAGTA_paired,1,paired
CCGTTTGGTGCTGTAA_paired,1,paired
CCGTTTGGTTTCCTCC_paired,1,paired
...,...,...
TTTGTGTTCCCGTTAC_accessibility,1,accessibility
TTTGTGTTCCGGTTGA_accessibility,1,accessibility
TTTGTTGGTCAGTAAT_accessibility,1,accessibility
TTTGTTGGTTTCCTCC_accessibility,1,accessibility


In [15]:
# Reorder the feature axis so that features are grouped by modality
# ("Gene Expression" first, then "Peaks", as the table below shows).
feature_order = adata_mvi.var["modality"].argsort()
adata_mvi = adata_mvi[:, feature_order].copy()
adata_mvi.var

Unnamed: 0,ID,modality,chr,start,end
Gm47985,ENSMUSG00000114212,Gene Expression,1,151058287,151058288
Ccr4,ENSMUSG00000047898,Gene Expression,9,114325105,114325630
Tmppe,ENSMUSG00000079260,Gene Expression,9,114230172,114230173
Glb1,ENSMUSG00000045594,Gene Expression,9,114230143,114230144
Crtap,ENSMUSG00000032431,Gene Expression,9,114219742,114219743
...,...,...,...,...,...
15:66615073-66615937,15:66615073-66615937,Peaks,15,66615073,66615937
15:66614085-66614811,15:66614085-66614811,Peaks,15,66614085,66614811
15:66601830-66602407,15:66601830-66602407,Peaks,15,66601830,66602407
15:65718038-65718943,15:65718038-65718943,Peaks,15,65718038,65718943


### Filter out features detected in fewer than 1% of cells

In [16]:
# Minimum number of cells a feature must be detected in: 1% of all cells, truncated.
min_cells_required = int(adata_mvi.n_obs * 0.01)

# Report the shape before and after filtering to show how many features were dropped.
print(adata_mvi.shape)
# Despite its name, filter_genes acts on every column of `.var`, i.e. genes and peaks.
sc.pp.filter_genes(adata_mvi, min_cells=min_cells_required)
print(adata_mvi.shape)

(5020, 146307)
(5020, 106501)


### Set up and training

In [17]:
# Register `adata_mvi` with MultiVI, using the 'modality' column of `.obs`
# (paired / expression / accessibility) as the batch key.
scvi.model.MULTIVI.setup_anndata(adata_mvi, batch_key = 'modality')

In [18]:
# Count the features of each modality that remain after filtering.
feature_modality = adata_mvi.var['modality']
n_gene_features = (feature_modality == 'Gene Expression').sum()
n_peak_features = (feature_modality == 'Peaks').sum()

# Build the MultiVI model, telling it how many features are genes and how many are peaks.
mvi = scvi.model.MULTIVI(
    adata_mvi,
    n_genes=n_gene_features,
    n_regions=n_peak_features,
)
# Print the registered AnnData setup for a quick sanity check.
mvi.view_anndata_setup()

In [19]:
# Train MultiVI with the library's default training arguments (none are overridden).
# In the recorded run no GPU was available and the first epoch of the 500-epoch
# schedule took about 110 s, so a full CPU run is a matter of hours.
mvi.train()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_deprecation(


Epoch 1/500:   0%|          | 0/500 [00:00<?, ?it/s]

  x = torch.where(mask_expr.T, x_expr.T, x_acc.T).T


Epoch 2/500:   0%|          | 1/500 [01:49<15:14:26, 109.95s/it, loss=1.85e+04, v_num=1]

### Save and load `MultiVI` model

In [None]:
# Directory (relative to the working directory) where the trained model is stored.
MODEL_DIR = "trained_multivi"

# overwrite=True lets the notebook be re-run end to end: without it, saving fails
# once the directory exists from a previous run.
mvi.save(MODEL_DIR, overwrite=True)

# Reload the saved model against the same AnnData to confirm the round trip works.
mvi = scvi.model.MULTIVI.load(MODEL_DIR, adata=adata_mvi)

### Visualise latent space 

In [None]:
# Store the MultiVI latent representation of every cell under its own `.obsm` key.
latent_embedding = mvi.get_latent_representation()
adata_mvi.obsm["MultiVI_latent"] = latent_embedding

# Build the neighbour graph on the latent space, then embed it with UMAP.
sc.pp.neighbors(adata_mvi, use_rep="MultiVI_latent")
sc.tl.umap(adata_mvi, min_dist=0.2)

# Colour cells by modality (paired / expression / accessibility).
sc.pl.umap(adata_mvi, color='modality')