## Import

In [1]:
import os
import scanpy as sc
import pandas as pd
import scvi 

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter may also hide library warnings that matter here
# (e.g. anything scvi-tools emits while validating the AnnData on model load) —
# consider narrowing it to specific categories/modules.
warnings.filterwarnings("ignore")

## Load

In [2]:
# Single-cell AnnData that still has to be corrected (input of this notebook).
sc_input_path = '../input data/sc_imputation_to_be_corrected.h5ad'
sc_adata = sc.read_h5ad(sc_input_path)

In [3]:
# Tumor AnnData from the atlas, read from the tumor scVI model directory.
# It serves as the reference for the 4000 highly variable genes (hvg).
tumor_adata_path = "../../../../../projects/2022/CRCA/results/v1/build_atlas/integrate_datasets/scvi/seed/tumor-scvi_model/adata.h5ad"
tumor_adata = sc.read_h5ad(tumor_adata_path)

## Filter for the 4000 hvg

In [19]:
# Ensembl gene IDs of the single-cell data and of the tumor atlas, as pandas Indexes.
sc_ens, tumor_ens = (
    pd.Index(adata.var['ensembl']) for adata in (sc_adata, tumor_adata)
)

# IDs present in both objects; shown as the cell output.
common_ensembl = sc_ens.intersection(tumor_ens)
common_ensembl

Index(['ENSG00000187634', 'ENSG00000188290', 'ENSG00000187608',
       'ENSG00000186891', 'ENSG00000186827', 'ENSG00000184163',
       'ENSG00000162576', 'ENSG00000179403', 'ENSG00000189409',
       'ENSG00000187730',
       ...
       'ENSG00000198712', 'ENSG00000198899', 'ENSG00000198938',
       'ENSG00000198840', 'ENSG00000212907', 'ENSG00000198886',
       'ENSG00000198786', 'ENSG00000198695', 'ENSG00000198727',
       'ENSG00000275063'],
      dtype='object', name='ensembl', length=4000)

In [20]:
# Keep only the genes shared with the tumor atlas, arranged in the atlas' gene order.
# A boolean `isin` mask would keep `sc_adata`'s own column order; scvi-tools requires
# the genes to be the same and in the same order as in the AnnData used for training.
# NOTE(review): assumes `tumor_adata` (read from the model directory) is the AnnData
# the model was trained on — confirm.
sc_gene_ids = pd.Index(sc_adata.var['ensembl'])
if not sc_gene_ids.is_unique:
    # Positional alignment by ID is ambiguous when an ID occurs more than once.
    raise ValueError(
        "sc_adata.var['ensembl'] contains duplicated IDs; cannot align genes to the tumor atlas order"
    )

# Atlas IDs restricted to the shared genes, preserving the atlas order.
atlas_gene_ids = pd.Index(tumor_adata.var['ensembl'])
atlas_gene_ids = atlas_gene_ids[atlas_gene_ids.isin(common_ensembl)]

# Column positions in `sc_adata` for each atlas gene; every ID is guaranteed to be
# found because `common_ensembl` only contains IDs present in `sc_adata`.
sc_adata_filtered = sc_adata[:, sc_gene_ids.get_indexer(atlas_gene_ids)].copy()

In [22]:
# (number of cells, number of genes) after restricting to the shared genes.
(sc_adata_filtered.n_obs, sc_adata_filtered.n_vars)

(1392731, 4000)

## Load scvi model

In [30]:
# Expose the sample identifier under the `batch` column name, which is the key used
# below both to compare against `tumor_adata.obs['batch']` and as the scVI batch key.
sc_adata_filtered.obs['batch'] = sc_adata_filtered.obs['sample_id'].copy()

In [41]:
# Copy the `dataset` annotation into a `gene_dispersion_label` column.
# NOTE(review): presumably the saved scVI model's setup expects a column with this
# name when the AnnData is validated on load — confirm against the model registry.
sc_adata_filtered.obs['gene_dispersion_label'] = sc_adata_filtered.obs['dataset'].copy()

In [42]:
# Batches present in the single-cell data but absent from the tumor atlas.
# For some reason this sample was not used while training the model, so it has to be removed.
set(sc_adata_filtered.obs['batch'].unique()).difference(
    tumor_adata.obs['batch'].unique()
)

{'Zhang_2020_10X_CD45Pos.T_P0305'}

In [43]:
# Drop the sample that is missing from the tumor atlas and keep an independent copy
# of the remaining cells for use with the scVI model.
excluded_sample = "Zhang_2020_10X_CD45Pos.T_P0305"
is_kept_cell = sc_adata_filtered.obs["batch"] != excluded_sample
adata_scvi = sc_adata_filtered[is_kept_cell].copy()

In [44]:
# Load the trained tumor scVI model and attach the filtered single-cell AnnData to it.
tumor_model_dir = "../../../../..//projects/2022/CRCA/results/v1/build_atlas/integrate_datasets/scvi/seed/tumor-scvi_model/"
model = scvi.model.SCVI.load(tumor_model_dir, adata=adata_scvi)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.


[34mINFO    [0m File                                                                                                      
         ..[35m/../../../../[0m[35m/projects/2022/CRCA/results/v1/build_atlas/integrate_datasets/scvi/seed/tumor-scvi_model/[0m[95mmo[0m
         [95mdel.pt[0m already downloaded                                                                                 


In [45]:
# Show the loaded model's representation as the cell output.
model



## Counts correction

In [46]:
# `scvi.model.SCVI.load(..., adata=adata_scvi)` has already validated and registered
# `adata_scvi` against the setup saved with the model, so `setup_anndata` must not be
# run again here: a fresh `setup_anndata(adata_scvi, batch_key="batch")` would rebuild
# the categorical encodings from this AnnData alone (and without any other field the
# model was trained with), which can disagree with the encodings the model expects.
# Instead, just display the registration the model is using for this AnnData.
model.view_anndata_setup(adata_scvi)

In [48]:
# Model-based normalized expression for every cell, conditioned on a single batch
# (`transform_batch`) and scaled to a library size of 10000; the mean estimate is
# stored in the `corrected_counts` layer.
reference_batch = "Borras_2023_KUL5_CD45Pos.EXT097"  # (randomly chosen)
adata_scvi.layers["corrected_counts"] = model.get_normalized_expression(
    adata=adata_scvi,
    transform_batch=reference_batch,
    library_size=10_000,
    return_mean=True,
)

## Save

In [50]:
# Persist the AnnData, including the `corrected_counts` layer, as an .h5ad file.
corrected_output_path = '../input data/sc_imputation.h5ad'
adata_scvi.write_h5ad(corrected_output_path)

In [None]:
# TODO: remember to change this in NF to use corrected_counts