In [47]:
import os
import torch
import argparse
import warnings
import numpy as np
import anndata
from sklearn.preprocessing import OneHotEncoder
import scanpy as sc
import matplotlib.pyplot as plt
import pandas as pd
#import scDREAMER_train
from utils import *
from anndata import AnnData

In [48]:
def read_data(data_path, batch, cell_type, name, hvg=2000):

    Ann = sc.read_h5ad(data_path)
   
    b = Ann.obs[batch] #.to_list()
    batch_info = np.array([[i] for i in b]) 
    enc = OneHotEncoder(handle_unknown = 'ignore')

    enc.fit(batch_info.reshape(-1, 1))
    batch_info_enc = enc.transform(batch_info.reshape(-1, 1)).toarray()
    Ann.obsm[batch + '_encoded'] = pd.DataFrame(batch_info_enc).to_numpy()

    if cell_type is not None:
        
        labels = Ann.obs[cell_type].to_list()
        
        c = Ann.obs[cell_type]
        cell_info = np.array([[i] for i in c])

        enc.fit(cell_info.reshape(-1, 1))
        labels_enc = enc.transform(cell_info.reshape(-1, 1)).toarray()

        Ann.obsm[cell_type + "_encoded"] = pd.DataFrame(labels_enc).to_numpy()
       
    return Ann

In [54]:
data_path = "../Pan/Pancreas.h5ad"
batch = 'tech'
cell_type = 'celltype'
name = 'Pancreas'

adata = read_data(data_path, batch, cell_type, name)

In [46]:
np.unique(adata.obs['celltype'])

array(['acinar', 'activated_stellate', 'alpha', 'beta', 'delta', 'ductal',
       'endothelial', 'epsilon', 'gamma', 'macrophage', 'mast',
       'quiescent_stellate', 'schwann', 't_cell'], dtype=object)

In [50]:
adata

AnnData object with n_obs × n_vars = 16382 × 19093
    obs: 'tech', 'celltype', 'size_factors'
    obsm: 'tech_encoded', 'celltype_encoded'
    layers: 'counts'

In [55]:
sc.pp.subsample(adata, fraction = 0.001)

In [56]:
adata

AnnData object with n_obs × n_vars = 16 × 19093
    obs: 'tech', 'celltype', 'size_factors'
    obsm: 'tech_encoded', 'celltype_encoded'
    layers: 'counts'

In [57]:
ann = adata.copy()

In [64]:
pd.DataFrame(ann.X, index = ann.obs_names).iloc[:, :10].to_csv("test_anndata.csv")

In [65]:
ann.obs['tech']

HP1525301T2D_D11               smartseq2
human3_lib3.final_cell_0536      inDrop3
human1_lib3.final_cell_0018      inDrop1
D28-2_15                         celseq2
D3en4_63                          celseq
human3_lib1.final_cell_0779      inDrop3
D74_53                            celseq
HP1506401_D10                  smartseq2
human3_lib4.final_cell_0610      inDrop3
HP1526901T2D_N15               smartseq2
human1_lib2.final_cell_0189      inDrop1
human3_lib3.final_cell_0720      inDrop3
D172444_15                        celseq
D28-5_63                         celseq2
human1_lib2.final_cell_0591      inDrop1
human3_lib4.final_cell_0539      inDrop3
Name: tech, dtype: category
Categories (5, object): ['celseq', 'celseq2', 'inDrop1', 'inDrop3', 'smartseq2']

In [79]:
sc.pp.subsample(ann, fraction = 1)

In [80]:
pd.DataFrame(ann.X, index = ann.obs_names).iloc[:, :10].to_csv("test_anndata_shuffled_2.csv")

In [77]:
ann.obs['celltype']

D74_53                           beta
human1_lib3.final_cell_0018      beta
human1_lib2.final_cell_0189      beta
HP1506401_D10                  ductal
HP1525301T2D_D11                alpha
D28-5_63                       acinar
human3_lib4.final_cell_0610    acinar
human3_lib1.final_cell_0779     alpha
human3_lib4.final_cell_0539     alpha
human1_lib2.final_cell_0591     alpha
D172444_15                     ductal
human3_lib3.final_cell_0720     alpha
HP1526901T2D_N15                alpha
human3_lib3.final_cell_0536    acinar
D3en4_63                       ductal
D28-2_15                       acinar
Name: celltype, dtype: category
Categories (4, object): ['acinar', 'alpha', 'beta', 'ductal']

In [78]:
from sklearn.preprocessing import OneHotEncoder

b = ann.obs['tech'] #.to_list()
batch_info = np.array([[i] for i in b]) 
enc = OneHotEncoder(handle_unknown = 'ignore')

enc.fit(batch_info.reshape(-1, 1))
batch_info_enc = enc.transform(batch_info.reshape(-1, 1)).toarray()
ann.obsm[batch + '_encoded'] = pd.DataFrame(batch_info_enc).to_numpy()

In [81]:
ann.obsm[batch + '_encoded']

array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.]])

In [74]:
ann.obs['tech']

human3_lib3.final_cell_0536      inDrop3
D74_53                            celseq
human3_lib4.final_cell_0610      inDrop3
HP1526901T2D_N15               smartseq2
D28-5_63                         celseq2
D3en4_63                          celseq
human1_lib3.final_cell_0018      inDrop1
human1_lib2.final_cell_0591      inDrop1
human1_lib2.final_cell_0189      inDrop1
HP1506401_D10                  smartseq2
human3_lib4.final_cell_0539      inDrop3
human3_lib3.final_cell_0720      inDrop3
D28-2_15                         celseq2
HP1525301T2D_D11               smartseq2
human3_lib1.final_cell_0779      inDrop3
D172444_15                        celseq
Name: tech, dtype: category
Categories (5, object): ['celseq', 'celseq2', 'inDrop1', 'inDrop3', 'smartseq2']

In [None]:
import scvi
scvi.model.SCVI.setup_anndata(adata, layer = "counts", batch_key = batch)

In [None]:
model = scvi.model.SCVI(adata)
model.setup_anndata(adata,layer = "counts", batch_key = batch)
model.view_anndata_setup(adata)

In [8]:
from scvi.dataloaders import *
from scvi.dataloaders._ann_dataloader import AnnDataLoader
from scvi.data import *

In [2]:
import scvi

Global seed set to 0
  jax.tree_util.register_keypaths(


In [6]:
adata.layers['raw_counts'] = adata.X.copy()
scvi.data.setup_anndata(adata,
                       batch_key = batch,
                       labels_key = cell_type,
                       layer = 'raw_counts')

AttributeError: module 'scvi.data' has no attribute 'setup_anndata'

In [9]:
scvi.model.SCVI.setup_anndata(adata)
adata_manager = scvi.model.SCVI(adata).adata_manager





AttributeError: 'AnnData' object has no attribute 'adata'

In [16]:
ad1 = AnnDataLoader(adata_manager, shuffle = False, batch_size = 10)

data_batch = next(tensors for tensors in ad1)

In [10]:
#fields = [LayerField("counts", "raw_counts")]
adata_manager = AnnDataManager()
adata_manager.register_fields(adata)

In [18]:
ad1

<scvi.dataloaders._ann_dataloader.AnnDataLoader at 0x7f47209a9be0>

In [13]:
splitter = DataSplitter(adata_manager)

In [14]:
splitter.setup()
#train_dataloader = scvi.dataloaders.AnnDataLoader(adata_manager, shuffle = True, batch_size = 128)
train_dl = splitter.train_dataloader()

In [15]:
train_dl

<scvi.dataloaders._ann_dataloader.AnnDataLoader at 0x7f47209a79d0>

In [None]:
!python scDREAMER_run.py

In [19]:
adata

AnnData object with n_obs × n_vars = 16382 × 19093
    obs: 'tech', 'celltype', 'size_factors', '_scvi_batch', '_scvi_labels'
    uns: '_scvi_uuid', '_scvi_manager_uuid'
    obsm: 'tech_encoded', 'celltype_encoded'
    layers: 'counts', 'raw_counts'

In [26]:
adata.X

array([[0.       , 3.1511765, 0.       , ..., 0.       , 1.1095682,
        3.2738605],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 3.3839343, ..., 4.059979 , 0.       ,
        3.3839343],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [3.5026364, 2.6305318, 0.       , ..., 2.6305318, 0.       ,
        0.       ]], dtype=float32)

In [25]:
adata.obsm['tech_encoded']

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [27]:
sc.pp.subsample(adata, fraction = 1)

In [30]:
len(adata)

16382

In [28]:
adata.X

array([[0.        , 0.        , 0.        , ..., 2.6780741 , 0.        ,
        0.85423183],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 3.3239179 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.5292459 , 1.0094638 , 2.5541277 , ..., 3.9893825 , 0.        ,
        0.29923445],
       [0.        , 0.        , 0.        , ..., 3.4904613 , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [29]:
adata.obsm['tech_encoded']

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
data = np.log2(data+1)
scale = np.max(data)
data /= scale 

In [33]:
adata.X = np.log2(adata.X + 1)/np.max(adata.X)

In [38]:
adata.X[:,: 128].shape

(16382, 128)

In [40]:
adata.obsm['tech_encoded'][:, :128].shape

(16382, 9)

In [42]:
len(adata.obs['tech'].unique())

9