## Set path

In [1]:
import os

# Repository checkout containing the moETM package, relative to the directory the
# notebook is launched from.
MOETM_REPO_DIR = os.path.normpath("code/moETM/moETM-main/")

# Only change directory when we are not already inside the repository. A plain
# relative os.chdir() makes this cell fail (or resolve a different outputs_dir)
# when it is executed a second time in the same kernel.
if not os.getcwd().endswith(os.sep + MOETM_REPO_DIR):
    os.chdir(MOETM_REPO_DIR)

# The outputs folder sits three levels above the repository directory.
outputs_dir = os.path.abspath(os.path.join(os.getcwd(), '../../../outputs'))
# Experiment directory used by the rest of the notebook.
save_dir = os.path.join(outputs_dir, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/moETM")

## Load necessary libraries

In [2]:
# Project-local moETM entry points: trainer class, training loop and model builder.
from moETM.train import Trainer_moETM_for_cross_prediction, Train_moETM_for_cross_prediction
from moETM.build_model import build_moETM
# NOTE(review): `os` is already imported in the path-setup cell; this re-import is harmless.
import os
import numpy as np
# Set before `import torch` below so that only GPU 0 is visible to CUDA in this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import warnings
# Silences every Python warning for the rest of the session (including numerical
# warnings such as division by zero), so problems will not be reported as warnings.
warnings.filterwarnings('ignore')
import anndata
import torch
import scanpy as sc

  from .autonotebook import tqdm as notebook_tqdm


## Load data

In [3]:
# Load the paired training/test AnnData files (mod1 = gene expression,
# mod2 = protein expression) from the experiment directory.
# `anndata.read_h5ad` is used because the `anndata.read` alias is deprecated.
data_dir = os.path.join(outputs_dir, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/moETM")
train_adata_mod1 = anndata.read_h5ad(os.path.join(data_dir, "train_gene_expression_data.h5ad"))
train_adata_mod2 = anndata.read_h5ad(os.path.join(data_dir, "train_protein_expression_data.h5ad"))
test_adata_mod1 = anndata.read_h5ad(os.path.join(data_dir, "test_gene_expression_data.h5ad"))
test_adata_mod2 = anndata.read_h5ad(os.path.join(data_dir, "test_protein_expression_data.h5ad"))
train_adata_mod1, train_adata_mod2, test_adata_mod1, test_adata_mod2

(AnnData object with n_obs × n_vars = 9264 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode',
 AnnData object with n_obs × n_vars = 9264 × 110
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'protein_name',
 AnnData object with n_obs × n_vars = 7564 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode',
 AnnData object with n_obs × n_vars = 7564 × 110
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id'

## Store the batch information in the *batch_indices* column of `obs`
### The batch information must be encoded as integers. When the dataset does not contain batch information, the *batch_indices* column should be all zeros; here every cell is assigned batch 0.

In [4]:
# One all-zero batch vector per split (one entry per cell), shared by both modalities.
train_batch_index = np.zeros(train_adata_mod1.shape[0])
test_batch_index = np.zeros(test_adata_mod1.shape[0])

# Write the vector of each split into the obs table of its two AnnData objects.
for _split_adatas, _split_batch_index in (
    ((train_adata_mod1, train_adata_mod2), train_batch_index),
    ((test_adata_mod1, test_adata_mod2), test_batch_index),
):
    for _adata in _split_adatas:
        _adata.obs['batch_indices'] = _split_batch_index

train_batch_index, test_batch_index

(array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]))

## Merge the gene expression data from the training and test sets and select highly variable genes

In [5]:
# Pool the training and test gene expression so that highly variable genes (HVGs)
# are chosen on the combined data. `anndata.concat` replaces the deprecated
# `AnnData.concatenate` method; the join and index options are unchanged.
adata_mod1 = anndata.concat([train_adata_mod1, test_adata_mod1], join='outer', index_unique=None)

# Library-size normalisation and log transform are applied to the pooled copy only;
# the original train/test objects keep their original values.
sc.pp.normalize_total(adata_mod1, target_sum=1e4)
sc.pp.log1p(adata_mod1)
sc.pp.highly_variable_genes(adata_mod1)
index = adata_mod1.var['highly_variable'].values

# Select the HVGs by gene name instead of applying the boolean mask positionally:
# this does not rely on the pooled object having the same gene order as the
# original objects, and guarantees train and test share the same column order.
hvg_names = adata_mod1.var_names[index]
train_adata_mod1 = train_adata_mod1[:, hvg_names]
test_adata_mod1 = test_adata_mod1[:, hvg_names]

train_adata_mod1, test_adata_mod1

(View of AnnData object with n_obs × n_vars = 9264 × 1633
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode',
 View of AnnData object with n_obs × n_vars = 7564 × 1633
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode')

## Normalize the gene expression and protein expression data of the training set

In [6]:
# Densify the training matrices and fetch the batch labels.
X_mod1 = np.array(train_adata_mod1.X.todense())
X_mod2 = np.array(train_adata_mod2.X.todense())
batch_index = np.array(train_adata_mod1.obs['batch_indices'])

# Per-cell totals as column vectors so they broadcast across features.
row_sum_mod1 = X_mod1.sum(1)[:, np.newaxis]
row_sum_mod2 = X_mod2.sum(1)[:, np.newaxis]
# A cell whose total is zero would produce 0/0 = NaN (silently, since warnings are
# suppressed in this notebook). Dividing by 1 instead keeps such rows all-zero.
row_sum_mod1[row_sum_mod1 == 0] = 1
row_sum_mod2[row_sum_mod2 == 0] = 1

# Scale every cell so that its features sum to 1.
X_mod1 = X_mod1 / row_sum_mod1
X_mod2 = X_mod2 / row_sum_mod2

X_mod1, X_mod2

(array([[0.        , 0.        , 0.        , ..., 0.        , 0.00530973,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]], dtype=float32),
 array([[0.03293135, 0.00278293, 0.00278293, ..., 0.00185529, 0.00046382,
         0.03107607],
        [0.00400572, 0.00028612, 0.00057225, ..., 0.00057225, 0.00114449,
         0.00572246],
        [0.05660377, 0.00157233, 0.00471698, ..., 0.00078616, 0.00235849,
         0.00471698],
        ...,
        [0.02497162, 0.00113507, 0.00605373, ..., 0.00075672, 0.00264851,
         0.00870223

## Convert the training data to torch tensors

In [7]:
# Training matrices as float32 CUDA tensors, batch labels as int64 CUDA tensor.
X_mod1_train_T = torch.from_numpy(X_mod1).to(torch.float32).cuda()
X_mod2_train_T = torch.from_numpy(X_mod2).to(torch.float32).cuda()
batch_index_train_T = torch.from_numpy(batch_index).long().cuda()

X_mod1_train_T, X_mod2_train_T, batch_index_train_T

(tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0053, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
        device='cuda:0'),
 tensor([[0.0329, 0.0028, 0.0028,  ..., 0.0019, 0.0005, 0.0311],
         [0.0040, 0.0003, 0.0006,  ..., 0.0006, 0.0011, 0.0057],
         [0.0566, 0.0016, 0.0047,  ..., 0.0008, 0.0024, 0.0047],
         ...,
         [0.0250, 0.0011, 0.0061,  ..., 0.0008, 0.0026, 0.0087],
         [0.0125, 0.0053, 0.0019,  ..., 0.0005, 0.0010, 0.0068],
         [0.0118, 0.0012, 0.0006,  ..., 0.0006, 0.0012, 0.0065]],
        device='cuda:0'),
 tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0'))

## Normalize the gene expression and protein expression data of the test set

In [8]:
# Densify the test matrices and fetch the batch labels.
X_mod1 = np.array(test_adata_mod1.X.todense())
X_mod2 = np.array(test_adata_mod2.X.todense())
batch_index = np.array(test_adata_mod1.obs['batch_indices'])

# Raw per-cell totals. These are kept unmodified because a later cell forwards
# them to the training routine as test_mod1_sum / test_mod2_sum.
sum1 = X_mod1.sum(1)
sum2 = X_mod2.sum(1)

# Denominators for the normalisation: copies of the totals (as column vectors) in
# which zero totals are replaced by 1, so all-zero cells stay zero instead of
# silently becoming NaN (0/0).
denom_mod1 = sum1[:, np.newaxis].copy()
denom_mod2 = sum2[:, np.newaxis].copy()
denom_mod1[denom_mod1 == 0] = 1
denom_mod2[denom_mod2 == 0] = 1

# Scale every cell so that its features sum to 1.
X_mod1 = X_mod1 / denom_mod1
X_mod2 = X_mod2 / denom_mod2

X_mod1, X_mod2

(array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.00645161, 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.00212766]], dtype=float32),
 array([[0.02182673, 0.00167898, 0.00100739, ..., 0.00067159, 0.00100739,
         0.01040967],
        [0.03245887, 0.00177857, 0.00177857, ..., 0.00044464, 0.00177857,
         0.0053357 ],
        [0.01314252, 0.00175234, 0.00087617, ..., 0.00146028, 0.00087617,
         0.01226636],
        ...,
        [0.06713589, 0.00108284, 0.00324851, ..., 0.00054142, 0.00162426,
         0.00433135

## Convert the test data to torch tensors

In [9]:
# Test matrices as float32 CUDA tensors, batch labels as int64 CUDA tensor.
X_mod1_test_T = torch.from_numpy(X_mod1).to(torch.float32).cuda()
X_mod2_test_T = torch.from_numpy(X_mod2).to(torch.float32).cuda()
batch_index_test_T = torch.from_numpy(batch_index).long().cuda()

X_mod1_test_T, X_mod2_test_T, batch_index_test_T

(tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0065, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0021]],
        device='cuda:0'),
 tensor([[0.0218, 0.0017, 0.0010,  ..., 0.0007, 0.0010, 0.0104],
         [0.0325, 0.0018, 0.0018,  ..., 0.0004, 0.0018, 0.0053],
         [0.0131, 0.0018, 0.0009,  ..., 0.0015, 0.0009, 0.0123],
         ...,
         [0.0671, 0.0011, 0.0032,  ..., 0.0005, 0.0016, 0.0043],
         [0.0106, 0.0032, 0.0021,  ..., 0.0021, 0.0014, 0.0096],
         [0.0130, 0.0004, 0.0009,  ..., 0.0002, 0.0004, 0.0070]],
        device='cuda:0'),
 tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0'))

In [10]:
# Keep the raw per-cell test totals under descriptive names ...
test_mod1_sum = sum1
test_mod2_sum = sum2
# ... and release the NumPy intermediates now that tensors have been built from them.
del X_mod1, X_mod2, batch_index

## Create model

In [11]:
# Seed the random number generators before the model is built so that repeated
# runs of the notebook start from the same state.
# NOTE(review): assumes moETM draws its randomness from NumPy / PyTorch - confirm
# against moETM.build_model and moETM.train.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Model dimensions derived from the training tensors.
num_batch = len(batch_index_train_T.unique())   # number of distinct batch labels
input_dim_mod1 = X_mod1_train_T.shape[1]        # features in modality 1
input_dim_mod2 = X_mod2_train_T.shape[1]        # features in modality 2
train_num = X_mod1_train_T.shape[0]             # number of training cells

# Hyper-parameters forwarded to build_moETM.
num_topic = 200
emd_dim = 400

encoder_mod1, encoder_mod2, decoder, optimizer = build_moETM(input_dim_mod1, input_dim_mod2, num_batch, num_topic=num_topic, emd_dim=emd_dim)
encoder_mod1, encoder_mod2, decoder, optimizer

(encoder(
   (f1): Linear(in_features=1633, out_features=128, bias=True)
   (act): ReLU()
   (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (dropout): Dropout(p=0.1, inplace=False)
   (mu): Linear(in_features=128, out_features=200, bias=True)
   (log_sigma): Linear(in_features=128, out_features=200, bias=True)
 ),
 encoder(
   (f1): Linear(in_features=110, out_features=128, bias=True)
   (act): ReLU()
   (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (dropout): Dropout(p=0.1, inplace=False)
   (mu): Linear(in_features=128, out_features=200, bias=True)
   (log_sigma): Linear(in_features=128, out_features=200, bias=True)
 ),
 decoder(),
 Adam (
 Parameter Group 0
     amsgrad: False
     betas: (0.9, 0.999)
     eps: 1e-08
     lr: 0.001
     maximize: False
     weight_decay: 0
 
 Parameter Group 1
     amsgrad: False
     betas: (0.9, 0.999)
     eps: 1e-08
     lr: 0.001
     maximize: False
     w

## Create trainer

In [12]:
# Move the three networks to the GPU before handing them to the trainer.
encoder_mod1 = encoder_mod1.cuda()
encoder_mod2 = encoder_mod2.cuda()
decoder = decoder.cuda()

# Prediction direction; the alternative value is 'another_to_rna'.
direction = 'rna_to_another'

trainer = Trainer_moETM_for_cross_prediction(
    encoder_mod1,
    encoder_mod2,
    decoder,
    optimizer,
    direction,
)
trainer

<moETM.train.Trainer_moETM_for_cross_prediction at 0x7fafbef7b2b0>

## Train model
### Imputation results will be saved in *save_dir*

In [13]:
# Training schedule.
Total_epoch = 500
batch_size = 2000

# Training inputs: modality-1 tensor, modality-2 tensor, batch labels.
Train_set = [
    X_mod1_train_T,
    X_mod2_train_T,
    batch_index_train_T,
]

# Test inputs: the same three tensors for the test split, plus the test AnnData
# object and the raw per-cell totals of both modalities.
Test_set = [
    X_mod1_test_T,
    X_mod2_test_T,
    batch_index_test_T,
    test_adata_mod1,
    test_mod1_sum,
    test_mod2_sum,
]

Train_moETM_for_cross_prediction(
    trainer, Total_epoch, train_num, batch_size, Train_set, Test_set, save_dir
)

[epoch 0 finished time 0.298075], Pearson_1=-0.0100, Spearmanr_1=-0.0191
[epoch 10 finished time 0.037285], Pearson_1=0.3995, Spearmanr_1=0.3317
[epoch 20 finished time 0.028025], Pearson_1=0.7309, Spearmanr_1=0.6190
[epoch 30 finished time 0.053087], Pearson_1=0.8434, Spearmanr_1=0.7408
[epoch 40 finished time 0.027536], Pearson_1=0.8633, Spearmanr_1=0.7693
[epoch 50 finished time 0.041431], Pearson_1=0.8705, Spearmanr_1=0.7784
[epoch 60 finished time 0.036843], Pearson_1=0.8709, Spearmanr_1=0.7775
[epoch 70 finished time 0.046859], Pearson_1=0.8655, Spearmanr_1=0.7697
[epoch 80 finished time 0.048313], Pearson_1=0.8621, Spearmanr_1=0.7639
[epoch 90 finished time 0.056830], Pearson_1=0.8567, Spearmanr_1=0.7578
[epoch 100 finished time 0.055963], Pearson_1=0.8570, Spearmanr_1=0.7592
[epoch 110 finished time 0.058033], Pearson_1=0.8533, Spearmanr_1=0.7567
[epoch 120 finished time 0.035613], Pearson_1=0.8357, Spearmanr_1=0.7369
[epoch 130 finished time 0.062719], Pearson_1=0.8331, Spearm

## Save model

In [14]:
# Write each network to save_dir as a whole module object (not a state_dict).
for _filename, _module in (
    ("encoder_mod1.pth", encoder_mod1),
    ("encoder_mod2.pth", encoder_mod2),
    ("decoder.pth", decoder),
):
    torch.save(_module, os.path.join(save_dir, _filename))