## Set paths

In [1]:
import os
# Input and output locations, resolved relative to the current working directory.
dataset_dir = os.path.join(os.getcwd(), 'datasets/')
outputs_dir = os.path.join(os.getcwd(), 'outputs/')
# exist_ok=True makes the cell idempotent and avoids the check-then-create race
# of `if not os.path.exists(...)`.
os.makedirs(outputs_dir, exist_ok=True)

# Directory that receives the files written by this notebook.
save_dir = os.path.join(outputs_dir, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/moETM")
os.makedirs(save_dir, exist_ok=True)

## Load necessary libraries

In [2]:
import anndata
import pandas as pd

## Load data

In [3]:
# Load the two .h5ad files used as the training and test sets.
# `anndata.read` is a deprecated alias of `anndata.read_h5ad` (it emits a
# FutureWarning and is dropped in newer anndata releases), so call the
# explicit reader instead.
train_data = anndata.read_h5ad(os.path.join(dataset_dir, "different samples/CITE-SLN111-Gayoso/Mouse1.h5ad"))
test_data = anndata.read_h5ad(os.path.join(dataset_dir, "different samples/CITE-SLN111-Gayoso/Mouse2.h5ad"))
# Display both objects as the cell output.
train_data, test_data

(AnnData object with n_obs × n_vars = 9264 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode'
     uns: 'protein_name', 'version'
     obsm: 'protein_expression',
 AnnData object with n_obs × n_vars = 7564 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode'
     uns: 'protein_name', 'version'
     obsm: 'protein_expression')

## Split gene expression and protein expression into separate AnnData objects

In [4]:
# Pull the two matrices out of each combined object: the main matrix `.X`
# (gene expression) and the `obsm["protein_expression"]` entry.
train_gene_expression, test_gene_expression = train_data.X, test_data.X
train_protein_expression, test_protein_expression = (
    train_data.obsm["protein_expression"],
    test_data.obsm["protein_expression"],
)

# Variable annotation for the protein objects, built from the names stored in
# the training object's `uns`; the same table is reused for the test object.
protein_labels = train_data.uns["protein_name"]
protein_name = pd.DataFrame(protein_labels, index=protein_labels, columns=["protein_name"])

# One AnnData per modality and split; `obs` is taken from the matching source object.
train_rna = anndata.AnnData(X=train_gene_expression, obs=train_data.obs, var=train_data.var)
test_rna = anndata.AnnData(X=test_gene_expression, obs=test_data.obs, var=test_data.var)
train_protein = anndata.AnnData(X=train_protein_expression, obs=train_data.obs, var=protein_name)
test_protein = anndata.AnnData(X=test_protein_expression, obs=test_data.obs, var=protein_name)

# Write the four objects to `save_dir` as gzip-compressed .h5ad files.
for file_name, modality_data in (
    ("train_gene_expression_data.h5ad", train_rna),
    ("test_gene_expression_data.h5ad", test_rna),
    ("train_protein_expression_data.h5ad", train_protein),
    ("test_protein_expression_data.h5ad", test_protein),
):
    modality_data.write(os.path.join(save_dir, file_name), compression="gzip")

# Display the four objects as the cell output.
train_rna, test_rna, train_protein, test_protein

(AnnData object with n_obs × n_vars = 9264 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode',
 AnnData object with n_obs × n_vars = 7564 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode',
 AnnData object with n_obs × n_vars = 9264 × 110
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'protein_name',
 AnnData object with n_obs × n_vars = 7564 × 110
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id'