## Set path

In [1]:
import os
# Input datasets and generated outputs both live under the current working directory.
dataset_dir = os.path.join(os.getcwd(), 'datasets/')
outputs_dir = os.path.join(os.getcwd(), 'outputs/')
# Destination folder for the prepared .h5ad files written at the end of the notebook.
save_dir = os.path.join(outputs_dir, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/scMoGNN/openproblems_bmmc_cite_phase2_rna")
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race.
os.makedirs(save_dir, exist_ok=True)

## Load necessary libraries

In [2]:
import anndata
import pandas as pd
import scanpy as sc

  from .autonotebook import tqdm as notebook_tqdm


## Load data

In [3]:
# Source .h5ad datasets: Mouse1 is used as the training set and Mouse2 as the test set.
# read_h5ad is the explicit .h5ad reader; anndata.read is a deprecated alias for it.
data_train = anndata.read_h5ad(os.path.join(dataset_dir, "different samples/CITE-SLN111-Gayoso/Mouse1.h5ad"))
data_test = anndata.read_h5ad(os.path.join(dataset_dir, "different samples/CITE-SLN111-Gayoso/Mouse2.h5ad"))


def _read_scmognn_table(file_name):
    """Read one space-delimited text table from the scMoGNN output folder.

    Parameters
    ----------
    file_name : str
        Name of the file inside
        ``outputs/different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/scMoGNN``.

    Returns
    -------
    pandas.DataFrame
        The table, with the first column of the file used as the row index.
    """
    return pd.read_table(
        os.path.join(outputs_dir, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/scMoGNN", file_name),
        delimiter=" ",
        index_col=0,
    )


train_normalized_gene_expression = _read_scmognn_table("train_normalized_gene_expression.txt")
test_normalized_gene_expression = _read_scmognn_table("test_normalized_gene_expression.txt")

# The protein tables are transposed after loading; the gene expression tables are used as stored.
train_protein_clr = _read_scmognn_table("train_protein_clr.txt").T
test_protein_clr = _read_scmognn_table("test_protein_clr.txt").T

data_train, data_test, train_normalized_gene_expression, test_normalized_gene_expression, train_protein_clr, test_protein_clr

(AnnData object with n_obs × n_vars = 9264 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode'
     uns: 'protein_name', 'version'
     obsm: 'protein_expression',
 AnnData object with n_obs × n_vars = 7564 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode'
     uns: 'protein_name', 'version'
     obsm: 'protein_expression',
                     0610007P14Rik  0610009B22Rik  0610009L18Rik  \
 AAACCCAAGGGTAATT.1       0.616710       0.000000            0.0   
 AAACCCAAGGTAAACT.1       0.561007       0.000000            0.0   
 AAACCCACACTAGGTT.1     

## Convert the training and test gene expression data into separate AnnData objects

In [4]:
# Wrap each normalized expression matrix in an AnnData object, taking the gene (var)
# and cell (obs) annotations from the matching source dataset.
# NOTE(review): only the raw values are passed, so rows and columns are matched to the
# annotations by position, not by label; confirm the text tables keep the source ordering.
train_rna = anndata.AnnData(
    X=train_normalized_gene_expression.to_numpy(),
    obs=data_train.obs,
    var=data_train.var,
)
test_rna = anndata.AnnData(
    X=test_normalized_gene_expression.to_numpy(),
    obs=data_test.obs,
    var=data_test.var,
)
train_rna, test_rna

(AnnData object with n_obs × n_vars = 9264 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode',
 AnnData object with n_obs × n_vars = 7564 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode')

## Convert the training and test protein expression data into separate AnnData objects

In [5]:
# Protein annotation table: one row per protein, indexed by the protein name and
# carrying that same name in its single "protein_name" column.
protein_labels = data_train.uns["protein_name"]
protein_name = pd.DataFrame(protein_labels, index=protein_labels, columns=["protein_name"])

# Both splits reuse the protein names stored with the training dataset.
# NOTE(review): the CLR matrices are attached by position; confirm their column order
# (and the test dataset's protein order) matches data_train.uns["protein_name"].
train_protein = anndata.AnnData(
    X=train_protein_clr.to_numpy(),
    obs=data_train.obs,
    var=protein_name,
)
test_protein = anndata.AnnData(
    X=test_protein_clr.to_numpy(),
    obs=data_test.obs,
    var=protein_name,
)
train_protein, test_protein

(AnnData object with n_obs × n_vars = 9264 × 110
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'protein_name',
 AnnData object with n_obs × n_vars = 7564 × 110
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'protein_name')

## Save prepared data

In [6]:
# Output file name -> AnnData object, listed in write order (mod1 = RNA, mod2 = protein).
prepared_outputs = {
    "openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod1.h5ad": train_rna,
    "openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod1.h5ad": test_rna,
    "openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod2.h5ad": train_protein,
    "openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod2.h5ad": test_protein,
}
# Write every prepared object into save_dir as a gzip-compressed .h5ad file.
for output_name, prepared_adata in prepared_outputs.items():
    prepared_adata.write(os.path.join(save_dir, output_name), compression="gzip")