## Set input and output paths

In [1]:
import os
# All paths are resolved against the directory the notebook is launched from:
# inputs are read from ./datasets/, results are written under ./outputs/.
dataset_dir = os.path.join(os.getcwd(), 'datasets/')
outputs_dir = os.path.join(os.getcwd(), 'outputs/')
save_dir = os.path.join(outputs_dir, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/Babel/openproblems_bmmc_cite_phase2_rna/")

# makedirs creates every missing parent (including outputs_dir), and
# exist_ok=True makes the cell safe to re-run without a check-then-create race.
os.makedirs(save_dir, exist_ok=True)

## Load necessary libraries

In [2]:
import anndata
import numpy as np
import pandas as pd
import scanpy as sc

## Load data

In [3]:
# Mouse1 is used for training and Mouse2 for testing.
# `anndata.read` is a deprecated alias; `read_h5ad` is the supported reader for .h5ad files.
data_train = anndata.read_h5ad(os.path.join(dataset_dir, "different samples/CITE-SLN111-Gayoso/Mouse1.h5ad"))
data_test = anndata.read_h5ad(os.path.join(dataset_dir, "different samples/CITE-SLN111-Gayoso/Mouse2.h5ad"))
data_train, data_test

(AnnData object with n_obs × n_vars = 9264 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode'
     uns: 'protein_name', 'version'
     obsm: 'protein_expression',
 AnnData object with n_obs × n_vars = 7564 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode'
     uns: 'protein_name', 'version'
     obsm: 'protein_expression')

## Preprocess gene expression data and convert to AnnData format

In [4]:
# Percentage of values cut off at each tail when clipping the scaled matrix.
CLIP_TAIL_PERCENT = 0.5


def preprocess_rna(adata, clip_tail_percent=CLIP_TAIL_PERCENT):
    """Normalize, scale and clip ``adata.X`` in place, then wrap it in a new AnnData.

    The steps are applied to ``adata`` itself (not to a copy):

    1. ``sc.pp.normalize_total`` with ``key_added="size_factor"``.
    2. ``sc.pp.scale``.
    3. Clip every value to the ``[p, 100 - p]`` percentiles computed over the
       whole matrix of this dataset, where ``p = clip_tail_percent``.

    Parameters
    ----------
    adata : anndata.AnnData
        Dataset whose ``X`` holds gene expression; modified in place.
    clip_tail_percent : float
        Percentile (in percent) removed from each tail by the clipping step.

    Returns
    -------
    anndata.AnnData
        New object built from the processed ``X`` together with ``adata.var``
        and ``adata.obs``.
    """
    sc.pp.normalize_total(adata, key_added="size_factor")
    sc.pp.scale(adata)

    # np.percentile reduces over the flattened array when no axis is given, so
    # an explicit .flatten() would only add another full copy of the matrix.
    clip_low, clip_high = np.percentile(adata.X, [clip_tail_percent, 100.0 - clip_tail_percent])
    adata.X = np.clip(adata.X, clip_low, clip_high)

    return anndata.AnnData(X=adata.X, var=adata.var, obs=adata.obs)


# NOTE: data_train / data_test are modified in place, so this cell is not
# idempotent -- reload the data before running it a second time.
# Each dataset is scaled and clipped with its own statistics.
train_rna = preprocess_rna(data_train)
test_rna = preprocess_rna(data_test)

train_rna, test_rna

(AnnData object with n_obs × n_vars = 9264 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types', 'size_factor'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode', 'mean', 'std',
 AnnData object with n_obs × n_vars = 7564 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types', 'size_factor'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode', 'mean', 'std')

## Preprocess protein expression data and convert to AnnData format

In [5]:
def protein_counts_frame(adata):
    """Return ``adata.obsm["protein_expression"]`` as a dense, labelled DataFrame.

    Rows are indexed by ``adata.obs.index`` and columns are named after
    ``adata.uns["protein_name"]``.
    """
    return pd.DataFrame(
        # toarray() yields a plain ndarray instead of the discouraged np.matrix.
        adata.obsm["protein_expression"].toarray(),
        index=adata.obs.index,
        columns=adata.uns["protein_name"],
    )


def clr_normalize(counts):
    """Apply a per-row centred-log-ratio style transform to ``counts``.

    For every row ``x`` (one cell) the result is ``log1p(x / g)`` with
    ``g = exp(sum(log1p(x)) / n_columns)``.

    Parameters
    ----------
    counts : pandas.DataFrame
        Rows are cells, columns are proteins.

    Returns
    -------
    pandas.DataFrame
        Transformed values with the same index and columns as ``counts``.
    """
    # Compute in float64 whatever the stored dtype is, so the vectorized
    # version gives the same precision as an element-wise Python-level map.
    counts = counts.astype("float64")
    n_proteins = counts.shape[1]
    row_gmean = np.exp(np.log1p(counts).sum(axis=1) / n_proteins)
    return np.log1p(counts.div(row_gmean, axis=0))


train_protein_expression = protein_counts_frame(data_train)
test_protein_expression = protein_counts_frame(data_test)

# Vectorized NumPy ufuncs replace the deprecated, element-wise DataFrame.applymap.
train_protein_clr = clr_normalize(train_protein_expression)
test_protein_clr = clr_normalize(test_protein_expression)

# The same protein annotation table (taken from the training data) is used as
# `var` for both the train and the test object.
protein_name = pd.DataFrame(data_train.uns["protein_name"], index=data_train.uns["protein_name"], columns=["protein_name"])
train_protein = anndata.AnnData(X=train_protein_clr, var=protein_name, obs=data_train.obs)
test_protein = anndata.AnnData(X=test_protein_clr, var=protein_name, obs=data_test.obs)

train_protein, test_protein

(AnnData object with n_obs × n_vars = 9264 × 110
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types', 'size_factor'
     var: 'protein_name',
 AnnData object with n_obs × n_vars = 7564 × 110
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types', 'size_factor'
     var: 'protein_name')

## Save preprocessed data

In [6]:

# Each preprocessed object is written to save_dir as a gzip-compressed .h5ad
# file: mod1 files hold the RNA objects, mod2 files hold the protein objects.
outputs_to_write = (
    (train_rna, "openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod1.h5ad"),
    (test_rna, "openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod1.h5ad"),
    (train_protein, "openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod2.h5ad"),
    (test_protein, "openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod2.h5ad"),
)
for processed, file_name in outputs_to_write:
    processed.write(os.path.join(save_dir, file_name), compression="gzip")