## Set path

In [1]:
import os
# Input and output locations are resolved relative to the current working
# directory, so the notebook must be launched from the project root.
dataset_dir = os.path.join(os.getcwd(), 'datasets/')
outputs_dir = os.path.join(os.getcwd(), 'outputs/')

# Folder that receives every file this notebook writes.
save_dir = os.path.join(outputs_dir, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/scMoGNN/")

# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(outputs_dir, exist_ok=True)
os.makedirs(save_dir, exist_ok=True)

## Load necessary libraries

In [2]:
import anndata
import pandas as pd
import scanpy as sc

  from .autonotebook import tqdm as notebook_tqdm


## Load data

In [3]:
# Load the two samples: Mouse1 is used as the training set and Mouse2 as the
# test set. `anndata.read_h5ad` is the explicit .h5ad reader; the bare
# `anndata.read` alias is deprecated.
data_train = anndata.read_h5ad(os.path.join(dataset_dir, "different samples/CITE-SLN111-Gayoso/Mouse1.h5ad"))
data_test = anndata.read_h5ad(os.path.join(dataset_dir, "different samples/CITE-SLN111-Gayoso/Mouse2.h5ad"))
data_train, data_test

(AnnData object with n_obs × n_vars = 9264 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode'
     uns: 'protein_name', 'version'
     obsm: 'protein_expression',
 AnnData object with n_obs × n_vars = 7564 × 13553
     obs: 'n_protein_counts', 'n_proteins', 'seurat_hash_id', 'batch_indices', 'hash_id', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types'
     var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_mean_variance', 'encode', 'hvg_encode'
     uns: 'protein_name', 'version'
     obsm: 'protein_expression')

## Cluster cells for cluster-wise scran normalization

In [4]:
def _leiden_groups_for_scran(adata):
    """Back up the raw matrix, then cluster the cells of ``adata`` in place.

    ``X`` is first copied to ``layers["counts"]`` so the untouched values stay
    available. ``X`` is then normalised per cell to 1e6 counts,
    log1p-transformed, reduced to 15 principal components, and clustered with
    Leiden at resolution 0.5; the labels are stored under the key ``groups``.
    """
    adata.layers["counts"] = adata.X.copy()

    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e6)
    sc.pp.log1p(adata)
    sc.pp.pca(adata, n_comps=15)
    sc.pp.neighbors(adata)
    sc.tl.leiden(adata, key_added='groups', resolution=0.5)


# The two samples are clustered independently of each other.
_leiden_groups_for_scran(data_train)
_leiden_groups_for_scran(data_test)

## Save clustering results

In [5]:
# Persist the per-cell annotation tables (which now carry the Leiden "groups"
# column) as comma-separated text, one file per sample.
train_metadata_path = os.path.join(save_dir, "train_cell_metadata.txt")
test_metadata_path = os.path.join(save_dir, "test_cell_metadata.txt")

train_metadata = data_train.obs
test_metadata = data_test.obs

train_metadata.to_csv(train_metadata_path)
test_metadata.to_csv(test_metadata_path)

# Show both tables as the cell output.
train_metadata, test_metadata

(                    n_protein_counts  n_proteins seurat_hash_id  \
 index                                                             
 AAACCCAAGGGTAATT-1            2319.0         100         Spleen   
 AAACCCAAGGTAAACT-1            3760.0         105         Spleen   
 AAACCCACACTAGGTT-1            1351.0         104         Spleen   
 AAACCCACAGATACCT-1            3341.0         102     Lymph_Node   
 AAACCCACAGGAATAT-1            3708.0         102     Lymph_Node   
 ...                              ...         ...            ...   
 TTTGTTGGTAGTCACT-1            3077.0         102         Spleen   
 TTTGTTGGTGGTAATA-1            2739.0         103         Spleen   
 TTTGTTGGTTTACGTG-1            2755.0         102         Spleen   
 TTTGTTGTCATGAAAG-1            2258.0          99     Lymph_Node   
 TTTGTTGTCTCCGATC-1            1789.0         104         Spleen   
 
                     batch_indices     hash_id  n_genes  percent_mito  \
 index                                   

## Export raw gene expression counts as comma-separated text files

In [6]:
# Build genes x cells tables from the raw counts saved in layers["counts"]
# (copied from X before normalisation).
# AnnData.to_df returns a cells x genes DataFrame labelled with obs/var names
# and densifies sparse layers itself, so it also works when the layer is
# already dense (where calling .todense() would fail) and avoids np.matrix.
raw_gene_expression_train = data_train.to_df(layer="counts").T
raw_gene_expression_test = data_test.to_df(layer="counts").T

# DataFrame.to_csv writes comma-separated text; only the file extension is .txt.
raw_gene_expression_train.to_csv(os.path.join(save_dir, "train_raw_gene_expression.txt"))
raw_gene_expression_test.to_csv(os.path.join(save_dir, "test_raw_gene_expression.txt"))

raw_gene_expression_train, raw_gene_expression_test

(index          AAACCCAAGGGTAATT-1  AAACCCAAGGTAAACT-1  AAACCCACACTAGGTT-1  \
 index                                                                       
 0610007P14Rik                 2.0                 1.0                 0.0   
 0610009B22Rik                 0.0                 0.0                 0.0   
 0610009L18Rik                 0.0                 0.0                 0.0   
 0610009O20Rik                 0.0                 0.0                 0.0   
 0610010F05Rik                 0.0                 0.0                 0.0   
 ...                           ...                 ...                 ...   
 mt-Nd3                       19.0                 8.0                 3.0   
 mt-Nd4                       43.0                24.0                12.0   
 mt-Nd4l                       4.0                 1.0                 1.0   
 mt-Nd5                        5.0                 5.0                 2.0   
 mt-Nd6                        0.0                 0.0          