## Requirements

In [1]:
from multiprocessing import cpu_count
from google.colab import drive
from os.path import join
from os.path import exists
# Expose the user's Google Drive inside the Colab runtime.
drive.mount('/content/drive')

# Directories: every project folder sits directly under the Drive project root.
working_directory_path = '/content/drive/My Drive/Project'
(
    assets_directory,
    preprocessed_directory,
    imputed_directory,
    h5ad_directory,
) = (
    join(working_directory_path, folder)
    for folder in ("Assets", "Preprocessed", "Imputed", "H5AD")
)

# Config files: JSON lookup tables stored in the assets folder.
file_uniprot2taxid_json, file_taxid2ranks_json = (
    join(assets_directory, mapping_file)
    for mapping_file in ("uniprot2taxid.json", "taxid2ranks.json")
)
# Currently disabled:
# file_metadata_json = join(assets_directory, "metadata.json")

# Sanity checks: one True/False line per folder (assets, preprocessed, H5AD,
# imputed - in that order), followed by the CPU count of the runtime.
for directory in (assets_directory, preprocessed_directory, h5ad_directory, imputed_directory):
    print(exists(directory))
print("CPU:", cpu_count())

Mounted at /content/drive
True
True
True
True
CPU: 2


In [2]:
!pip install scanpy anndata

Collecting scanpy
  Downloading scanpy-1.11.4-py3-none-any.whl.metadata (9.2 kB)
Collecting anndata
  Downloading anndata-0.12.1-py3-none-any.whl.metadata (9.6 kB)
Collecting legacy-api-wrap>=1.4.1 (from scanpy)
  Downloading legacy_api_wrap-1.4.1-py3-none-any.whl.metadata (2.1 kB)
Collecting session-info2 (from scanpy)
  Downloading session_info2-0.2-py3-none-any.whl.metadata (3.3 kB)
Collecting array-api-compat>=1.7.1 (from anndata)
  Downloading array_api_compat-1.12.0-py3-none-any.whl.metadata (2.5 kB)
Collecting zarr!=3.0.*,>=2.18.7 (from anndata)
  Downloading zarr-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting donfig>=0.8 (from zarr!=3.0.*,>=2.18.7->anndata)
  Downloading donfig-0.8.1.post1-py3-none-any.whl.metadata (5.0 kB)
Collecting numcodecs>=0.14 (from numcodecs[crc32c]>=0.14->zarr!=3.0.*,>=2.18.7->anndata)
  Downloading numcodecs-0.16.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Collecting crc32c>=2.7 (from numcodecs[crc32c]>=0.14->zar

In [3]:
import pandas as pd
import scanpy as sc
import numpy as np
import anndata
import json

## Imputed CSV Files

In [5]:
# Input/output locations for every imputation method.
# For each method there are four names:
#   file_<METHOD>_train_imputed_csv / file_<METHOD>_test_imputed_csv   -> imputed CSV under imputed_directory
#   file_<METHOD>_train_imputed_h5ad / file_<METHOD>_test_imputed_h5ad -> H5AD target under h5ad_directory

# AE
file_AE_train_imputed_csv = join(imputed_directory, "AE-NEW/train_data_AE_imputed.csv")
file_AE_train_imputed_h5ad = join(h5ad_directory, "AETrain.h5ad")
file_AE_test_imputed_csv = join(imputed_directory, "AE-NEW/test_data_AE_imputed.csv")
file_AE_test_imputed_h5ad = join(h5ad_directory, "AETest.h5ad")

# VAE
file_VAE_train_imputed_csv = join(imputed_directory, "VAE-NEW/ImputedVAETrain_common.csv")
file_VAE_train_imputed_h5ad = join(h5ad_directory, "VAETrain.h5ad")
file_VAE_test_imputed_csv = join(imputed_directory, "VAE-NEW/ImputedVAETest_common.csv")
file_VAE_test_imputed_h5ad = join(h5ad_directory, "VAETest.h5ad")

# DAE
file_DAE_train_imputed_csv = join(imputed_directory, "DAE-NEW/ImputedDAETrain_common.csv")
file_DAE_train_imputed_h5ad = join(h5ad_directory, "DAETrain.h5ad")
file_DAE_test_imputed_csv = join(imputed_directory, "DAE-NEW/ImputedDAETest_common.csv")
file_DAE_test_imputed_h5ad = join(h5ad_directory, "DAETest.h5ad")

# CFT
file_CFT_train_imputed_csv = join(imputed_directory, "CFT-NEW/Imputed_CFT_Train_common.csv")
file_CFT_train_imputed_h5ad = join(h5ad_directory, "CFTTrain.h5ad")
file_CFT_test_imputed_csv = join(imputed_directory, "CFT-NEW/Imputed_CFT_Test_common.csv")
file_CFT_test_imputed_h5ad = join(h5ad_directory, "CFTTest.h5ad")

# EM (disabled: these names are not defined, and the EM rows in the method
# lists are commented out as well).
# NOTE(review): the train path below points into "SVD-NEW/" and the test path
# has no sub-folder - confirm the real locations before re-enabling.
# file_EM_train_imputed_csv = join(imputed_directory, "SVD-NEW/Train_data_EM_imputed.csv")
# file_EM_train_imputed_h5ad = join(h5ad_directory, "EMTrain.h5ad")
# file_EM_test_imputed_csv = join(imputed_directory, "Test_data_EM_imputed.csv")
# file_EM_test_imputed_h5ad = join(h5ad_directory, "EMTest.h5ad")

# KNN
file_KNN_train_imputed_csv = join(imputed_directory, "KNN-NEW/train_data_KNN_imputed.csv")
file_KNN_train_imputed_h5ad = join(h5ad_directory, "KNNTrain.h5ad")
file_KNN_test_imputed_csv = join(imputed_directory, "KNN-NEW/test_data_KNN_imputed.csv")
file_KNN_test_imputed_h5ad = join(h5ad_directory, "KNNTest.h5ad")

# SVD
file_SVD_train_imputed_csv = join(imputed_directory, "SVD-NEW/Train_data_Imputed_SVD_no_negative.csv")
file_SVD_train_imputed_h5ad = join(h5ad_directory, "SVDTrain.h5ad")
file_SVD_test_imputed_csv = join(imputed_directory, "SVD-NEW/Test_data_Imputed_SVD_no_negative.csv")
file_SVD_test_imputed_h5ad = join(h5ad_directory, "SVDTest.h5ad")

# MLE
file_MLE_train_imputed_csv = join(imputed_directory, "MLE-NEW/train_data_MLE_imputed_no_negative_med_imputed.csv")
file_MLE_train_imputed_h5ad = join(h5ad_directory, "MLETrain.h5ad")
file_MLE_test_imputed_csv = join(imputed_directory, "MLE-NEW/test_data_MLE_imputed_no_negative_med_imputed.csv")
file_MLE_test_imputed_h5ad = join(h5ad_directory, "MLETest.h5ad")

# LC (LeftCensored)
file_LC_train_imputed_csv = join(imputed_directory, "LC-NEW/train_data_LeftCensored_imputed.csv")
file_LC_train_imputed_h5ad = join(h5ad_directory, "LCTrain.h5ad")
file_LC_test_imputed_csv = join(imputed_directory, "LC-NEW/test_data_LeftCensored_imputed.csv")
file_LC_test_imputed_h5ad = join(h5ad_directory, "LCTest.h5ad")

# BPCA
file_BPCA_train_imputed_csv = join(imputed_directory, "BPCA-NEW/train_data_BPCA_imputed.csv")
file_BPCA_train_imputed_h5ad = join(h5ad_directory, "BPCATrain.h5ad")
file_BPCA_test_imputed_csv = join(imputed_directory, "BPCA-NEW/test_data_BPCA_imputed.csv")
file_BPCA_test_imputed_h5ad = join(h5ad_directory, "BPCATest.h5ad")


# (label, train CSV, test CSV) for every imputation method summarised below.
file_paths = [
    ("AE", file_AE_train_imputed_csv, file_AE_test_imputed_csv),
    ("VAE", file_VAE_train_imputed_csv, file_VAE_test_imputed_csv),
    ("DAE", file_DAE_train_imputed_csv, file_DAE_test_imputed_csv),
    ("CFT", file_CFT_train_imputed_csv, file_CFT_test_imputed_csv),
    # ("EM", file_EM_train_imputed_csv, file_EM_test_imputed_csv),
    ("KNN", file_KNN_train_imputed_csv, file_KNN_test_imputed_csv),
    ("SVD", file_SVD_train_imputed_csv, file_SVD_test_imputed_csv),
    ("MLE", file_MLE_train_imputed_csv, file_MLE_test_imputed_csv),
    ("LC", file_LC_train_imputed_csv, file_LC_test_imputed_csv),
    ("BPCA", file_BPCA_train_imputed_csv, file_BPCA_test_imputed_csv),
]

# Print a quick summary (value range, mean, shape, columns) of each train/test pair.
for label, train_csv, test_csv in file_paths:
    train_found, test_found = exists(train_csv), exists(test_csv)
    print(label, train_found, test_found)
    if not (train_found and test_found):
        # Report the gap and keep going so the remaining methods are still checked.
        print("Skipping", label, "- imputed CSV not found")
        print("="*50)
        continue

    _train_data = pd.read_csv(train_csv)
    _test_data = pd.read_csv(test_csv)

    # Build the numeric feature view once per file instead of once per statistic.
    _train_numeric = _train_data.drop(columns=['batch_label', 'label']).select_dtypes(include=['float64', 'int64'])
    _test_numeric = _test_data.drop(columns=['batch_label', 'label']).select_dtypes(include=['float64', 'int64'])

    print("Max Train:", _train_numeric.max().max())
    print("Max Test:", _test_numeric.max().max())
    print("Min Train:", _train_numeric.min().min())
    print("Min Test:", _test_numeric.min().min())
    # Mean of the per-column means (not the mean over all cells).
    print("Mean Train:", _train_numeric.mean().mean())
    print("Mean Test:", _test_numeric.mean().mean())
    print("Shapes:", _train_data.shape, _test_data.shape)
    print("Indexes:", _train_data.index, _test_data.index)
    print("Columns:")
    print("\tTrain Columns:\n", _train_data.columns)
    print()
    print("\tTest Columns:\n", _test_data.columns)
    print("="*50)

AE True True
Max Train: 19507158000.0
Max Test: 26144700000.0
Min Train: 0.0
Min Test: 0.0
Mean Train: 11821609.96562456
Mean Test: 11690886.63055364
Shapes: (240, 5331) (120, 5331)
Indexes: RangeIndex(start=0, stop=240, step=1) RangeIndex(start=0, stop=120, step=1)
Columns:
	Train Columns:
 Index(['batch_label', 'A0A075B6H7', 'A0A075B6J9', 'A0A0B4J1X5', 'A0A0C4DH36',
       'A0A0C4DH43', 'A0A0C4DH67', 'A0A2R8Y4L2', 'A1A5D9', 'A6NI72',
       ...
       'A0A378VW26_NEIGO', 'A7BA67_9ACTO', 'A0A2I1Z4Z0_9MICC',
       'A0A081PVR7_STRMT', 'C0EJV9_NEIFL', 'Z4WU57_9PORP', 'D2NQJ0_ROTMD',
       'E4ZF81_NEIL0', 'F9DIH9_9BACT', 'label'],
      dtype='object', length=5331)

	Test Columns:
 Index(['batch_label', 'A0A075B6H7', 'A0A075B6J9', 'A0A0B4J1X5', 'A0A0C4DH36',
       'A0A0C4DH43', 'A0A0C4DH67', 'A0A2R8Y4L2', 'A1A5D9', 'A6NI72',
       ...
       'A0A378VW26_NEIGO', 'A7BA67_9ACTO', 'A0A2I1Z4Z0_9MICC',
       'A0A081PVR7_STRMT', 'C0EJV9_NEIFL', 'Z4WU57_9PORP', 'D2NQJ0_ROTMD',
       'E4ZF81

## Transformer (old versions, superseded by ProteinDataTransformer3)

In [None]:
class ProteinDataTransformer:
    """
    Convert an imputed protein-abundance CSV into a taxid-level .h5ad file.

    ``run`` executes the whole pipeline: load the CSV and the two JSON
    mappings, map each protein column to a taxid, sum abundances per taxid,
    wrap the result in an AnnData object (taxids as obs, CSV rows as var)
    and write it to disk.
    """

    def __init__(
        self,
        csv_path: str,
        uniprot2taxid_path: str,
        taxid2ranks_path: str,
    ):
        """
        Initialize the transformer with file paths.

        :param csv_path: Path to imputed CSV
        :param uniprot2taxid_path: Path to uniprot2taxid.json
        :param taxid2ranks_path: Path to taxid2ranks.json
        """
        self.csv_path = csv_path
        self.uniprot2taxid_path = uniprot2taxid_path
        self.taxid2ranks_path = taxid2ranks_path

        # Populated by load_data / map_and_collapse_taxids / build_anndata.
        self.raw_df: pd.DataFrame = None
        self.uniprot2taxid: dict = None
        self.taxid2ranks: dict = None
        self.counts_df: pd.DataFrame = None
        self.adata: anndata.AnnData = None
        self.case_labels: pd.Series = None

    def load_data(self):
        """
        Load CSV and JSON mappings into memory, extract case labels, and transpose.

        The first CSV column becomes the row index. A ``label`` column, when
        present, is stored in ``self.case_labels`` (cast to int) and removed
        from the abundance table.
        """
        df = pd.read_csv(self.csv_path, index_col=0)
        # Extract sample-level labels if present
        if 'label' in df.columns:
            self.case_labels = df['label'].copy().astype(int)
            df = df.drop(columns=['label'])
        # Transpose: proteins as index, samples as columns
        self.raw_df = df.T

        # Load mappings
        with open(self.uniprot2taxid_path, 'r') as f:
            self.uniprot2taxid = json.load(f)
        with open(self.taxid2ranks_path, 'r') as f:
            self.taxid2ranks = json.load(f)

    @staticmethod
    def _normalize_taxid(taxid):
        """Return ``taxid`` as a plain string key, or None when there is no mapping."""
        if taxid is None:
            return None
        if isinstance(taxid, float):
            if taxid != taxid:  # NaN counts as "no mapping"
                return None
            if taxid.is_integer():
                taxid = int(taxid)  # 9606.0 -> 9606 so the key reads '9606'
        return str(taxid)

    def map_and_collapse_taxids(self):
        """
        Map proteins to taxids and collapse by summing abundances.
        Proteins without a mapping are dropped.
        Produces a DataFrame of taxid rows × case columns, indexed by string taxids.
        """
        # Normalise each taxid to a string while mapping. If numeric taxids were
        # mapped as-is, a single unmapped protein would turn the whole Series
        # into floats and keys such as '9606.0' would never match taxid2ranks.
        prot_to_taxid = self.raw_df.index.to_series().map(
            lambda prot: self._normalize_taxid(self.uniprot2taxid.get(str(prot)))
        )
        valid = prot_to_taxid.dropna().index
        df_valid = self.raw_df.loc[valid].copy()
        df_valid['taxid'] = prot_to_taxid.loc[valid].values

        self.counts_df = df_valid.groupby('taxid').sum()

    def _rank_fields(self):
        """Return every rank name that occurs in taxid2ranks, sorted alphabetically."""
        # Sorting keeps the obs column order identical from run to run;
        # iterating the raw set would not guarantee that.
        return sorted({rank for ranks in self.taxid2ranks.values() for rank in ranks})

    def build_anndata(self):
        """
        Create AnnData object with organisms as obs and cases as var,
        annotate obs with taxonomic ranks, and add case labels in var.
        Missing ranks will be stored as NaN.
        """
        # Construct AnnData
        self.adata = anndata.AnnData(
            X=self.counts_df.values,
            obs=pd.DataFrame(index=self.counts_df.index),
            var=pd.DataFrame(index=self.counts_df.columns)
        )
        # Annotate organisms
        self.adata.obs['taxid'] = self.adata.obs.index.astype(str)

        # Add taxonomic ranks, using None for missing to yield NaN
        for rank in self._rank_fields():
            self.adata.obs[rank] = self.adata.obs['taxid'].map(
                lambda tid: self.taxid2ranks.get(str(tid), {}).get(rank)
            )

        # Replace python None with np.nan
        self.adata.obs = self.adata.obs.replace({None: np.nan})

        # Add case-level label to var if available
        if self.case_labels is not None:
            # AnnData keeps var names as strings, so look the labels up by the
            # string form of each sample id.
            labels = self.case_labels.copy()
            labels.index = labels.index.astype(str)
            labels = labels.reindex(self.adata.var.index)
            self.adata.var['label'] = labels.astype(int)

    def save_h5ad(self, output_path: str):
        """
        Save the AnnData object to an .h5ad file.
        """
        self.adata.write(output_path)

    def run(self, output_path: str):
        """
        Execute full pipeline: load, map, collapse, build, and save.
        """
        self.load_data()
        self.map_and_collapse_taxids()
        self.build_anndata()
        self.save_h5ad(output_path)

In [None]:
class ProteinDataTransformer2:
    """
    Convert an imputed protein-abundance CSV into a taxid-level .h5ad file.

    ``run`` executes the whole pipeline: load the CSV and the two JSON
    mappings, map each protein column to a taxid (reporting unmapped
    proteins), sum abundances per taxid, wrap the result in an AnnData object
    (taxids as obs, CSV rows as var) with categorical rank columns in a fixed
    order, and write it to disk.
    """

    def __init__(
        self,
        csv_path: str,
        uniprot2taxid_path: str,
        taxid2ranks_path: str,
    ):
        """
        Initialize the transformer with file paths.

        :param csv_path: Path to imputed CSV
        :param uniprot2taxid_path: Path to uniprot2taxid.json
        :param taxid2ranks_path: Path to taxid2ranks.json
        """
        self.csv_path = csv_path
        self.uniprot2taxid_path = uniprot2taxid_path
        self.taxid2ranks_path = taxid2ranks_path

        # Populated by load_data / map_and_collapse_taxids / build_anndata.
        self.raw_df: pd.DataFrame = None
        self.uniprot2taxid: dict = None
        self.taxid2ranks: dict = None
        self.counts_df: pd.DataFrame = None
        self.adata: anndata.AnnData = None
        self.case_labels: pd.Series = None

    def load_data(self):
        """
        Load CSV and JSON mappings into memory, extract case labels, and transpose.

        The first CSV column becomes the row index. A ``label`` column, when
        present, is stored in ``self.case_labels`` (cast to int) and removed
        from the abundance table.
        """
        df = pd.read_csv(self.csv_path, index_col=0)
        # Extract sample-level labels if present
        if 'label' in df.columns:
            self.case_labels = df['label'].copy().astype(int)
            df = df.drop(columns=['label'])
        # Transpose: proteins as index, samples as columns
        self.raw_df = df.T

        # Load mappings
        with open(self.uniprot2taxid_path, 'r') as f:
            self.uniprot2taxid = json.load(f)
        with open(self.taxid2ranks_path, 'r') as f:
            self.taxid2ranks = json.load(f)

    @staticmethod
    def _normalize_taxid(taxid):
        """Return ``taxid`` as a plain string key, or None when there is no mapping."""
        if taxid is None:
            return None
        if isinstance(taxid, float):
            if taxid != taxid:  # NaN counts as "no mapping"
                return None
            if taxid.is_integer():
                taxid = int(taxid)  # 9606.0 -> 9606 so the key reads '9606'
        return str(taxid)

    def map_and_collapse_taxids(self):
        """
        Map proteins to taxids and collapse by summing abundances.
        Proteins without a mapping are dropped, and unmapped proteins are printed.
        Produces a DataFrame of taxid rows × case columns, indexed by string taxids.
        """
        # Normalise each taxid to a string while mapping. If numeric taxids were
        # mapped as-is, a single unmapped protein would turn the whole Series
        # into floats and keys such as '9606.0' would never match taxid2ranks.
        prot_to_taxid = self.raw_df.index.to_series().map(
            lambda prot: self._normalize_taxid(self.uniprot2taxid.get(str(prot)))
        )
        missing = prot_to_taxid[prot_to_taxid.isna()].index.tolist()
        if missing:
            print(f"{len(missing)} proteins were not found in UniProt mapping:")
            for prot in missing:
                print(f" - {prot}")
        valid = prot_to_taxid.dropna().index
        df_valid = self.raw_df.loc[valid].copy()
        df_valid['taxid'] = prot_to_taxid.loc[valid].values

        self.counts_df = df_valid.groupby('taxid').sum()

    def build_anndata(self):
        """
        Create AnnData object with organisms as obs and cases as var,
        annotate obs with taxonomic ranks (ordered), convert obs to categorical,
        and add case labels in var.
        Missing ranks stored as NaN.
        """
        # Ordered list of ranks
        ordered_ranks = [
            'root', 'domain', 'kingdom', 'phylum', 'class',
            'order', 'family', 'genus', 'species', 'strain'
        ]
        # Construct AnnData
        self.adata = anndata.AnnData(
            X=self.counts_df.values,
            obs=pd.DataFrame(index=self.counts_df.index),
            var=pd.DataFrame(index=self.counts_df.columns)
        )
        # Annotate taxid
        self.adata.obs['taxid'] = self.adata.obs.index.astype(str)

        # Populate ranks in order; taxids or ranks absent from the mapping give None
        for rank in ordered_ranks:
            self.adata.obs[rank] = self.adata.obs['taxid'].map(
                lambda tid: self.taxid2ranks.get(str(tid), {}).get(rank)
            )

        # Ensure None -> NaN
        self.adata.obs = self.adata.obs.replace({None: np.nan})

        # Convert each rank column to categorical preserving NaNs
        for rank in ordered_ranks:
            self.adata.obs[rank] = pd.Categorical(
                self.adata.obs[rank],
                categories=self.adata.obs[rank].dropna().unique(),
                ordered=False
            )

        # Reorder obs columns: taxid first, then ranks
        cols = ['taxid'] + ordered_ranks
        self.adata.obs = self.adata.obs[cols]

        # Add case-level label to var if available
        if self.case_labels is not None:
            # AnnData keeps var names as strings, so look the labels up by the
            # string form of each sample id.
            labels = self.case_labels.copy()
            labels.index = labels.index.astype(str)
            labels = labels.reindex(self.adata.var.index)
            self.adata.var['label'] = labels.astype(int)

    def save_h5ad(self, output_path: str):
        """
        Save the AnnData object to an .h5ad file.
        """
        self.adata.write(output_path)

    def run(self, output_path: str):
        """
        Execute full pipeline: load, map, collapse, build, and save.
        """
        self.load_data()
        self.map_and_collapse_taxids()
        self.build_anndata()
        self.save_h5ad(output_path)


## Transformer

In [6]:
class ProteinDataTransformer3:
    """
    Convert an imputed protein-abundance CSV into a taxid-level .h5ad file.

    ``run`` executes the whole pipeline: load the CSV and the two JSON
    mappings, map each protein column to a taxid (reporting unmapped
    proteins), sum abundances per taxid, wrap the result in an AnnData object
    (taxids as obs, CSV rows as var) with categorical rank columns in a fixed
    order, and write it to disk.
    """

    def __init__(
        self,
        csv_path: str,
        uniprot2taxid_path: str,
        taxid2ranks_path: str,
    ):
        """
        Initialize the transformer with file paths.

        :param csv_path: Path to imputed CSV
        :param uniprot2taxid_path: Path to uniprot2taxid.json
        :param taxid2ranks_path: Path to taxid2ranks.json
        """
        self.csv_path = csv_path
        self.uniprot2taxid_path = uniprot2taxid_path
        self.taxid2ranks_path = taxid2ranks_path

        # Populated by load_data / map_and_collapse_taxids / build_anndata.
        self.raw_df: pd.DataFrame = None
        self.uniprot2taxid: dict = None
        self.taxid2ranks: dict = None
        self.counts_df: pd.DataFrame = None
        self.adata: anndata.AnnData = None
        self.case_labels: pd.Series = None

    def load_data(self):
        """
        Load CSV and JSON mappings into memory, extract case labels, and transpose.

        The first CSV column becomes the row index. A ``label`` column, when
        present, is stored in ``self.case_labels`` (cast to int) and removed
        from the abundance table, as is a ``batch_label.1`` column if one exists.
        """
        df = pd.read_csv(self.csv_path, index_col=0)
        # Extract sample-level labels if present
        if 'label' in df.columns:
            self.case_labels = df['label'].copy().astype(int)
            df = df.drop(columns=['label'])

        # Drop the extra "batch_label.1" column that some exports contain
        df = df.drop(columns=["batch_label.1"], errors='ignore')

        # Transpose: proteins as index, samples as columns
        self.raw_df = df.T

        # Load mappings
        with open(self.uniprot2taxid_path, 'r') as f:
            self.uniprot2taxid = json.load(f)
        with open(self.taxid2ranks_path, 'r') as f:
            self.taxid2ranks = json.load(f)

    @staticmethod
    def _normalize_taxid(taxid):
        """Return ``taxid`` as a plain string key, or None when there is no mapping."""
        if taxid is None:
            return None
        if isinstance(taxid, float):
            if taxid != taxid:  # NaN counts as "no mapping"
                return None
            if taxid.is_integer():
                taxid = int(taxid)  # 9606.0 -> 9606 so the key reads '9606'
        return str(taxid)

    def map_and_collapse_taxids(self):
        """
        Map proteins to taxids and collapse by summing abundances.
        Proteins without a mapping are dropped, and unmapped proteins are printed.
        Produces a DataFrame of taxid rows × case columns, indexed by string taxids.
        """
        # Normalise each taxid to a string while mapping. Casting to str only
        # after the map is too late: with numeric taxids, one unmapped protein
        # turns the whole Series into floats and the keys become '9606.0'.
        prot_to_taxid = self.raw_df.index.to_series().map(
            lambda prot: self._normalize_taxid(self.uniprot2taxid.get(str(prot)))
        )
        missing = prot_to_taxid[prot_to_taxid.isna()].index.tolist()
        if missing:
            print(f"{len(missing)} proteins were not found in UniProt mapping:")
            for prot in missing:
                print(f" - {prot}")
        valid = prot_to_taxid.dropna().index
        df_valid = self.raw_df.loc[valid].copy()
        df_valid['taxid'] = prot_to_taxid.loc[valid].values  # already strings

        self.counts_df = df_valid.groupby('taxid').sum()

    def build_anndata(self):
        """
        Create AnnData object with organisms as obs and cases as var,
        annotate obs with taxonomic ranks (ordered), convert obs to categorical,
        and add case labels in var.
        Missing ranks stored as NaN.
        """
        # Ordered list of ranks
        ordered_ranks = [
            'root', 'domain', 'kingdom', 'phylum', 'class',
            'order', 'family', 'genus', 'species', 'strain'
        ]
        # Construct AnnData
        self.adata = anndata.AnnData(
            X=self.counts_df.values,
            obs=pd.DataFrame(index=self.counts_df.index),
            var=pd.DataFrame(index=self.counts_df.columns)
        )
        # Annotate taxid as string to prevent float conversion
        self.adata.obs['taxid'] = self.adata.obs.index.astype(str)

        # Populate ranks in order; taxids or ranks absent from the mapping give None
        for rank in ordered_ranks:
            self.adata.obs[rank] = self.adata.obs['taxid'].map(
                lambda tid: self.taxid2ranks.get(tid, {}).get(rank)
            )

        # Ensure None -> NaN
        self.adata.obs = self.adata.obs.replace({None: np.nan})

        # Convert each rank column to categorical preserving NaNs
        for rank in ordered_ranks:
            self.adata.obs[rank] = pd.Categorical(
                self.adata.obs[rank],
                categories=self.adata.obs[rank].dropna().unique(),
                ordered=False
            )

        # Reorder obs columns: taxid first, then ranks
        cols = ['taxid'] + ordered_ranks
        self.adata.obs = self.adata.obs[cols]

        # Add case-level label to var if available
        if self.case_labels is not None:
            # AnnData keeps var names as strings, so look the labels up by the
            # string form of each sample id.
            labels = self.case_labels.copy()
            labels.index = labels.index.astype(str)
            labels = labels.reindex(self.adata.var.index)
            self.adata.var['label'] = labels.astype(int)

        # Defensive: make sure the taxid column is stored as strings.
        self.adata.obs['taxid'] = self.adata.obs['taxid'].astype(str)

    def save_h5ad(self, output_path: str):
        """
        Save the AnnData object to an .h5ad file.
        """
        self.adata.write(output_path)

    def run(self, output_path: str):
        """
        Execute full pipeline: load, map, collapse, build, and save.
        """
        self.load_data()
        self.map_and_collapse_taxids()
        self.build_anndata()
        self.save_h5ad(output_path)


## Run

In [7]:
# One row per imputation method:
# (label, train CSV, train H5AD target, test CSV, test H5AD target).
methods = [
    ("AE", file_AE_train_imputed_csv, file_AE_train_imputed_h5ad, file_AE_test_imputed_csv, file_AE_test_imputed_h5ad),
    ("VAE", file_VAE_train_imputed_csv, file_VAE_train_imputed_h5ad, file_VAE_test_imputed_csv, file_VAE_test_imputed_h5ad),
    ("DAE", file_DAE_train_imputed_csv, file_DAE_train_imputed_h5ad, file_DAE_test_imputed_csv, file_DAE_test_imputed_h5ad),
    ("CFT", file_CFT_train_imputed_csv, file_CFT_train_imputed_h5ad, file_CFT_test_imputed_csv, file_CFT_test_imputed_h5ad),
    # ("EM", file_EM_train_imputed_csv, file_EM_train_imputed_h5ad, file_EM_test_imputed_csv, file_EM_test_imputed_h5ad),
    ("KNN", file_KNN_train_imputed_csv, file_KNN_train_imputed_h5ad, file_KNN_test_imputed_csv, file_KNN_test_imputed_h5ad),
    ("SVD", file_SVD_train_imputed_csv, file_SVD_train_imputed_h5ad, file_SVD_test_imputed_csv, file_SVD_test_imputed_h5ad),
    ("MLE", file_MLE_train_imputed_csv, file_MLE_train_imputed_h5ad, file_MLE_test_imputed_csv, file_MLE_test_imputed_h5ad),
    ("LC", file_LC_train_imputed_csv, file_LC_train_imputed_h5ad, file_LC_test_imputed_csv, file_LC_test_imputed_h5ad),
    ("BPCA", file_BPCA_train_imputed_csv, file_BPCA_train_imputed_h5ad, file_BPCA_test_imputed_csv, file_BPCA_test_imputed_h5ad),
]

# Convert the train split, then the test split, of every method; the summary
# line is printed only after both H5AD files of a method have been written.
for label, train_csv, train_h5ad, test_csv, test_h5ad in methods:
    for csv_path, h5ad_path in ((train_csv, train_h5ad), (test_csv, test_h5ad)):
        transformer = ProteinDataTransformer3(
            csv_path=csv_path,
            uniprot2taxid_path=file_uniprot2taxid_json,
            taxid2ranks_path=file_taxid2ranks_json,
        )
        transformer.run(h5ad_path)
    print(label, train_csv, train_h5ad, test_csv, test_h5ad)

AE /content/drive/My Drive/Project/Imputed/AE-NEW/train_data_AE_imputed.csv /content/drive/My Drive/Project/H5AD/AETrain.h5ad /content/drive/My Drive/Project/Imputed/AE-NEW/test_data_AE_imputed.csv /content/drive/My Drive/Project/H5AD/AETest.h5ad
VAE /content/drive/My Drive/Project/Imputed/VAE-NEW/ImputedVAETrain_common.csv /content/drive/My Drive/Project/H5AD/VAETrain.h5ad /content/drive/My Drive/Project/Imputed/VAE-NEW/ImputedVAETest_common.csv /content/drive/My Drive/Project/H5AD/VAETest.h5ad
DAE /content/drive/My Drive/Project/Imputed/DAE-NEW/ImputedDAETrain_common.csv /content/drive/My Drive/Project/H5AD/DAETrain.h5ad /content/drive/My Drive/Project/Imputed/DAE-NEW/ImputedDAETest_common.csv /content/drive/My Drive/Project/H5AD/DAETest.h5ad
CFT /content/drive/My Drive/Project/Imputed/CFT-NEW/Imputed_CFT_Train_common.csv /content/drive/My Drive/Project/H5AD/CFTTrain.h5ad /content/drive/My Drive/Project/Imputed/CFT-NEW/Imputed_CFT_Test_common.csv /content/drive/My Drive/Project/H5AD/

In [11]:
def analyze_h5ad(method_name, file_path, label):
    """
    Print a short report for one .h5ad file.

    The report contains the AnnData summary, the ``taxid`` column of ``obs``,
    and the shape, minimum, maximum and mean of the data matrix.

    :param method_name: Name shown in the report header
    :param file_path: Path of the .h5ad file to read
    :param label: Second header field (the caller passes "Train" or "Test")
    """
    print(f"--- {method_name} | {label} ---")
    adata = sc.read_h5ad(file_path)

    # Anything that is not already a NumPy array is densified via toarray().
    stored = adata.X
    matrix = stored if isinstance(stored, np.ndarray) else stored.toarray()

    print("Preview:")
    print(adata)
    print(adata.obs["taxid"])
    print("Shape:", matrix.shape)
    for caption, reducer in (("Min:", np.min), ("Max:", np.max), ("Mean:", np.mean)):
        print(caption, reducer(matrix))
    print()
    print("="*100)

# Report the train file, then the test file, of every method.
for method_name, _, train_h5ad, _, test_h5ad in methods:
    for split_name, h5ad_file in (("Train", train_h5ad), ("Test", test_h5ad)):
        analyze_h5ad(method_name, h5ad_file, label=split_name)

--- AE | Train ---
Preview:
AnnData object with n_obs × n_vars = 775 × 240
    obs: 'taxid', 'root', 'domain', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain'
    var: 'label'
taxid
1000568    1000568
1005705    1005705
1017          1017
1018          1018
1019          1019
            ...   
997877      997877
999422      999422
999424      999424
999425      999425
999437      999437
Name: taxid, Length: 775, dtype: object
Shape: (775, 240)
Min: 30.49746132
Max: 75620371640.78102
Mean: 81286915.49266233

--- AE | Test ---
Preview:
AnnData object with n_obs × n_vars = 775 × 120
    obs: 'taxid', 'root', 'domain', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain'
    var: 'label'
taxid
1000568    1000568
1005705    1005705
1017          1017
1018          1018
1019          1019
            ...   
997877      997877
999422      999422
999424      999424
999425      999425
999437      999437
Name: taxid, Length: 775, dtype: obj

In [None]:
# Inspect the KNN test export stored directly in the imputed folder. The path
# is built from imputed_directory rather than repeating the absolute Drive path.
test_data = pd.read_csv(join(imputed_directory, "Test_data_KNN_imputed.csv"))
print(test_data.index)
print(test_data.columns)
print(test_data.shape)
# Show the extra 'batch_label.1' column next to the sample ids and one protein column.
print(test_data[['batch_label', 'batch_label.1', 'A0A075B6J9']])

RangeIndex(start=0, stop=120, step=1)
Index(['batch_label', 'batch_label.1', 'A0A075B6J9', 'A0A0B4J1X5',
       'A0A0C4DH36', 'A0A0C4DH43', 'A0A0C4DH67', 'A0A2R8Y4L2', 'A1A5D9',
       'A6NI72',
       ...
       'W3Y643_9FIRM', 'A0A378VW26_NEIGO', 'A7BA67_9ACTO', 'A0A2I1Z4Z0_9MICC',
       'A0A081PVR7_STRMT', 'C0EJV9_NEIFL', 'Z4WU57_9PORP', 'D2NQJ0_ROTMD',
       'E4ZF81_NEIL0', 'label'],
      dtype='object', length=5331)
(120, 5331)
    batch_label  batch_label.1  A0A075B6J9
0          B1_1      193177400  18164060.0
1         B1_10       30568720   2693050.0
2         B1_11       93880400   5200240.0
3         B1_12       46047870   6134930.0
4         B1_13       23309610   3076300.0
..          ...            ...         ...
115        B8_5       91327300   4253970.0
116        B8_6      105783990   1317392.0
117        B8_7       51104450   3738960.0
118        B8_8              0  13973200.0
119        B8_9       97832500   2730590.0

[120 rows x 3 columns]
