In [1]:
# Cell 0 — Imports & config

import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# --- Input locations (edit DATA_DIR to point at your local copy of the CSVs) ---
DATA_DIR = "./tcga_data"
CLIN_FILE, MRNA_FILE, METH_FILE = (
    "clinical_data.csv",
    "mrna_expression.csv",
    "methylation_data.csv",
)

# --- Determinism: seed the NumPy and torch global RNGs with the same value ---
SEED = 42
for _seed_fn in (np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

In [2]:
# Cell 1 — Load CSVs

# The clinical table keeps pandas' default integer index; the two omics
# tables use their first CSV column as the row index.
clinical = pd.read_csv(os.path.join(DATA_DIR, CLIN_FILE))
mrna, methyl = (
    pd.read_csv(os.path.join(DATA_DIR, _fname), index_col=0)
    for _fname in (MRNA_FILE, METH_FILE)
)

# Report the dimensions of each table.
for _caption, _frame in (
    ("Clinical shape:", clinical),
    ("mRNA shape (genes × samples):", mrna),
    ("Methylation shape (CpGs × samples):", methyl),
):
    print(_caption, _frame.shape)

# Preview: first clinical rows and the top-left 5×5 corner of each omics matrix.
display(clinical.head(), mrna.iloc[:5, :5], methyl.iloc[:5, :5])

Clinical shape: (100, 9)
mRNA shape (genes × samples): (1000, 100)
Methylation shape (CpGs × samples): (500, 100)


Unnamed: 0,patient_id,age_at_diagnosis,tumor_stage,er_status,pr_status,her2_status,pam50_subtype,survival_days,vital_status
0,TCGA-00,67.450712,Stage II,Positive,Positive,Positive,Basal,1874.237812,Alive
1,TCGA-01,57.926035,Stage II,Positive,Negative,Negative,Normal,3383.650131,Dead
2,TCGA-02,69.715328,Stage I,Positive,Positive,Negative,Normal,1075.168668,Alive
3,TCGA-03,82.845448,Stage II,Positive,Positive,Positive,Luminal B,1139.294254,Alive
4,TCGA-04,56.487699,Stage IV,Positive,Positive,Positive,Basal,170.042008,Alive


Unnamed: 0,TCGA-00,TCGA-01,TCGA-02,TCGA-03,TCGA-04
BRCA1,400.786275,112.55831,542.060206,3121.471215,92.915758
BRCA2,8.751792,63.988882,74.781533,29.827938,107.493306
TP53,303.558615,455.578961,1294.788301,1221.218127,9.437218
PTEN,28.275904,48.406685,661.551087,503.075637,142.336904
ATM,6.117446,44.757094,149.977817,163.034394,60.332387


Unnamed: 0,TCGA-00,TCGA-01,TCGA-02,TCGA-03,TCGA-04
cg00000000,0.353677,0.248558,0.415959,0.159968,0.550283
cg00000001,0.227215,0.47592,0.115885,0.171854,0.240936
cg00000002,0.388094,0.464165,0.128458,0.055206,0.017279
cg00000003,0.20631,0.038673,0.416867,0.305264,0.360409
cg00000004,0.405636,0.26491,0.380812,0.225332,0.045154


In [3]:
# Cell 2 — Prepare labels and align samples

# Index the clinical table by patient_id so it can be matched against the
# sample columns of the omics matrices. The guard keeps this cell re-runnable:
# without it, a second execution raises KeyError because "patient_id" has
# already been moved from the columns into the index.
if clinical.index.name != "patient_id":
    clinical = clinical.set_index("patient_id")

# A duplicated patient_id would make `.loc[common_samples]` return more label
# rows than there are samples, silently misaligning labels and features.
if not clinical.index.is_unique:
    raise ValueError("clinical_data.csv contains duplicated patient_id values.")

# Tumor stage label
if "tumor_stage" not in clinical.columns:
    raise ValueError("clinical_data.csv must contain a 'tumor_stage' column.")

labels = clinical["tumor_stage"]

# Samples present in all three sources (clinical index, mRNA columns,
# methylation columns). Sorting gives one deterministic order that is then
# imposed on every table.
common_samples = sorted(
    set(clinical.index) & set(mrna.columns) & set(methyl.columns)
)

print("Number of common samples:", len(common_samples))

# Fail here with a clear message instead of producing empty matrices that
# only break later inside scaling / PCA.
if not common_samples:
    raise ValueError(
        "No sample IDs are shared between clinical, mRNA and methylation data."
    )

# Subset and ensure consistent order
labels = labels.loc[common_samples]
mrna = mrna[common_samples]
methyl = methyl[common_samples]

print("After alignment:")
print("  mRNA:", mrna.shape)
print("  Methyl:", methyl.shape)
print("  Labels:", labels.shape)

Number of common samples: 100
After alignment:
  mRNA: (1000, 100)
  Methyl: (500, 100)
  Labels: (100,)


In [4]:
# Cell 3 — Preprocess omics matrices

# mRNA: element-wise log(1 + x) transform of the expression values.
mrna_proc = np.log1p(mrna)

# Methylation: fill missing values with the median of the same CpG.
# The matrix is CpGs × samples, so each feature is a ROW. axis=1 hands every
# row (one CpG across all samples) to the lambda. The previous axis=0 walked
# the columns instead and imputed with the median of each *sample*, which
# contradicts the per-feature intent.
# `apply` returns a new frame, so `methyl` itself is left untouched.
methyl_proc = methyl.apply(lambda cpg: cpg.fillna(cpg.median()), axis=1)

print("Processed mRNA shape:", mrna_proc.shape)
print("Processed methylation shape:", methyl_proc.shape)

Processed mRNA shape: (1000, 100)
Processed methylation shape: (500, 100)


In [5]:
# Cell 4 — PCA utilities & embeddings

def compute_pca_from_matrix(features_df, n_components=50, scale=True):
    """
    Project a (features × samples) matrix onto its leading principal components.

    Parameters
    ----------
    features_df : DataFrame laid out as features × samples (e.g. genes × samples).
    n_components : requested number of components. It is capped at
        min(n_samples, n_features), the largest value PCA accepts.
    scale : when True, every feature is standardised with StandardScaler
        before PCA.

    Returns
    -------
    np.ndarray of shape (n_samples, n_components_used).

    Also prints the input/output shapes and the total explained variance.
    """
    # transpose to samples × features
    X = features_df.T.values

    if scale:
        X = StandardScaler().fit_transform(X)

    # PCA cannot return more components than min(n_samples, n_features).
    # Capping only by the sample count raised ValueError whenever the matrix
    # had fewer features than the requested number of components.
    n_comp = min(n_components, *X.shape)
    pca = PCA(n_components=n_comp, random_state=SEED)
    X_pca = pca.fit_transform(X)

    print(f"PCA: input {features_df.shape} -> embedding {X_pca.shape}, "
          f"explained variance {pca.explained_variance_ratio_.sum():.2%}")
    return X_pca

# One 50-component PCA embedding per modality (mRNA first, then methylation).
rna_pca, meth_pca = (
    compute_pca_from_matrix(_omics, n_components=50)
    for _omics in (mrna_proc, methyl_proc)
)

# Early fusion: place the two embeddings side by side, one row per sample.
multi_pca = np.hstack((rna_pca, meth_pca))

for _caption, _embedding in (
    ("RNA PCA shape:", rna_pca),
    ("Methyl PCA shape:", meth_pca),
    ("Multimodal PCA shape:", multi_pca),
):
    print(_caption, _embedding.shape)

PCA: input (1000, 100) -> embedding (100, 50), explained variance 63.02%
PCA: input (500, 100) -> embedding (100, 50), explained variance 69.07%
RNA PCA shape: (100, 50)
Methyl PCA shape: (100, 50)
Multimodal PCA shape: (100, 100)


In [6]:
# Cell 5 — VAE definition

class VAE(nn.Module):
    """
    Fully connected variational autoencoder.

    Encoder: input_dim -> 256 -> 128 (ReLU after each layer), followed by two
    linear heads that output the posterior mean and log-variance.
    Decoder: latent_dim -> 128 -> 256 -> input_dim, with a linear final layer.
    """

    def __init__(self, input_dim, latent_dim=16):
        super().__init__()
        wide, narrow = 256, 128

        # Layers are created in the same order as they are used so that
        # seeded weight initialisation is reproducible.
        encoder_layers = [
            nn.Linear(input_dim, wide),
            nn.ReLU(),
            nn.Linear(wide, narrow),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Posterior parameter heads.
        self.mu = nn.Linear(narrow, latent_dim)
        self.logvar = nn.Linear(narrow, latent_dim)

        decoder_layers = [
            nn.Linear(latent_dim, narrow),
            nn.ReLU(),
            nn.Linear(narrow, wide),
            nn.ReLU(),
            nn.Linear(wide, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def encode(self, x):
        """Return (mu, logvar) of the approximate posterior for input x."""
        hidden = self.encoder(x)
        return self.mu(hidden), self.logvar(hidden)

    def reparameterize(self, mu, logvar):
        """Draw z = mu + eps * std with eps ~ N(0, I) and std = exp(0.5 * logvar)."""
        sigma = (0.5 * logvar).exp()
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def decode(self, z):
        """Map a latent vector back to input space."""
        return self.decoder(z)

    def forward(self, x):
        """Encode, sample, decode. Returns (recon, mu, logvar, z)."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar, z

def vae_loss(recon_x, x, mu, logvar, beta=1.0):
    """
    VAE objective: mean-squared reconstruction error plus beta times the KL term.

    Both terms are averaged over all elements (mean reduction), so the
    returned scalar is a mean, not a sum over samples or dimensions.
    """
    reconstruction = nn.functional.mse_loss(recon_x, x)
    kl_term = (1 + logvar - mu.pow(2) - logvar.exp()).mean() * -0.5
    return reconstruction + beta * kl_term

In [7]:
# Cell 6 — Train VAE on multi_pca
#
# Fits the VAE on the fused PCA embedding with Adam for a fixed number of
# epochs and prints the per-sample average loss on the first and every fifth
# epoch. Every row of multi_pca is used for training; no validation split is
# made in this cell.

# Use a GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Wrap the embedding as a float32 tensor dataset; batches are drawn in
# shuffled order.
X_multi = torch.tensor(multi_pca, dtype=torch.float32)
dataset = TensorDataset(X_multi)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Model width follows the embedding; the latent size is fixed at 16.
input_dim = multi_pca.shape[1]
latent_dim = 16

vae = VAE(input_dim=input_dim, latent_dim=latent_dim).to(device)
optimizer = torch.optim.Adam(vae.parameters(), lr=1e-3)

n_epochs = 50

vae.train()
for epoch in range(1, n_epochs + 1):
    epoch_loss = 0.0
    # TensorDataset yields 1-tuples, hence the `(batch,)` unpacking.
    for (batch,) in loader:
        batch = batch.to(device)
        recon, mu, logvar, z = vae(batch)
        loss = vae_loss(recon, batch, mu, logvar, beta=1.0)

        # Standard step: clear old gradients, backpropagate, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # vae_loss returns a batch mean; weighting by the batch size makes the
        # division below a per-sample average even if the last batch is
        # smaller than the others.
        epoch_loss += loss.item() * batch.size(0)

    epoch_loss /= len(dataset)
    # Log the first epoch and every fifth epoch.
    if epoch % 5 == 0 or epoch == 1:
        print(f"Epoch {epoch:3d} / {n_epochs}  Loss: {epoch_loss:.4f}")

Using device: cpu
Epoch   1 / 50  Loss: 9.8258
Epoch   5 / 50  Loss: 9.7765
Epoch  10 / 50  Loss: 9.7618
Epoch  15 / 50  Loss: 9.7572
Epoch  20 / 50  Loss: 9.7530
Epoch  25 / 50  Loss: 9.7563
Epoch  30 / 50  Loss: 9.7412
Epoch  35 / 50  Loss: 9.7056
Epoch  40 / 50  Loss: 9.6423
Epoch  45 / 50  Loss: 9.6196
Epoch  50 / 50  Loss: 9.5272


In [8]:
# Cell 7 — Get VAE latent embeddings

# Inference only: eval mode, and no autograd bookkeeping for the forward pass.
vae.eval()
with torch.no_grad():
    X_multi_t = X_multi.to(device)
    mu, logvar = vae.encode(X_multi_t)

# The posterior mean is used as the embedding; logvar is not used further here.
z_latent = mu.cpu().numpy()

print("VAE latent shape:", z_latent.shape)

VAE latent shape: (100, 16)


In [9]:
# Cell 8 — Evaluation helper (Random Forest + F1)

def eval_random_forest(X, labels, name):
    """
    Train and score a Random Forest on one embedding with a stratified 80/20 split.

    Parameters
    ----------
    X : np.array (n_samples × n_features), rows in the same order as `labels`.
    labels : pd.Series with tumor_stage; rows whose label is missing are dropped.
    name : title printed above the report.

    Returns
    -------
    (clf, f1) : the fitted RandomForestClassifier and the weighted F1 score
    on the held-out 20 %.

    Prints the weighted F1 and the per-class classification report.
    """
    # Convert the mask to a plain boolean array so that indexing the NumPy
    # matrix is purely positional and does not depend on the Series index.
    keep = labels.notna().to_numpy()
    X = X[keep]
    y = labels[keep]

    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    # Encoded ids of every known class, in the same order as le.classes_.
    class_ids = np.arange(len(le.classes_))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_enc,
        test_size=0.2,
        random_state=SEED,
        stratify=y_enc
    )

    clf = RandomForestClassifier(
        n_estimators=300,
        class_weight="balanced",
        random_state=SEED
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # zero_division=0 keeps the value scikit-learn already used for classes
    # that are never predicted, but without an UndefinedMetricWarning for
    # each such class.
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    print(f"\n===== {name} =====")
    print("Weighted F1:", round(f1, 4))
    # Passing `labels` pins the report rows to all known classes, so
    # `target_names` always has the matching length even when a rare class is
    # absent from both the test labels and the predictions.
    print(classification_report(
        y_test, y_pred,
        labels=class_ids,
        target_names=le.classes_,
        zero_division=0,
    ))

    return clf, f1

In [10]:
# Cell 9 — Compare embeddings

models = {}
scores = {}

# One row per representation: (model key, score key, embedding, report title).
_runs = [
    ("RF_RNA",   "RNA PCA",        rna_pca,   "RNA PCA"),
    ("RF_METH",  "Methyl PCA",     meth_pca,  "Methylation PCA"),
    ("RF_MULTI", "Multimodal PCA", multi_pca, "Multimodal (RNA+Methyl) PCA"),
    ("RF_VAE",   "VAE latent",     z_latent,  "VAE latent (on multimodal PCA)"),
]
for _model_key, _score_key, _embedding, _title in _runs:
    models[_model_key], scores[_score_key] = eval_random_forest(_embedding, labels, _title)

# Side-by-side recap of the weighted F1 of every representation.
print("\n===== Summary of Weighted F1 Scores =====")
for k, v in scores.items():
    print(f"{k:25s}: {v:.4f}")


===== RNA PCA =====
Weighted F1: 0.2585
              precision    recall  f1-score   support

     Stage I       0.00      0.00      0.00         4
    Stage II       0.25      0.38      0.30         8
   Stage III       0.43      0.50      0.46         6
    Stage IV       0.00      0.00      0.00         2

    accuracy                           0.30        20
   macro avg       0.17      0.22      0.19        20
weighted avg       0.23      0.30      0.26        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



===== Methylation PCA =====
Weighted F1: 0.3287
              precision    recall  f1-score   support

     Stage I       0.00      0.00      0.00         4
    Stage II       0.40      0.75      0.52         8
   Stage III       0.50      0.33      0.40         6
    Stage IV       0.00      0.00      0.00         2

    accuracy                           0.40        20
   macro avg       0.23      0.27      0.23        20
weighted avg       0.31      0.40      0.33        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



===== Multimodal (RNA+Methyl) PCA =====
Weighted F1: 0.2549
              precision    recall  f1-score   support

     Stage I       0.00      0.00      0.00         4
    Stage II       0.27      0.38      0.32         8
   Stage III       0.38      0.50      0.43         6
    Stage IV       0.00      0.00      0.00         2

    accuracy                           0.30        20
   macro avg       0.16      0.22      0.19        20
weighted avg       0.22      0.30      0.25        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



===== VAE latent (on multimodal PCA) =====
Weighted F1: 0.1487
              precision    recall  f1-score   support

     Stage I       0.00      0.00      0.00         4
    Stage II       0.22      0.25      0.24         8
   Stage III       0.20      0.17      0.18         6
    Stage IV       0.00      0.00      0.00         2

    accuracy                           0.15        20
   macro avg       0.11      0.10      0.10        20
weighted avg       0.15      0.15      0.15        20


===== Summary of Weighted F1 Scores =====
RNA PCA                  : 0.2585
Methyl PCA               : 0.3287
Multimodal PCA           : 0.2549
VAE latent               : 0.1487


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
# Cell 10 — Optional: Save embeddings to disk

# All arrays go into an "embeddings" sub-folder of the data directory.
OUT_DIR = os.path.join(DATA_DIR, "embeddings")
os.makedirs(OUT_DIR, exist_ok=True)

# File name -> array; each is written as a NumPy .npy file.
for _fname, _array in (
    ("rna_pca.npy", rna_pca),
    ("meth_pca.npy", meth_pca),
    ("multi_pca.npy", multi_pca),
    ("vae_latent.npy", z_latent),
):
    np.save(os.path.join(OUT_DIR, _fname), _array)

print("Saved embeddings to:", OUT_DIR)

Saved embeddings to: ./tcga_data\embeddings
