# 02 - Intersect, Align, and Scale (TCGA ↔ METABRIC)

**Goal:** Put TCGA (logCPM) and METABRIC (microarray) on the **same gene space** and a **comparable scale** for cross-cohort modeling.  
**Inputs:**  
- `data_proc/tcga_expr_logcpm.parquet`  
- `data_proc/tcga_labels.tsv`  
- `data_proc/metabric_expr_raw.parquet`   
- `data_proc/metabric_labels.tsv`

**Outputs (to be created):**  
- `data_proc/aligned/tcga_expr_aligned.parquet`  
- `data_proc/aligned/metabric_expr_aligned.parquet`  
- `data_proc/aligned/scaler_tcga_stats.json` (if z-scoring)  
- `data_proc/aligned/tcga_expr_z.parquet`, `data_proc/aligned/metabric_expr_z.parquet` (when we scale)

**Provenance:** Conda env `tcga-brca-survival-project` | Date: <fill> | Author: <`Amith Murikinati`>

In [1]:
#core
import os, json, sys, math, time, gc
from pathlib import Path

#data stack
import numpy as np
import pandas as pd

# display options (clean tables)
pd.set_option("display.width", 160)
pd.set_option("display.max_columns", 50)

# reproducibility (used later for splits/models)
SEED = 42
rng = np.random.default_rng(SEED)

print("Python:", sys.version.split()[0])
print("Pandas:", pd.__version__)
print("NumPy:", np.__version__)

Python: 3.11.13
Pandas: 2.3.2
NumPy: 2.3.3


In [2]:
# paths relative to repo root
REPO = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
DATA_PROC = REPO / "data_proc"
ALIGNED = DATA_PROC / "aligned"
ALIGNED.mkdir(parents=True, exist_ok=True)

# expected inputs
P_TCGA_X = DATA_PROC / "tcga_expr_logcpm.parquet"
P_TCGA_Y = DATA_PROC / "tcga_labels.tsv"

# you may have 'metabric_expr_raw.parquet' -we pick whichever exists
P_MB_X = DATA_PROC / "metabric_expr_raw.parquet"
P_MB_Y = DATA_PROC / "metabric_labels.tsv"

print("Repo:", REPO)
print("Exists TCGA X?:", P_TCGA_X.exists())
print("Exists TCGA Y?:", P_TCGA_Y.exists())
print("Exists METABRIC X?:", P_MB_X.exists(), "->", P_MB_X.name)
print("Exists METABRIC Y?:", P_MB_Y.exists())
print("Aligned out dir:", ALIGNED)

Repo: C:\Users\mailt\Desktop\DSprojects2025\tcga-brca-survival-project
Exists TCGA X?: True
Exists TCGA Y?: True
Exists METABRIC X?: True -> metabric_expr_raw.parquet
Exists METABRIC Y?: True
Aligned out dir: C:\Users\mailt\Desktop\DSprojects2025\tcga-brca-survival-project\data_proc\aligned


In [3]:
#load label tables (small)
tcga_y = pd.read_csv(P_TCGA_Y, sep="\t")
mb_y = pd.read_csv(P_MB_Y, sep="\t")
print("TCGA labels:", tcga_y.shape, "| cols:", tcga_y.columns.tolist())
print("MB labels:", mb_y.shape, "| cols:", mb_y.columns.tolist())

# fast load: full matrices (OK for your machine); if memory is tight, we can switch to chunked
tcga_X = pd.read_parquet(P_TCGA_X)
mb_X   = pd.read_parquet(P_MB_X)

print("TCGA X:", tcga_X.shape, tcga_X.dtypes.iloc[:3].tolist())
print("MB   X:", mb_X.shape,   mb_X.dtypes.iloc[:3].tolist())

# basic cross-checks (no action yet; just info)
print("TCGA genes:", tcga_X.index.nunique(), "| samples:", tcga_X.shape[1])
print("MB   genes:", mb_X.index.nunique(),   "| samples:", mb_X.shape[1])

TCGA labels: (1094, 3) | cols: ['SAMPLE_ID', 'os_event', 'os_time_months']
MB labels: (1980, 4) | cols: ['SAMPLE_ID', 'PATIENT_ID', 'os_event', 'os_time_months']
TCGA X: (59427, 1094) [dtype('float32'), dtype('float32'), dtype('float32')]
MB   X: (20385, 1980) [dtype('float32'), dtype('float32'), dtype('float32')]
TCGA genes: 59427 | samples: 1094
MB   genes: 20385 | samples: 1980


In [4]:
# parameters for the upcoming step (we won't execute the transform yet)
SCALING_METHOD = "zscore"   # options later: "zscore" or "qnz" (quantile-normalize-to-TCGA + z)
TRAIN_SOURCE   = "TCGA"     # fit stats on TCGA, apply to METABRIC
SAVE_TAG       = "v1"       # bump when iterating

print(f"Config -> scaling={SCALING_METHOD}, fit_on={TRAIN_SOURCE}, tag={SAVE_TAG}")

Config -> scaling=zscore, fit_on=TCGA, tag=v1


In [8]:
#helper functions
def get_common_genes(X_a: pd.DataFrame, X_b: pd.DataFrame):
    """Return sorted list of shared gene symbols (row index) between two matrices."""
    # ensure uppercase (defensive; your data should already be uppercase)
    ga = pd.Index([str(x).upper() for x in X_a.index])
    gb = pd.Index([str(x).upper() for x in X_b.index])

    # enforce uniqueness (you collapsed duplicates earlier: assert here)
    assert ga.is_unique and gb.is_unique, "Gene index has duplicates-collapse first"
    inter = sorted(list(set(ga) & set(gb)))
    return inter

def align_by_genes(X: pd.DataFrame, genes: list):
    """Subset and reorder rows of X to 'genes' order."""
    missing = [g for g in genes if g not in X.index]
    assert not missing, f"{len(missing)} genes missing from matrix"
    return X.loc[genes]

def fit_zstats(X_train: pd.DataFrame, eps: float = 1e-6):
    """Fit per-gene mean/std on training cohort (rows=genes); return (mean, std) as pd.Series."""
    mu = X_train.mean(axis=1)                   # per-gene mean across samples
    std = X_train.std(axis=1, ddof=0).clip(eps)  # per-gene std, guard tiny std
    return mu.astype("float32"), std.astype("float32")

def apply_z(X: pd.DataFrame, mu: pd.Series, std: pd.Series):
    """Apply (X - mu)/std with index alignment."""
    # Ensure alignment by index (genes)
    Xc = X.copy()
    Xc = Xc.subtract(mu, axis=0).divide(std, axis=0)
    return Xc.astype("float32")

In [11]:
# Intersect & Align (save aligned matrices)

# 1) find common genes
common_genes = get_common_genes(tcga_X, mb_X)
print("Common genes:", len(common_genes))
assert len(common_genes) > 15000, "Too few shared genes-check symbol casing or collapse step"

# 2) align both matrices to identical gene order
tcga_Xa = align_by_genes(tcga_X, common_genes)
mb_Xa = align_by_genes(mb_X, common_genes)

print("Aligned shapes -> TCGA:", tcga_Xa.shape, "| MB:", mb_Xa.shape)
assert list(tcga_Xa.index) == list(mb_Xa.index), "Gene order mismatch after alignment"

# 3) save aligned matrices
P_TCGA_ALN = ALIGNED / "tcga_expr_aligned.parquet"
P_MB_ALN = ALIGNED / "metabric_expr_aligned.parquet"
tcga_Xa.to_parquet(P_TCGA_ALN, index=True)
mb_Xa.to_parquet(P_MB_ALN, index=True)

print("Saved aligned ->", P_TCGA_ALN.name, ",", P_MB_ALN.name)

Common genes: 19451
Aligned shapes -> TCGA: (19451, 1094) | MB: (19451, 1980)
Saved aligned -> tcga_expr_aligned.parquet , metabric_expr_aligned.parquet


In [12]:
# TCGA-fit z-scaling (save scaler + z-scored matrices)

# 1) fit stats on TCGA (full TCGA for now; for CV we'll later fit on train folds)
mu_tcga, sd_tcga = fit_zstats(tcga_Xa)
print("Fitted z-stats on TCGA:", mu_tcga.shape, sd_tcga.shape)

# 2) apply to TCGA and METABRIC
tcga_Z = apply_z(tcga_Xa, mu_tcga, sd_tcga)
mb_Z = apply_z(mb_Xa, mu_tcga, sd_tcga)

print("Z-shapes -> TCGA:", tcga_Z.shape, "| MB:", mb_Z.shape)
print("TCGA Z summary (mean±sd across all entries):", float(tcga_Z.values.mean()), float(tcga_Z.values.std()))
print("MB   Z summary (mean±sd across all entries):", float(mb_Z.values.mean()), float(mb_Z.values.std()))

# 3) save scaler stats (TSV) and z-matrices (Parquet)
P_SCALER = ALIGNED / "tcga_scaler_stats.tsv"
pd.DataFrame({"SYMBOL": tcga_Xa.index, "mean": mu_tcga.values, "std": sd_tcga.values}).to_csv(P_SCALER, sep="\t", index=False)

P_TCGA_Z = ALIGNED / "tcga_expr_z_v1.parquet"
P_MB_Z = ALIGNED / "metabric_expr_z_v1.parquet"
tcga_Z.to_parquet(P_TCGA_Z, index=True)
mb_Z.to_parquet(P_MB_Z, index=True)

print("Saved scaler ->", P_SCALER.name)
print("Saved Z-matrices ->", P_TCGA_Z.name, ",", P_MB_Z.name)

Fitted z-stats on TCGA: (19451,) (19451,)
Z-shapes -> TCGA: (19451, 1094) | MB: (19451, 1980)
TCGA Z summary (mean±sd across all entries): -2.8086150294370782e-08 0.9972972869873047
MB   Z summary (mean±sd across all entries): nan nan
Saved scaler -> tcga_scaler_stats.tsv
Saved Z-matrices -> tcga_expr_z_v1.parquet , metabric_expr_z_v1.parquet


In [13]:
# Label alignment sanity(quick asserts; no saves)

# TCGA labels order should already match columns; verify
assert list(tcga_y["SAMPLE_ID"]) == list(tcga_X.columns), "TCGA labels not aligned to original X columns"
assert list(tcga_y["SAMPLE_ID"]) == list(tcga_Xa.columns), "TCGA labels not aligned after gene alignment"
assert list(tcga_y["SAMPLE_ID"]) == list(tcga_Z.columns),  "TCGA labels not aligned after z-scaling"

# METABRIC labels check
assert list(mb_y["SAMPLE_ID"]) == list(mb_X.columns), "METABRIC labels not aligned to original X columns"
assert list(mb_y["SAMPLE_ID"]) == list(mb_Xa.columns), "METABRIC labels not aligned after gene alignment"
assert list(mb_y["SAMPLE_ID"]) == list(mb_Z.columns),  "METABRIC labels not aligned after z-scaling"

print("Label alignment OK ✅  |  TCGA:", tcga_y.shape, " MB:", mb_y.shape)

Label alignment OK ✅  |  TCGA: (1094, 3)  MB: (1980, 4)
