# PCA vs SVD Ensemble Pipeline

This notebook implements a comprehensive pipeline that:
1. Tests both PCA and SVD dimensionality reduction (256 components)
2. Uses Optuna hyperparameter tuning for each method
3. Trains the final model with 5 different random seeds and averages their predictions for robustness
4. Ensembles predictions using multiple strategies
5. Compares correlation and performance between methods

In [15]:
# =========================
# 0) Imports & constants
# =========================
import os, re, glob, time, json
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from numpy.linalg import norm
import warnings
warnings.filterwarnings('ignore')

# Base random seed: used by PCA / TruncatedSVD, the train/validation split and BASE_PARAMS_GPU.
SEED = 42
# Score columns that are predicted jointly (one multi-target model).
TARGET_COLS = [
    "task_achievement",
    "coherence_and_cohesion", 
    "lexical_resource",
    "grammatical_range",
]
N_COMPONENTS = 256           # Embedding dimensions kept by PCA / TruncatedSVD
ANCHOR_DIR = "./Anchors"     # Folder scanned for anchor submissions named "<digits>.csv"
OPTUNA_TRIALS = 200          # Reduced for efficiency
TEMP_SOFTMAX = 20.0          # Softmax temperature applied to anchor cosine similarities
TOPK_ANCHORS = 5             # Number of most-similar anchors used to estimate the LB score
VAL_SIZE = 0.2               # Fraction of complete-case rows held out for validation
N_SEEDS = 5                  # Number of seeds for bagging

# Load data
# Embedding CSVs feed the dimensionality reduction; the *_feature_eng CSVs carry the
# engineered features and (for train) the target columns.
# NOTE(review): rows of the embedding and feature files are combined by position later on —
# confirm both files share the same row order.
train_emb = pd.read_csv("train_embeddings.csv")
test_emb  = pd.read_csv("test_embeddings.csv")
train_df  = pd.read_csv("train_feature_eng.csv")
test_df   = pd.read_csv("test_feature_eng.csv")

print("Data loaded successfully!")
print(f"Train embeddings shape: {train_emb.shape}")
print(f"Test embeddings shape: {test_emb.shape}")

Data loaded successfully!
Train embeddings shape: (9912, 4096)
Test embeddings shape: (473, 4096)


In [16]:
# Feature engineering: lexical-diversity, syntactic, semantic and stylistic essay features
import numpy as np
import pandas as pd
import re
import nltk
from nltk import word_tokenize, pos_tag
from collections import Counter
from sentence_transformers import SentenceTransformer, util
from nltk.corpus import wordnet as wn

# WordNet data backs avg_hypernym_depth (wn.synsets).
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)
# word_tokenize / sent_tokenize need the Punkt sentence models, which the original cell
# never fetched, so a fresh environment failed with a LookupError. Newer NLTK releases
# name the resource "punkt_tab", older ones "punkt"; both are requested so either works.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# ---------- Helper: Lexical sophistication ----------
import requests

# Word list used as the "common vocabulary" for lexical_sophistication.
# Only the first 2000 whitespace-separated entries are kept
# (presumably the list is frequency-ordered — TODO confirm against the source repo).
url = "https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt"
# A timeout keeps the kernel from hanging forever if the host is unreachable.
response = requests.get(url, timeout=30)
response.raise_for_status()
COMMON_WORDS = set(response.text.split()[:2000])

def lexical_sophistication(tokens):
    """Return the share of tokens whose lowercase form is not in COMMON_WORDS.

    Returns 0 when `tokens` is empty.
    """
    uncommon_total = sum(1 for tok in tokens if tok.lower() not in COMMON_WORDS)
    if not tokens:
        return 0
    return uncommon_total / len(tokens)

# ---------- Helper: Root & Corrected TTR ----------
def root_ttr(tokens):
    """Root type-token ratio: distinct tokens divided by sqrt(token count); 0 if empty."""
    if not tokens:
        return 0
    distinct = len(set(tokens))
    return distinct / np.sqrt(len(tokens))

def corrected_ttr(tokens):
    """Corrected type-token ratio: distinct tokens divided by sqrt(2 * token count); 0 if empty."""
    if not tokens:
        return 0
    distinct = len(set(tokens))
    return distinct / np.sqrt(2 * len(tokens))

# ---------- Helper: MATTR ----------
def mattr(tokens, window_size=50):
    """Moving-average type-token ratio.

    Args:
        tokens: sequence of word tokens.
        window_size: number of consecutive tokens per window.

    Returns:
        0 for an empty sequence (consistent with the other lexical helpers, and avoiding
        the ZeroDivisionError the plain ratio would raise); the plain type-token ratio
        when there are fewer tokens than one window; otherwise the mean type-token ratio
        over every sliding window of `window_size` tokens.
    """
    if not tokens:
        return 0
    if len(tokens) < window_size:
        return len(set(tokens)) / len(tokens)
    scores = [
        len(set(tokens[start:start + window_size])) / window_size
        for start in range(len(tokens) - window_size + 1)
    ]
    return np.mean(scores)

# ---------- Helpers: syntax, semantics and style ----------
def complex_sentence_ratio(text):
    """Fraction of sentences that contain at least two of the listed connectives.

    Sentences come from nltk.sent_tokenize; matching is done on the lowercased
    sentence. Returns 0 when no sentence is found.
    """
    connective_pattern = r"\b(and|but|or|because|which|although)\b"
    sentences = nltk.sent_tokenize(text)
    n_complex = 0
    for sentence in sentences:
        if len(re.findall(connective_pattern, sentence.lower())) >= 2:
            n_complex += 1
    if not sentences:
        return 0
    return n_complex / len(sentences)

def avg_hypernym_depth(tokens):
    """Mean over tokens of the largest `min_depth()` among each token's WordNet synsets.

    Tokens without any synset are skipped; returns 0 if no token has a synset.
    """
    per_token_depth = [
        max(sense.min_depth() for sense in senses)
        for senses in (wn.synsets(tok) for tok in tokens)
        if senses
    ]
    if not per_token_depth:
        return 0
    return np.mean(per_token_depth)

def topic_drift(text, model):
    """Return 1 - cosine similarity between the first and last non-blank line of `text`.

    `text` is split on newlines and blank pieces are dropped; with fewer than two
    remaining pieces the drift is 0. `model` must expose
    `encode(list_of_str, convert_to_tensor=True)`.
    """
    stripped_lines = (line.strip() for line in text.split("\n"))
    paragraphs = [line for line in stripped_lines if line]
    if len(paragraphs) < 2:
        return 0
    first_emb = model.encode([paragraphs[0]], convert_to_tensor=True)
    last_emb = model.encode([paragraphs[-1]], convert_to_tensor=True)
    similarity = util.cos_sim(first_emb, last_emb).item()
    return 1 - similarity

def alliteration_ratio(tokens):
    """Fraction of adjacent token pairs that start with the same letter (case-insensitive).

    Returns 0 when there are fewer than two tokens.
    """
    n_pairs = len(tokens) - 1
    if n_pairs < 1:
        return 0
    matches = 0
    for idx in range(n_pairs):
        if tokens[idx][0].lower() == tokens[idx + 1][0].lower():
            matches += 1
    return matches / n_pairs

# Words counted as hedges (matched case-insensitively).
HEDGING_WORDS = {"might", "maybe", "perhaps", "possibly", "could", "should"}
def hedging_count(tokens):
    """Number of tokens whose lowercase form is in HEDGING_WORDS."""
    return len([tok for tok in tokens if tok.lower() in HEDGING_WORDS])

# Words counted as emotive (matched case-insensitively).
EMOTIVE_WORDS = {"love", "hate", "happy", "sad", "angry", "excited", "worried", "proud"}
def emotive_count(tokens):
    """Number of tokens whose lowercase form is in EMOTIVE_WORDS."""
    return len([tok for tok in tokens if tok.lower() in EMOTIVE_WORDS])

def add_extra_features(df, text_col="essay_clean", prompt_col="prompt_clean"):
    """Return a copy of `df` with ten extra text-derived feature columns appended.

    Args:
        df: frame holding one essay per row.
        text_col: column with the essay text to analyse.
        prompt_col: kept for interface compatibility; the prompt text is not used by
            any of the current features.

    Returns:
        A new DataFrame: the original columns followed by root_ttr, corrected_ttr,
        mattr, lexical_sophistication, complex_sentence_ratio, avg_hypernym_depth,
        topic_drift, alliteration_ratio, hedging_count and emotive_count.

    Note:
        Calling this twice on the same frame appends the columns twice.
    """
    sbert = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
    feature_names = [
        "root_ttr", "corrected_ttr", "mattr", "lexical_sophistication",
        "complex_sentence_ratio", "avg_hypernym_depth", "topic_drift",
        "alliteration_ratio", "hedging_count", "emotive_count",
    ]

    rows = []
    for text in df[text_col]:
        # Lexical features only look at purely alphabetic tokens.
        tokens = [t for t in word_tokenize(text) if t.isalpha()]
        rows.append({
            "root_ttr": root_ttr(tokens),
            "corrected_ttr": corrected_ttr(tokens),
            "mattr": mattr(tokens),
            "lexical_sophistication": lexical_sophistication(tokens),
            "complex_sentence_ratio": complex_sentence_ratio(text),
            "avg_hypernym_depth": avg_hypernym_depth(tokens),
            "topic_drift": topic_drift(text, sbert),
            "alliteration_ratio": alliteration_ratio(tokens),
            "hedging_count": hedging_count(tokens),
            "emotive_count": emotive_count(tokens),
        })

    # Reuse df's index: a fresh RangeIndex would make concat(axis=1) align on labels and
    # scatter the features into NaN-padded rows whenever df is filtered or re-indexed.
    extra = pd.DataFrame(rows, columns=feature_names, index=df.index)
    return pd.concat([df, extra], axis=1)

print("Adding extra features...")
# NOTE: not idempotent — add_extra_features concatenates its feature columns onto the frame
# it receives and the result is rebound to the same name, so re-running this cell without
# reloading the CSVs duplicates those columns.
train_df = add_extra_features(train_df)
test_df = add_extra_features(test_df)
print("Extra features added!")

Adding extra features...
Extra features added!
Extra features added!


In [17]:
# =========================
# 1) Dimensionality Reduction Functions
# =========================

def apply_pca_reduction(train_emb, test_emb, n_components=N_COMPONENTS):
    """Fit PCA on the training embeddings and project both splits.

    Returns:
        (train_reduced, test_reduced, explained_var) where explained_var is the sum of
        the fitted explained-variance ratios.
    """
    reducer = PCA(random_state=SEED, n_components=n_components)
    projected_train = reducer.fit_transform(train_emb)
    projected_test = reducer.transform(test_emb)
    variance_kept = reducer.explained_variance_ratio_.sum()
    print(f"[PCA] explained_variance_sum = {variance_kept:.4f}")
    return projected_train, projected_test, variance_kept

def apply_svd_reduction(train_emb, test_emb, n_components=N_COMPONENTS):
    """Fit TruncatedSVD on the training embeddings and project both splits.

    Returns:
        (train_reduced, test_reduced, explained_var) where explained_var is the sum of
        the fitted explained-variance ratios.
    """
    reducer = TruncatedSVD(random_state=SEED, n_components=n_components)
    projected_train = reducer.fit_transform(train_emb)
    projected_test = reducer.transform(test_emb)
    variance_kept = reducer.explained_variance_ratio_.sum()
    print(f"[SVD] explained_variance_sum = {variance_kept:.4f}")
    return projected_train, projected_test, variance_kept

def prepare_dataset(train_df, test_df, train_emb_reduced, test_emb_reduced, method_name="emb"):
    """Build float32 model matrices from engineered features plus reduced embeddings.

    Args:
        train_df, test_df: feature frames; raw-text columns and the target columns are
            dropped if present.
        train_emb_reduced, test_emb_reduced: 2-D reduced embeddings, one row per row of
            the corresponding frame (combined by position).
        method_name: prefix for the generated embedding column names.

    Returns:
        (X_mat, Y_mat, X_test) as float32 arrays; Y_mat keeps NaN for missing targets.
    """
    y = train_df[TARGET_COLS].copy()
    drop_cols = ["prompt","essay","essay_clean","prompt_clean","merged_text"] + TARGET_COLS

    # reset_index so the positional concat with the embedding frames lines up row by row.
    X_train_core = train_df.drop(columns=drop_cols, errors="ignore").reset_index(drop=True)
    X_test_core = test_df.drop(columns=drop_cols, errors="ignore").reset_index(drop=True)

    # Name the columns from the actual reduced width rather than the global N_COMPONENTS,
    # so reductions run with a non-default n_components do not fail on a length mismatch.
    n_train_dims = np.shape(train_emb_reduced)[1]
    n_test_dims = np.shape(test_emb_reduced)[1]
    train_emb_df = pd.DataFrame(train_emb_reduced, columns=[f"{method_name}_{i}" for i in range(n_train_dims)])
    test_emb_df = pd.DataFrame(test_emb_reduced, columns=[f"{method_name}_{i}" for i in range(n_test_dims)])

    train_combined = pd.concat([X_train_core, train_emb_df], axis=1)
    test_combined = pd.concat([X_test_core, test_emb_df], axis=1)

    X_mat = train_combined.values.astype(np.float32)
    Y_mat = y.values.astype(np.float32)
    X_test = test_combined.values.astype(np.float32)

    return X_mat, Y_mat, X_test

print("Dimensionality reduction functions ready!")

Dimensionality reduction functions ready!


In [18]:
# =========================
# 2) Load anchors and utility functions
# =========================

def _read_pred_vec(path: str, n_expected_rows: int = None) -> np.ndarray:
    """Read a submission CSV and return its target predictions as one flat float32 vector.

    Columns whose name starts with "Unnamed" are dropped. If every TARGET_COLS column is
    present they are selected in TARGET_COLS order; otherwise the last len(TARGET_COLS)
    columns are taken and renamed to TARGET_COLS.

    Args:
        path: CSV file to read.
        n_expected_rows: if given, the row count the file must have.

    Raises:
        ValueError: if `n_expected_rows` is given and does not match the file.
    """
    df = pd.read_csv(path)
    drop_unnamed = [c for c in df.columns if str(c).startswith("Unnamed")]
    if drop_unnamed:
        df = df.drop(columns=drop_unnamed)
    # Derive the width from TARGET_COLS instead of a hard-coded 4 so the reader stays
    # correct if the target list changes.
    n_targets = len(TARGET_COLS)
    cols = [c for c in TARGET_COLS if c in df.columns]
    if len(cols) == n_targets:
        df = df[cols]
    else:
        df = df.iloc[:, -n_targets:]
        df.columns = TARGET_COLS

    # Only validate row count if n_expected_rows is provided
    if n_expected_rows is not None and len(df) != n_expected_rows:
        raise ValueError(f"{os.path.basename(path)} rows={len(df)} != expected {n_expected_rows}")

    return df.values.astype(np.float32).ravel()

def load_anchor_submissions(anchor_dir=ANCHOR_DIR, n_expected_rows=None):
    """Load anchor submissions named "<digits>.csv" from `anchor_dir`.

    The digits in the file name, divided by 100, are stored as the anchor's "lb" value.
    Files that cannot be read are reported and skipped.

    Returns:
        A list of {"name", "lb", "vec"} dicts, or an empty list when fewer than three
        anchors could be loaded.
    """
    name_pattern = re.compile(r"(\d+)\.csv$")
    loaded = []
    for csv_path in sorted(glob.glob(os.path.join(anchor_dir, "*.csv"))):
        file_name = os.path.basename(csv_path)
        hit = name_pattern.match(file_name)
        if hit is None:
            continue
        lb_score = float(hit.group(1)) / 100.0
        try:
            # Forward the optional row-count check to the reader.
            vec = _read_pred_vec(csv_path, n_expected_rows)
        except Exception as e:
            print(f"Skipping {csv_path}: {e}")
            continue
        loaded.append({"name": file_name, "lb": lb_score, "vec": vec})

    if len(loaded) >= 3:
        return loaded

    # Too few anchors: report it and return an empty list rather than raising.
    print(f"Found only {len(loaded)} valid anchors, need at least 3")
    return []

def mean_rmse_across_targets(y_true, y_pred):
    """Average the per-column RMSE, ignoring rows whose true value is NaN in that column.

    Columns with no observed true value are left out; returns inf if every column is
    left out.
    """
    per_target = []
    n_targets = y_true.shape[1]
    for col in range(n_targets):
        observed = ~np.isnan(y_true[:, col])
        if not observed.any():
            continue
        mse = mean_squared_error(y_true[observed, col], y_pred[observed, col])
        per_target.append(np.sqrt(mse))
    if not per_target:
        return float("inf")
    return float(np.mean(per_target))

def predict_lb_from_anchors(test_vec: np.ndarray, anchors, temp: float, k: int):
    """Estimate a leaderboard score as a similarity-weighted mean of anchor scores.

    Cosine similarity between `test_vec` and every anchor vector is computed, the `k`
    most similar anchors are kept, and their "lb" values are averaged with softmax
    weights exp(similarity * temp).

    Args:
        test_vec: flattened prediction vector, same length as each anchor's "vec".
        anchors: list of dicts with "name", "lb" and "vec" keys.
        temp: softmax temperature; larger values concentrate weight on the closest anchor.
        k: number of nearest anchors to use.

    Returns:
        (pred_lb, top_info) where top_info lists (name, similarity, lb) for the anchors
        used, most similar first.

    Raises:
        ValueError: if `anchors` is empty (previously this produced a silent NaN).
    """
    if not anchors:
        raise ValueError("predict_lb_from_anchors needs at least one anchor")

    tvn = norm(test_vec) + 1e-12
    sims = []
    for a in anchors:
        avn = norm(a["vec"]) + 1e-12
        sims.append(float(np.dot(test_vec, a["vec"]) / (tvn * avn)))
    sims = np.array(sims, dtype=np.float32)
    idx = np.argsort(-sims)[:min(k, len(sims))]
    sims_k = sims[idx]
    lbs_k = np.array([anchors[i]["lb"] for i in idx], dtype=np.float32)

    # Subtract the max logit before exponentiating: the weights are mathematically
    # unchanged, but exp() can no longer overflow to inf (inf / inf = NaN) at large temp.
    logits = sims_k * temp
    logits = logits - logits.max()
    w = np.exp(logits)
    w /= w.sum()

    pred_lb = float((w * lbs_k).sum())
    top_info = list(zip([anchors[i]["name"] for i in idx], sims_k, lbs_k))
    return pred_lb, top_info

# Load anchors without row validation initially
print("Loading anchors...")
# Called without n_expected_rows, so files of any length are accepted here; vector length
# is compared against the model's test prediction later, inside tune_with_optuna.
ANCHORS = load_anchor_submissions()
if ANCHORS:
    print(f"✅ Loaded {len(ANCHORS)} anchors successfully")
    for a in ANCHORS[:3]:  # Show first 3
        print(f"  - {a['name']}: LB={a['lb']:.3f}, vec_len={len(a['vec'])}")
else:
    # load_anchor_submissions returns [] when fewer than 3 anchors could be read.
    print("⚠️ No valid anchors loaded - will use validation RMSE for optimization")

Loading anchors...
✅ Loaded 6 anchors successfully
  - 11.csv: LB=0.110, vec_len=1892
  - 15.csv: LB=0.150, vec_len=1892
  - 17.csv: LB=0.170, vec_len=1892


In [19]:
# =========================
# 3) Training and Optuna functions  
# =========================
from catboost import CatBoostRegressor, Pool
import optuna

# Default CatBoost settings for GPU training; train_val_and_test_once overlays the
# per-trial parameters on top of these.
BASE_PARAMS_GPU = {
    "loss_function": "MultiRMSE",
    "eval_metric": "MultiRMSE",
    "task_type": "GPU",
    "devices": "0",
    "iterations": 2000,
    "learning_rate": 0.05,
    "depth": 8,
    "l2_leaf_reg": 3.0,
    "bootstrap_type": "Bayesian",
    "bagging_temperature": 1.0,
    "border_count": 128,
    "min_data_in_leaf": 5,
    "random_seed": SEED,
    "use_best_model": True,
    "verbose": False,
}

def train_val_and_test_once(params, X_tr, Y_tr, X_va, Y_va, X_test, es_rounds=200, gpu=True):
    """Fit one CatBoost model with early stopping and score it on the validation split.

    Args:
        params: overrides applied on top of BASE_PARAMS_GPU.
        X_tr, Y_tr: training features / targets.
        X_va, Y_va: validation features / targets (eval set for early stopping).
        X_test: test features to predict.
        es_rounds: early-stopping patience in iterations.
        gpu: train on GPU when True, on CPU otherwise.

    Returns:
        (val_rmse, test_pred): mean per-target validation RMSE and the test predictions
        flattened to a 1-D float32 vector.
    """
    merged = {**BASE_PARAMS_GPU, **params}
    if not gpu:
        # Apply the CPU switch after merging so the caller's overrides cannot undo it.
        merged["task_type"] = "CPU"
        merged.pop("devices", None)  # GPU device ids have no meaning on CPU
        # CatBoost documents Poisson bootstrap as GPU-only, so the CPU fallback would
        # fail with the tuner's parameters. Bayesian is the CPU-capable type that still
        # honours bagging_temperature.
        if merged.get("bootstrap_type") == "Poisson":
            merged["bootstrap_type"] = "Bayesian"

    m = CatBoostRegressor(**merged)
    tr_pool = Pool(X_tr, label=Y_tr)
    va_pool = Pool(X_va, label=Y_va)
    m.fit(tr_pool, eval_set=va_pool, early_stopping_rounds=es_rounds, use_best_model=True)
    va_pred = m.predict(X_va)
    val_rmse = mean_rmse_across_targets(Y_va, va_pred)
    test_pred = m.predict(X_test).astype(np.float32).ravel()
    return val_rmse, test_pred

def tune_with_optuna(X_tr, Y_tr, X_va, Y_va, X_test, n_trials=OPTUNA_TRIALS, use_anchors=True):
    """Run Optuna hyperparameter tuning.

    Each trial trains one CatBoost model through train_val_and_test_once. The trial value
    is the anchor-predicted leaderboard score when at least three loaded anchors have the
    same length as the flattened test prediction; otherwise it is the validation RMSE.
    The validation RMSE is always stored in the trial's user attrs.

    Args:
        X_tr, Y_tr, X_va, Y_va: train / validation split (complete cases).
        X_test: test features; their predictions are compared with the anchors.
        n_trials: number of Optuna trials.
        use_anchors: set False to force the validation-RMSE objective.

    Returns:
        The finished optuna Study.
    """
    # Seed the sampler (TPE is Optuna's default) so the sequence of suggested
    # hyperparameters is repeatable across kernel restarts.
    # NOTE(review): GPU training itself may still introduce run-to-run variation.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=SEED),
    )

    # Validate anchor compatibility once
    valid_anchors = []   # always bound, so the objective closure never sees an undefined name
    anchors_valid = False
    if use_anchors and ANCHORS:
        # Get expected length from a test prediction (tiny throw-away model on a data slice)
        test_params = {"iterations": 100, "depth": 6, "learning_rate": 0.1}
        try:
            _, test_vec = train_val_and_test_once(test_params, X_tr[:100], Y_tr[:100],
                                                  X_va[:50], Y_va[:50], X_test, gpu=True)
            expected_len = len(test_vec)
            valid_anchors = [a for a in ANCHORS if len(a["vec"]) == expected_len]
            if len(valid_anchors) >= 3:
                anchors_valid = True
                print(f"✅ Found {len(valid_anchors)} compatible anchors (vec_len={expected_len})")
            else:
                print(f"⚠️ Only {len(valid_anchors)} anchors match expected length {expected_len}")
        except Exception as e:
            # Best effort: any failure here just disables the anchor objective.
            print(f"⚠️ Anchor validation failed: {e}")

    def objective(trial: optuna.Trial):
        """Train one configuration and return the value Optuna minimises."""
        params = {
            "iterations": trial.suggest_int("iterations", 800, 3000),
            "depth": trial.suggest_int("depth", 4, 10),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.2, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 30.0, log=True),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.1, 2.0),
            "boosting_type": "Plain",
            "bootstrap_type": "Poisson",
            "loss_function": "MultiRMSE",
        }

        try:
            val_rmse, test_vec = train_val_and_test_once(params, X_tr, Y_tr, X_va, Y_va, X_test, gpu=True)
        except Exception as e:
            print(f"[GPU failed, falling back to CPU]: {e}")
            val_rmse, test_vec = train_val_and_test_once(params, X_tr, Y_tr, X_va, Y_va, X_test, gpu=False)

        trial.set_user_attr("val_rmse", float(val_rmse))

        # Use anchor-based objective if available and compatible
        if anchors_valid:
            try:
                pred_lb, top = predict_lb_from_anchors(test_vec, valid_anchors, temp=TEMP_SOFTMAX, k=TOPK_ANCHORS)
                trial.set_user_attr("anchor_similar", top)
                trial.set_user_attr("pred_lb", float(pred_lb))
                return float(pred_lb)
            except Exception as e:
                print(f"Anchor prediction failed: {e}")
                return float(val_rmse)
        else:
            return float(val_rmse)

    print(f"🚀 Starting optimization with {'anchor-based' if anchors_valid else 'RMSE-based'} objective")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study

print("Training functions ready!")

Training functions ready!


In [20]:
# =========================
# 4) Multi-seed training function
# =========================

def _final_gpu_params(best_params, seed):
    """CatBoost settings for a final fit on GPU, filled in from the tuned parameters."""
    return {
        "loss_function": "MultiRMSE",  # GPU compatible
        "eval_metric": "MultiRMSE",
        "task_type": "GPU",
        "devices": "0",
        "iterations": best_params.get("iterations", 2000),
        "learning_rate": best_params.get("learning_rate", 0.05),
        "depth": best_params.get("depth", 8),
        "l2_leaf_reg": best_params.get("l2_leaf_reg", 3.0),
        "bootstrap_type": "Poisson",  # GPU safe
        "bagging_temperature": best_params.get("bagging_temperature", 1.0),
        "border_count": 128,
        "min_data_in_leaf": 5,
        "random_seed": seed,
        "use_best_model": False,
        "verbose": False,
    }


def _final_cpu_params(best_params, seed):
    """CatBoost settings for a final fit on CPU, using the missing-value-aware loss."""
    return {
        "loss_function": "MultiRMSEWithMissingValues",
        "eval_metric": "MultiRMSEWithMissingValues",
        "task_type": "CPU",
        "iterations": best_params.get("iterations", 2000),
        "learning_rate": best_params.get("learning_rate", 0.05),
        "depth": best_params.get("depth", 8),
        "l2_leaf_reg": best_params.get("l2_leaf_reg", 3.0),
        "rsm": 0.93,
        "grow_policy": "SymmetricTree",
        "bootstrap_type": "Bayesian",
        "bagging_temperature": best_params.get("bagging_temperature", 1.0),
        "border_count": 128,
        "feature_border_type": "GreedyLogSum",
        "min_data_in_leaf": 5,
        "random_seed": seed,
        "use_best_model": False,
        "verbose": False,
    }


def _fit_and_predict(params, X_mat, Y_mat, X_test):
    """Fit one CatBoostRegressor on the full training data and predict the test set."""
    model = CatBoostRegressor(**params)
    model.fit(Pool(X_mat, label=Y_mat))
    return model.predict(X_test)


def _save_prediction_csv(pred, path):
    """Write predictions as TARGET_COLS columns plus a 1-based ID column."""
    frame = pd.DataFrame(pred, columns=TARGET_COLS)
    frame["ID"] = range(1, len(pred) + 1)
    frame.to_csv(path, index=False)


def train_multiple_seeds(X_mat, Y_mat, X_test, best_params, method_name, n_seeds=N_SEEDS, use_gpu_final=False):
    """Train model with multiple seeds and return predictions.

    One model per seed is fitted on the full training matrix (no validation set, no
    early stopping) with the tuned hyperparameters. Per-seed and seed-averaged
    predictions are written to "<method_name>_seed<seed>_<ts>.csv" and
    "<method_name>_averaged_<ts>.csv" in the working directory.

    Args:
        X_mat, Y_mat: full training features / targets (targets may contain NaN on the
            CPU path, which uses MultiRMSEWithMissingValues).
        X_test: test features.
        best_params: tuned hyperparameters; missing keys fall back to defaults.
        method_name: label used in log lines and output file names.
        n_seeds: number of seeds to use; at most five are available.
        use_gpu_final: train on GPU, falling back to the CPU settings if a GPU fit fails.

    Returns:
        (avg_pred, predictions): the seed-averaged prediction array and the list of
        per-seed prediction arrays.

    Raises:
        ValueError: if `n_seeds` selects no seed at all.
    """
    seeds = [42, 1, 26, 5, 2005][:n_seeds]  # Updated seeds
    if not seeds:
        raise ValueError("n_seeds must be at least 1")
    # Report the number of seeds actually used: n_seeds above 5 is capped by the list.
    n_used = len(seeds)
    predictions = []

    print(f"\n[{method_name}] Training with {n_used} seeds...")

    for i, seed in enumerate(seeds):
        print(f"  Seed {i+1}/{n_used}: {seed}")

        if use_gpu_final:
            try:
                pred = _fit_and_predict(_final_gpu_params(best_params, seed), X_mat, Y_mat, X_test)
                print(f"    ✅ Seed {seed} completed on GPU")
            except Exception as e:
                print(f"    ⚠️ GPU failed for seed {seed}, falling back to CPU: {e}")
                pred = _fit_and_predict(_final_cpu_params(best_params, seed), X_mat, Y_mat, X_test)
                print(f"    ✅ Seed {seed} completed on CPU (fallback)")
        else:
            try:
                pred = _fit_and_predict(_final_cpu_params(best_params, seed), X_mat, Y_mat, X_test)
                print(f"    ✅ Seed {seed} completed on CPU")
            except Exception as e:
                print(f"    ❌ CPU training failed for seed {seed}: {e}")
                raise  # bare raise keeps the original traceback
        predictions.append(pred)

    # Average predictions across seeds
    avg_pred = np.mean(predictions, axis=0)

    # Save individual seed predictions
    ts = int(time.time())
    for pred, seed in zip(predictions, seeds):
        pred_path = f"{method_name}_seed{seed}_{ts}.csv"
        _save_prediction_csv(pred, pred_path)
        print(f"    Saved: {pred_path}")

    # Save averaged prediction
    avg_path = f"{method_name}_averaged_{ts}.csv"
    _save_prediction_csv(avg_pred, avg_path)
    print(f"  Saved averaged: {avg_path}")

    return avg_pred, predictions

print("Multi-seed training function ready!")

Multi-seed training function ready!


In [21]:
# =========================
# 5) Main Pipeline Execution
# =========================

print("="*60)
print("STARTING PCA vs SVD ENSEMBLE PIPELINE")
print("="*60)

# Prepare train/val split (reuse same split for both methods)
y = train_df[TARGET_COLS].copy()
# True for rows where all targets are present; the SVD cell reuses this mask.
mask_cc = ~np.isnan(y.values).any(axis=1)  # Complete cases only for Optuna (GPU requires no missing values)

# Per-method outputs ("pca" / "svd"), read by the ensemble and summary cells.
results = {}

# ===== PCA PIPELINE =====
print("\n" + "="*40)
print("PCA PIPELINE")
print("="*40)

print("Applying PCA reduction...")
train_pca, test_pca, pca_var = apply_pca_reduction(train_emb, test_emb)
X_mat_pca, Y_mat_pca, X_test_pca = prepare_dataset(train_df, test_df, train_pca, test_pca, "pca")

print(f"PCA dataset shapes: X={X_mat_pca.shape}, Y={Y_mat_pca.shape}, X_test={X_test_pca.shape}")
print(f"Complete cases for Optuna: {mask_cc.sum()}/{len(mask_cc)} ({mask_cc.mean()*100:.1f}%)")

# Prepare consistent train/val split (complete cases only for GPU Optuna)
X_cc_pca = X_mat_pca[mask_cc]
Y_cc_pca = Y_mat_pca[mask_cc]
# Same test_size / random_state as the SVD cell, so both methods get the same row split.
X_tr_pca, X_va_pca, Y_tr_pca, Y_va_pca = train_test_split(
    X_cc_pca, Y_cc_pca, test_size=VAL_SIZE, random_state=SEED, shuffle=True
)

print("Running PCA Optuna tuning...")
# NOTE: the study's best value is whatever tune_with_optuna minimised — the
# anchor-predicted LB score when compatible anchors exist, otherwise validation RMSE.
study_pca = tune_with_optuna(X_tr_pca, Y_tr_pca, X_va_pca, Y_va_pca, X_test_pca)
print(f"PCA Best objective: {study_pca.best_value:.6f}")
print(f"PCA Best params: {study_pca.best_trial.params}")

# Train PCA with multiple seeds on FULL dataset (CPU can handle missing values)
print("Training PCA on full dataset (including missing target values)...")
pca_pred, pca_seeds = train_multiple_seeds(X_mat_pca, Y_mat_pca, X_test_pca, 
                                          study_pca.best_trial.params, "pca", 
                                          use_gpu_final=False)  # CPU uses MultiRMSEWithMissingValues

results['pca'] = {
    'predictions': pca_pred,
    'seed_predictions': pca_seeds,
    'best_params': study_pca.best_trial.params,
    'best_score': study_pca.best_value,
    'explained_variance': pca_var
}

STARTING PCA vs SVD ENSEMBLE PIPELINE

PCA PIPELINE
Applying PCA reduction...


[I 2025-08-12 20:51:19,482] A new study created in memory with name: no-name-6373c256-1160-4018-94a3-a68abab0fa06


[PCA] explained_variance_sum = 0.9116
PCA dataset shapes: X=(9912, 321), Y=(9912, 4), X_test=(473, 321)
Complete cases for Optuna: 8866/9912 (89.4%)
Running PCA Optuna tuning...
✅ Found 6 compatible anchors (vec_len=1892)
🚀 Starting optimization with anchor-based objective
✅ Found 6 compatible anchors (vec_len=1892)
🚀 Starting optimization with anchor-based objective


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2025-08-12 20:51:40,835] Trial 0 finished with value: 0.4970618784427643 and parameters: {'iterations': 2375, 'depth': 7, 'learning_rate': 0.008652485177391358, 'l2_leaf_reg': 0.07127734417018772, 'bagging_temperature': 1.416541673530918}. Best is trial 0 with value: 0.4970618784427643.
[I 2025-08-12 20:53:16,419] Trial 1 finished with value: 0.4975072741508484 and parameters: {'iterations': 2090, 'depth': 10, 'learning_rate': 0.0038460940471333476, 'l2_leaf_reg': 0.14910853076146235, 'bagging_temperature': 1.8435787559880488}. Best is trial 0 with value: 0.4970618784427643.
[I 2025-08-12 20:53:16,419] Trial 1 finished with value: 0.4975072741508484 and parameters: {'iterations': 2090, 'depth': 10, 'learning_rate': 0.0038460940471333476, 'l2_leaf_reg': 0.14910853076146235, 'bagging_temperature': 1.8435787559880488}. Best is trial 0 with value: 0.4970618784427643.
[I 2025-08-12 20:53:41,169] Trial 2 finished with value: 0.4977587163448334 and parameters: {'iterations': 878, 'depth': 

In [22]:
# ===== SVD PIPELINE =====
print("\n" + "="*40)
print("SVD PIPELINE") 
print("="*40)

# Mirrors the PCA cell step by step; depends on mask_cc and results defined there.
print("Applying SVD reduction...")
train_svd, test_svd, svd_var = apply_svd_reduction(train_emb, test_emb)
X_mat_svd, Y_mat_svd, X_test_svd = prepare_dataset(train_df, test_df, train_svd, test_svd, "svd")

print(f"SVD dataset shapes: X={X_mat_svd.shape}, Y={Y_mat_svd.shape}, X_test={X_test_svd.shape}")

# Use same mask for consistency (complete cases for GPU Optuna)
X_cc_svd = X_mat_svd[mask_cc]
Y_cc_svd = Y_mat_svd[mask_cc]  
# Same test_size / random_state as the PCA cell, so both methods get the same row split.
X_tr_svd, X_va_svd, Y_tr_svd, Y_va_svd = train_test_split(
    X_cc_svd, Y_cc_svd, test_size=VAL_SIZE, random_state=SEED, shuffle=True
)

print("Running SVD Optuna tuning...")
# NOTE: the study's best value is whatever tune_with_optuna minimised — the
# anchor-predicted LB score when compatible anchors exist, otherwise validation RMSE.
study_svd = tune_with_optuna(X_tr_svd, Y_tr_svd, X_va_svd, Y_va_svd, X_test_svd)
print(f"SVD Best objective: {study_svd.best_value:.6f}")
print(f"SVD Best params: {study_svd.best_trial.params}")

# Train SVD with multiple seeds on FULL dataset (CPU can handle missing values)
print("Training SVD on full dataset (including missing target values)...")
svd_pred, svd_seeds = train_multiple_seeds(X_mat_svd, Y_mat_svd, X_test_svd,
                                          study_svd.best_trial.params, "svd",
                                          use_gpu_final=False)  # CPU uses MultiRMSEWithMissingValues

results['svd'] = {
    'predictions': svd_pred,
    'seed_predictions': svd_seeds,
    'best_params': study_svd.best_trial.params,
    'best_score': study_svd.best_value,
    'explained_variance': svd_var
}

print("\nBoth pipelines completed!")


SVD PIPELINE
Applying SVD reduction...


[I 2025-08-12 21:54:09,242] A new study created in memory with name: no-name-69e360c2-cdd7-40a2-9fe7-25a3b41042b2


[SVD] explained_variance_sum = 0.9113
SVD dataset shapes: X=(9912, 321), Y=(9912, 4), X_test=(473, 321)
Running SVD Optuna tuning...
✅ Found 6 compatible anchors (vec_len=1892)
🚀 Starting optimization with anchor-based objective
✅ Found 6 compatible anchors (vec_len=1892)
🚀 Starting optimization with anchor-based objective


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2025-08-12 21:54:15,799] Trial 0 finished with value: 0.5026648044586182 and parameters: {'iterations': 1705, 'depth': 5, 'learning_rate': 0.006756413390203931, 'l2_leaf_reg': 0.10821415899066898, 'bagging_temperature': 0.34564721528115194}. Best is trial 0 with value: 0.5026648044586182.
[I 2025-08-12 21:54:17,916] Trial 1 finished with value: 0.499750018119812 and parameters: {'iterations': 2018, 'depth': 5, 'learning_rate': 0.06211599801130484, 'l2_leaf_reg': 0.3589754277678872, 'bagging_temperature': 0.353720843210666}. Best is trial 1 with value: 0.499750018119812.
[I 2025-08-12 21:54:17,916] Trial 1 finished with value: 0.499750018119812 and parameters: {'iterations': 2018, 'depth': 5, 'learning_rate': 0.06211599801130484, 'l2_leaf_reg': 0.3589754277678872, 'bagging_temperature': 0.353720843210666}. Best is trial 1 with value: 0.499750018119812.
[I 2025-08-12 21:54:22,010] Trial 2 finished with value: 0.49863606691360474 and parameters: {'iterations': 2131, 'depth': 7, 'learni

In [23]:
# =========================
# 6) Ensemble Analysis and Creation
# =========================

print("\n" + "="*40)
print("ENSEMBLE ANALYSIS")
print("="*40)

# Calculate correlation between PCA and SVD predictions
# (Pearson correlation over all test rows and all targets, flattened into one vector each)
pca_flat = results['pca']['predictions'].flatten()
svd_flat = results['svd']['predictions'].flatten()
correlation = np.corrcoef(pca_flat, svd_flat)[0, 1]

print(f"Correlation between PCA and SVD predictions: {correlation:.6f}")

# Simple ensemble (50-50 average)
ensemble_simple = (results['pca']['predictions'] + results['svd']['predictions']) / 2

# Weighted ensemble based on best scores (lower is better)
# NOTE: best_score is the Optuna study's best value, i.e. the anchor-predicted LB score
# when the anchor objective was active, otherwise the validation RMSE.
pca_score = results['pca']['best_score']
svd_score = results['svd']['best_score'] 

# Convert to weights (inverse of scores)
# The 1e-8 term guards against division by zero.
pca_weight = 1 / (pca_score + 1e-8)
svd_weight = 1 / (svd_score + 1e-8)
total_weight = pca_weight + svd_weight

pca_weight_norm = pca_weight / total_weight
svd_weight_norm = svd_weight / total_weight

ensemble_weighted = (pca_weight_norm * results['pca']['predictions'] + 
                    svd_weight_norm * results['svd']['predictions'])

print(f"Weighted ensemble - PCA weight: {pca_weight_norm:.3f}, SVD weight: {svd_weight_norm:.3f}")

# Anchor-based ensemble (if anchors available)
if ANCHORS and len(ANCHORS) > 0:
    print("\nCalculating anchor-based ensemble...")
    
    # Validate anchor dimensions
    # Keep only anchors whose vector length equals the flattened prediction length.
    expected_len = len(pca_flat)
    valid_anchors = [a for a in ANCHORS if len(a["vec"]) == expected_len]
    
    if valid_anchors:
        pca_lb, _ = predict_lb_from_anchors(pca_flat, valid_anchors, TEMP_SOFTMAX, TOPK_ANCHORS)
        svd_lb, _ = predict_lb_from_anchors(svd_flat, valid_anchors, TEMP_SOFTMAX, TOPK_ANCHORS)
        
        print(f"PCA predicted LB: {pca_lb:.6f}")
        print(f"SVD predicted LB: {svd_lb:.6f}")
        
        # Anchor-based weights (higher LB = lower weight since we want lower scores)
        anchor_pca_weight = 1 / (pca_lb + 1e-8)
        anchor_svd_weight = 1 / (svd_lb + 1e-8)
        anchor_total = anchor_pca_weight + anchor_svd_weight
        
        anchor_pca_norm = anchor_pca_weight / anchor_total
        anchor_svd_norm = anchor_svd_weight / anchor_total
        
        ensemble_anchor = (anchor_pca_norm * results['pca']['predictions'] + 
                          anchor_svd_norm * results['svd']['predictions'])
        
        print(f"Anchor ensemble - PCA weight: {anchor_pca_norm:.3f}, SVD weight: {anchor_svd_norm:.3f}")
    else:
        # Fall back to the plain average so ensemble_anchor is always defined.
        ensemble_anchor = ensemble_simple
        print("No valid anchors found, using simple ensemble")
else:
    # Fall back to the plain average so ensemble_anchor is always defined.
    ensemble_anchor = ensemble_simple
    print("No anchors available, using simple ensemble")


ENSEMBLE ANALYSIS
Correlation between PCA and SVD predictions: 0.994248
Weighted ensemble - PCA weight: 0.500, SVD weight: 0.500

Calculating anchor-based ensemble...
PCA predicted LB: 0.334075
SVD predicted LB: 0.337420
Anchor ensemble - PCA weight: 0.502, SVD weight: 0.498


In [24]:
# =========================
# 7) Save All Results
# =========================

# Unix timestamp used as a suffix for every file written by this cell.
ts = int(time.time())

print(f"\n" + "="*40)
print("SAVING RESULTS")
print("="*40)

# Save ensemble predictions
ensembles = {
    'simple': ensemble_simple,
    'weighted': ensemble_weighted, 
    'anchor': ensemble_anchor
}

# One CSV per ensemble: the target columns followed by a 1-based ID column.
for ensemble_name, predictions in ensembles.items():
    df = pd.DataFrame(predictions, columns=TARGET_COLS)
    df["ID"] = range(1, len(predictions) + 1)
    path = f"ensemble_{ensemble_name}_{ts}.csv"
    df.to_csv(path, index=False)
    print(f"Saved: {path}")

# Save results summary
# NOTE(review): only the score-based weights are recorded; the anchor-based weights from
# the ensemble cell are not written to the summary.
summary = {
    'timestamp': ts,
    'correlation_pca_svd': float(correlation),
    'pca_explained_variance': float(results['pca']['explained_variance']),
    'svd_explained_variance': float(results['svd']['explained_variance']),
    'pca_best_score': float(results['pca']['best_score']),
    'svd_best_score': float(results['svd']['best_score']),
    'pca_best_params': results['pca']['best_params'],
    'svd_best_params': results['svd']['best_params'],
    'ensemble_weights': {
        'score_based': {'pca': float(pca_weight_norm), 'svd': float(svd_weight_norm)},
    },
    'optuna_trials': OPTUNA_TRIALS,
    'n_seeds': N_SEEDS,
    'n_components': N_COMPONENTS
}

summary_path = f"pipeline_summary_{ts}.json"
with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=2)
print(f"Saved: {summary_path}")

# Print final summary
print(f"\n" + "="*60)
print("PIPELINE SUMMARY")
print("="*60)
print(f"PCA explained variance: {results['pca']['explained_variance']:.4f}")
print(f"SVD explained variance: {results['svd']['explained_variance']:.4f}")
print(f"PCA best score: {results['pca']['best_score']:.6f}")
print(f"SVD best score: {results['svd']['best_score']:.6f}")
print(f"Correlation: {correlation:.6f}")
# NOTE(review): the 0.995 cut-off is a hard-coded heuristic; a correlation just below it
# (e.g. 0.994) is still reported as "Good" diversity / "Low correlation".
print(f"Ensemble diversity: {'Good' if correlation < 0.995 else 'Limited'}")

if correlation < 0.995:
    print("\n✅ Low correlation detected - ensemble should be beneficial!")
    print("📊 Recommended: Use weighted or anchor-based ensemble")
else:
    print("\n⚠️ High correlation - ensemble may provide limited benefit")
    print("🎯 Recommended: Use the better performing individual model")

print(f"\n🎉 Pipeline completed! All results saved with timestamp {ts}")


SAVING RESULTS
Saved: ensemble_simple_1755016989.csv
Saved: ensemble_weighted_1755016989.csv
Saved: ensemble_anchor_1755016989.csv
Saved: pipeline_summary_1755016989.json

PIPELINE SUMMARY
PCA explained variance: 0.9116
SVD explained variance: 0.9113
PCA best score: 0.495457
SVD best score: 0.495309
Correlation: 0.994248
Ensemble diversity: Good

✅ Low correlation detected - ensemble should be beneficial!
📊 Recommended: Use weighted or anchor-based ensemble

🎉 Pipeline completed! All results saved with timestamp 1755016989


In [None]:
# (intentionally empty — removed a stray `d` that raised NameError on Restart & Run All)