In [None]:
!pip install -q sentence-transformers transformers
!pip install -q lightgbm

In [None]:
import torch
import transformers
from sentence_transformers import SentenceTransformer

# Sanity check: report the installed library versions, then load the embedding
# model once to confirm its weights can be fetched/loaded in this environment.
print(torch.__version__)
print("Transformers OK:", transformers.__version__)
model = SentenceTransformer("intfloat/multilingual-e5-large")
print("Model loaded!")


In [None]:
import json
import pandas as pd

def _read_json(path):
    """Return the parsed contents of the UTF-8 encoded JSON file at ``path``."""
    with open(path, "r", encoding="utf8") as handle:
        return json.load(handle)


# Raw payloads: train records, test records and the metric-name file.
train_raw = _read_json("train_data.json")
test_raw = _read_json("test_data.json")
metric_map = _read_json("metric_names.json")

# Tabular views of the raw records.
train_df, test_df = pd.DataFrame(train_raw), pd.DataFrame(test_raw)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Preview the first three training rows (rendered as the cell output).
train_df.head(3)


In [None]:
def combine_all(row):
    """Join one record's system prompt, user prompt and response into a single string.

    Args:
        row: Any object exposing ``.get(key)`` (a ``dict`` or a ``pandas.Series``)
            with optional ``system_prompt``, ``user_prompt`` and ``response`` entries.

    Returns:
        ``"<system> [SYS] <user> [USR] <response> [RES]"``. A field that is missing,
        ``None``, NaN or otherwise falsy contributes an empty string, so the literal
        text ``"None"`` / ``"nan"`` never leaks into the text that gets embedded.
        All three fields are treated the same way, matching the later
        ``combine_all`` definitions in this notebook.
    """
    def _clean(value):
        # `value != value` is True only for NaN (e.g. a missing cell in a DataFrame).
        if not value or value != value:
            return ""
        return str(value)

    sp = _clean(row.get("system_prompt"))
    up = _clean(row.get("user_prompt"))
    rp = _clean(row.get("response"))

    return sp + " [SYS] " + up + " [USR] " + rp + " [RES]"

# Build the single text field (system + user + response) for every train/test row.
train_df["combined_text"] = train_df.apply(combine_all, axis=1)
test_df["combined_text"]  = test_df.apply(combine_all, axis=1)

# Preview the first three combined strings (rendered as the cell output).
train_df[["combined_text"]].head(3)


In [None]:
# The metric name, cast to str, is the text embedded for the "metric" side.
train_df["metric_text"] = train_df["metric_name"].astype(str)
test_df["metric_text"]  = test_df["metric_name"].astype(str)

# Show the original and derived columns side by side.
train_df[["metric_name", "metric_text"]].head()

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm

# Load the embedding model
model = SentenceTransformer("intfloat/multilingual-e5-large")
# NOTE(review): E5-family checkpoints are documented as expecting a
# "query: " / "passage: " prefix on every input — confirm whether encoding
# unprefixed text here is intentional.

batch_size = 64

# Every distinct metric name from BOTH splits, in first-seen order. Including the
# test split guarantees that a metric which only occurs in the test data still
# has an embedding (previously this raised KeyError when building
# test_metric_embs).
unique_metric_texts = list(dict.fromkeys(
    train_df["metric_text"].tolist() + test_df["metric_text"].tolist()
))

# metric text -> embedding vector
metric_emb_dict = {}

# Embed metric names (each distinct name is encoded exactly once)
print("Embedding metric_text values...")
for i in tqdm(range(0, len(unique_metric_texts), batch_size)):
    batch = unique_metric_texts[i:i+batch_size]
    batch_emb = model.encode(batch, batch_size=batch_size, convert_to_numpy=True)
    metric_emb_dict.update(zip(batch, batch_emb))

# Row-aligned metric embedding matrices (row i belongs to row i of the frame)
train_metric_embs = np.vstack([metric_emb_dict[t] for t in train_df["metric_text"]])
test_metric_embs  = np.vstack([metric_emb_dict[t] for t in test_df["metric_text"]])

# Embed combined text (system + user + response)
print("Embedding combined prompt/response texts...")
train_text_embs = model.encode(
    train_df["combined_text"].tolist(),
    batch_size=batch_size,
    convert_to_numpy=True,
    show_progress_bar=True
)

test_text_embs = model.encode(
    test_df["combined_text"].tolist(),
    batch_size=batch_size,
    convert_to_numpy=True,
    show_progress_bar=True
)

# Save the embeddings so later cells can reload them without re-encoding
np.save("train_metric_embs.npy", train_metric_embs)
np.save("test_metric_embs.npy", test_metric_embs)
np.save("train_text_embs.npy", train_text_embs)
np.save("test_text_embs.npy", test_text_embs)

print("Embedding shapes:")
print("train_metric_embs:", train_metric_embs.shape)
print("train_text_embs :",  train_text_embs.shape)
print("test_metric_embs :",  test_metric_embs.shape)
print("test_text_embs :",   test_text_embs.shape)


In [None]:
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

# Reload the cached embeddings written by the embedding cell.
train_metric = np.load("train_metric_embs.npy")
train_text   = np.load("train_text_embs.npy")
y_real       = train_df["score"].values.astype(np.float32)

N = len(train_metric)

# The .npy files are a cache: fail early if they do not line up with train_df
# (e.g. stale files from a previous run on different data).
if not (len(train_text) == N == len(y_real)):
    raise ValueError(
        f"Row count mismatch: metric embs={N}, text embs={len(train_text)}, "
        f"labels={len(y_real)}; regenerate the cached embeddings."
    )


# --------------------
# 1) Shuffle-based negatives: each metric is paired with another row's text
# --------------------
perm = rng.permutation(N)
neg_metric_1 = train_metric
neg_text_1   = train_text[perm]   # mismatched
# Synthetic rows get a random low label drawn uniformly from {0, 1, 2}
# (the upper bound of rng.integers is exclusive).
neg_y_1      = rng.integers(0, 3, size=N)


# --------------------
# 2) Noise-corrupted negatives: Gaussian noise (sigma = 0.6) added to the text embedding
# --------------------
# rng.normal returns float64; cast to the embedding dtype so the sum — and therefore
# the stacked t_all array and the file saved below — is not silently upcast to float64.
noise = rng.normal(scale=0.6, size=train_text.shape).astype(train_text.dtype)
neg_metric_2 = train_metric
neg_text_2   = train_text + noise
neg_y_2      = rng.integers(0, 3, size=N)


# --------------------
# 3) Metric swap negatives: each text is paired with another row's metric
# --------------------
perm2 = rng.permutation(N)
neg_metric_3 = train_metric[perm2]  # mismatched metric
neg_text_3   = train_text
neg_y_3      = rng.integers(0, 3, size=N)
# NOTE(review): a random permutation can map a row to itself (or to a row with the
# same metric), so a few "negatives" may actually be matched pairs — confirm this
# is acceptable.


# --------------------
# Combine everything: real rows first, then the three negative sets (4 * N rows)
# --------------------
m_all = np.vstack([train_metric, neg_metric_1, neg_metric_2, neg_metric_3])
t_all = np.vstack([train_text,   neg_text_1,   neg_text_2,   neg_text_3])
y_all = np.concatenate([y_real,  neg_y_1,      neg_y_2,      neg_y_3]).astype(np.float32)

print("Combined shapes:")
print("m_all:", m_all.shape)
print("t_all:", t_all.shape)
print("y_all:", y_all.shape)

np.save("m_all.npy", m_all)
np.save("t_all.npy", t_all)
np.save("y_all.npy", y_all)

In [None]:
# Step 4 — Build features (concat, absdiff, prod, cosine) and save
import numpy as np
from sklearn.preprocessing import StandardScaler

# Reload the stacked metric/text embeddings and labels written by the previous step.
m_all, t_all, y_all = (np.load(name) for name in ("m_all.npy", "t_all.npy", "y_all.npy"))

# Row-wise cosine similarity; the epsilon keeps all-zero rows from dividing by zero.
dot = (m_all * t_all).sum(axis=1)
norms = np.linalg.norm(m_all, axis=1) * np.linalg.norm(t_all, axis=1) + 1e-9
cos = (dot / norms)[:, None].astype(np.float32)

# Element-wise interaction features.
absdiff = np.abs(m_all - t_all).astype(np.float32)
prod = (m_all * t_all).astype(np.float32)

# Final design matrix, column order: [metric | text | |metric - text| | metric * text | cosine]
X = np.concatenate(
    [m_all.astype(np.float32), t_all.astype(np.float32), absdiff, prod, cos],
    axis=1,
).astype(np.float32)

print("X shape:", X.shape)
print("y shape:", y_all.shape)
print("sample cosines min/max:", float(cos.min()), float(cos.max()))

# Persist the feature matrix together with its labels.
np.save("X_all.npy", X)
np.save("y_all.npy", y_all)

print("Saved X_all.npy and y_all.npy")


In [None]:
# NLL-MLP on existing embeddings (classes 0..10) — reproducibility test
# Paste & run in your notebook. Assumes X_all.npy, y_all.npy exist and test_metric_embs/test_text_embs exist.

import os, numpy as np, pandas as pd, random, time
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

SEED = 42
# Seed Python, NumPy and PyTorch (in that order) for reproducibility.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# -------------------------
# Training features / labels produced by the feature-building cell
# -------------------------
X = np.load("X_all.npy").astype(np.float32)
y = np.load("y_all.npy").astype(np.float32)

# -------------------------
# Test features: reuse the cached matrix, otherwise derive it from the test embeddings
# -------------------------
if not os.path.exists("X_test.npy"):
    print("X_test.npy not found — building from test_metric_embs.npy + test_text_embs.npy")
    tm = np.load("test_metric_embs.npy")
    tt = np.load("test_text_embs.npy")
    # Same feature layout as the training matrix:
    # [metric | text | |metric - text| | metric * text | cosine]
    dot = (tm * tt).sum(axis=1)
    norms = np.linalg.norm(tm, axis=1) * np.linalg.norm(tt, axis=1) + 1e-9
    cos = (dot / norms)[:, None].astype(np.float32)
    absdiff = np.abs(tm - tt).astype(np.float32)
    prod = (tm * tt).astype(np.float32)
    X_test = np.concatenate(
        [tm.astype(np.float32), tt.astype(np.float32), absdiff, prod, cos],
        axis=1,
    ).astype(np.float32)
    np.save("X_test.npy", X_test)
    print("Saved X_test.npy")
else:
    X_test = np.load("X_test.npy").astype(np.float32)

print("Shapes -> X:", X.shape, "X_test:", X_test.shape, "y:", y.shape)

# -------------------------
# Integer class labels 0..10: round to the nearest integer, then clamp
# -------------------------
y_int = np.clip(np.rint(y).astype(int), 0, 10)
assert y_int.min() >= 0 and y_int.max() <= 10

# -------------------------
# Dataset + model
# -------------------------
class EmbClsDataset(Dataset):
    """In-memory dataset of feature rows with optional integer class labels.

    Features are stored as a float32 tensor and labels (when given) as an int64
    tensor. Indexing yields ``(features, label)`` when labels exist and just
    ``features`` otherwise (e.g. for the unlabeled test set).
    """

    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        if y is None:
            self.y = None
        else:
            self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        return features if self.y is None else (features, self.y[idx])

class NLL_MLP(nn.Module):
    """Feed-forward classifier producing 11 logits (one per integer score 0..10).

    Layout: two ``Linear -> GELU -> Dropout`` blocks (``in_dim -> hidden1 -> hidden2``),
    then ``Linear(hidden2, 128) -> GELU -> Linear(128, 11)``.
    """

    def __init__(self, in_dim, hidden1=1024, hidden2=512, dropout=0.2):
        super().__init__()
        stack = []
        for fan_in, fan_out in ((in_dim, hidden1), (hidden1, hidden2)):
            stack += [nn.Linear(fan_in, fan_out), nn.GELU(), nn.Dropout(dropout)]
        # Narrow head ending in the 11 class logits.
        stack += [nn.Linear(hidden2, 128), nn.GELU(), nn.Linear(128, 11)]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        logits = self.net(x)
        return logits

# -------------------------
# Training params
# -------------------------
NFOLDS = 5     # number of cross-validation folds
EPOCHS = 12    # maximum epochs per fold (also T_max of the cosine LR schedule)
BATCH = 256    # mini-batch size for the train/val/test loaders
LR = 1e-3      # AdamW learning rate
WD = 1e-5      # AdamW weight decay
PATIENCE = 3   # early stop patience on val rmse

# Shuffled K-fold split over the rows of X, seeded for reproducibility.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

# Out-of-fold class probabilities: one row per training sample, 11 classes.
oof_probs = np.zeros((len(X), 11), dtype=np.float32)
# Test-set class probabilities, kept separately for every fold's model.
test_probs_folds = np.zeros((NFOLDS, X_test.shape[0], 11), dtype=np.float32)

# -------------------------
# Train folds
# -------------------------
def _predict_probs(net, loader):
    """Return softmax class probabilities for every row served by ``loader``.

    ``loader`` may yield ``(features, labels)`` pairs or bare feature batches;
    labels are ignored. The caller is responsible for putting ``net`` in eval mode.
    """
    chunks = []
    with torch.no_grad():
        for batch in loader:
            xb = batch[0] if isinstance(batch, (list, tuple)) else batch
            logits = net(xb.to(device))
            chunks.append(torch.softmax(logits, dim=1).cpu().numpy())
    return np.concatenate(chunks, axis=0)


# Pinned host memory only helps host->GPU copies; on CPU it just triggers a warning.
PIN_MEMORY = torch.device(device).type == "cuda"

# The test loader does not depend on the fold, so build it once.
test_ds = EmbClsDataset(X_test)
test_dl = DataLoader(test_ds, batch_size=BATCH, shuffle=False, pin_memory=PIN_MEMORY)

# Class index k stands for score k; used to turn probabilities into an expected score.
class_values = np.arange(11)[None, :]

# NOTE(review): X also contains the synthetic negatives derived from the real rows,
# so a plain KFold can put a row and its derived negatives in different folds —
# confirm whether a grouped split is wanted.
for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n=== Fold {fold} ===")
    X_tr, X_val = X[tr_idx], X[val_idx]
    y_tr, y_val = y_int[tr_idx], y_int[val_idx]

    train_ds = EmbClsDataset(X_tr, y_tr)
    val_ds   = EmbClsDataset(X_val, y_val)

    train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True, pin_memory=PIN_MEMORY)
    val_dl   = DataLoader(val_ds, batch_size=BATCH, shuffle=False, pin_memory=PIN_MEMORY)

    model = NLL_MLP(in_dim=X.shape[1]).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
    criterion = nn.CrossEntropyLoss()

    best_rmse = 1e9
    best_state = None
    patience = 0

    for ep in range(1, EPOCHS+1):
        model.train()
        running_loss = 0.0
        t0 = time.time()
        for xb, yb in train_dl:
            xb = xb.to(device); yb = yb.to(device)
            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            running_loss += loss.item() * xb.size(0)

        # validation: RMSE between the true score and the expected score under
        # the predicted class distribution
        model.eval()
        val_probs = _predict_probs(model, val_dl)
        val_exp = (val_probs * class_values).sum(axis=1)
        val_rmse = np.sqrt(mean_squared_error(y[val_idx], val_exp))

        scheduler.step()
        print(f"Epoch {ep}/{EPOCHS} | train_loss={running_loss/len(train_ds):.4f} | val_RMSE={val_rmse:.4f} | time={time.time()-t0:.1f}s")

        # early stopping
        if val_rmse < best_rmse - 1e-5:
            best_rmse = val_rmse
            # state_dict() returns references to the live parameter tensors, which
            # later optimizer steps keep mutating. Clone them so the snapshot
            # really is the best epoch's weights rather than the last epoch's.
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            patience = 0
        else:
            patience += 1
            if patience >= PATIENCE:
                print("Early stopping")
                break

    # load best (best_state stays None only if no epoch ran, e.g. EPOCHS == 0)
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()

    # OOF probs from the best checkpoint
    oof_probs[val_idx] = _predict_probs(model, val_dl)

    # test probs for this fold
    test_probs_folds[fold] = _predict_probs(model, test_dl)

    print(f"Fold {fold} best val RMSE: {best_rmse:.4f}")

# -------------------------
# Evaluate OOF, Calibration, Final test preds
# -------------------------
# Class index k stands for score k, so the expected score is sum_k k * p_k.
score_grid = np.arange(11)[None, :]

oof_exp = (oof_probs * score_grid).sum(axis=1)
oof_rmse = np.sqrt(mean_squared_error(y, oof_exp))
print("\nOOF RMSE (raw):", oof_rmse)

# Linear calibration y ~ a * expected + b, fitted on the out-of-fold predictions.
oof_feat = oof_exp.reshape(-1, 1)
cal = LinearRegression().fit(oof_feat, y)
oof_cal = cal.predict(oof_feat)
oof_cal_rmse = np.sqrt(mean_squared_error(y, oof_cal))
print("OOF RMSE (calibrated):", oof_cal_rmse)
print("Calibration params: a=", cal.coef_[0], "b=", cal.intercept_)

# Test prediction: average fold probabilities -> expected score -> calibrate -> clip to [0, 10].
test_probs_mean = test_probs_folds.mean(axis=0)
test_exp = (test_probs_mean * score_grid).sum(axis=1)
test_exp_cal = np.clip(cal.predict(test_exp.reshape(-1, 1)), 0, 10)

# Persist artifacts (note: the saved expected scores are the *uncalibrated* ones).
for _path, _array in (
    ("oof_probs_nll.npy", oof_probs),
    ("test_probs_nll.npy", test_probs_mean),
    ("oof_nll.npy", oof_exp),
    ("test_nll.npy", test_exp),
):
    np.save(_path, _array)

# Submission: reload the test frame only if an earlier cell has not created it.
if 'test_df' not in globals():
    import json
    with open("test_data.json","r",encoding="utf8") as f:
        tdata = json.load(f)
    test_df = pd.DataFrame(tdata)

# IDs are 1-based row positions.
test_df["ID"] = np.arange(1, len(test_exp_cal)+1)
sub = pd.DataFrame({"ID": test_df["ID"], "score": test_exp_cal})
sub.to_csv("submission_nll_mlp.csv", index=False)
print("\nSaved submission_nll_mlp.csv")
print("OOF RMSE final (calibrated):", oof_cal_rmse)


Above: the negative log-likelihood (NLL) MLP classifier trained on the multilingual-E5 embedding features.

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Smoke test: confirm the GTE-large tokenizer and weights can be loaded.
tok = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-large-en-v1.5")
# This checkpoint ships its own modelling code on the Hub, so AutoModel has to be
# told to trust and execute it; without the flag the load is refused.
model = AutoModel.from_pretrained("Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True)
print("OK")

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

MODEL_NAME = "Alibaba-NLP/gte-large-en-v1.5"

print("Loading tokenizer...")
tok = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Loading model...")
# This checkpoint ships its own modelling code on the Hub, so AutoModel has to be
# told to trust and execute it; without the flag the load is refused.
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Reports which device is available; the model itself has not been moved to it here.
print("Loaded successfully! Device:", "cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# ------------------------------
# FULL CELL: compute GTE embeddings, build features, train NLL-MLP (KFold)
# Paste & run in the notebook where you loaded GTE model/tokenizer and confirmed CUDA.
# ------------------------------
import os, time, math, random
import numpy as np, pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel

# -----------------------
# Config (tweak if needed)
# -----------------------
MODEL_NAME = "Alibaba-NLP/gte-large-en-v1.5"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

ENC_BATCH = 32          # batch for encoding (reduce to 8/16 if OOM)
MAX_LEN = 512           # max tokens per text; longer inputs are truncated

# -----------------------
# Load tokenizer & model for MODEL_NAME
# -----------------------
# Always load explicitly instead of reusing whatever `tokenizer` / `model` happen to
# exist in the kernel: a mere "is the name defined?" check cannot tell whether those
# objects belong to MODEL_NAME (other cells bind `model` to an MLP classifier or to a
# different encoder), nor whether the model already lives on `device`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The checkpoint ships custom modelling code, hence trust_remote_code=True.
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True).to(device)

model.eval()

# -----------------------
# Load augmented train (or fallback to original train)
# -----------------------
if not os.path.exists("augmented_train.csv"):
    # No augmented file on disk: rebuild the frame from the raw training JSON.
    import json
    with open("train_data.json","r",encoding="utf8") as f:
        records = json.load(f)
    df = pd.DataFrame(records)

    def combine_all(row):
        """Concatenate system prompt, user prompt and response; falsy fields become ''."""
        def _field(key):
            value = row.get(key)
            return str(value) if value else ""
        return _field("system_prompt") + " [SYS] " + _field("user_prompt") + " [USR] " + _field("response") + " [RES]"

    df["combined_text"] = df.apply(combine_all, axis=1)
    df["metric_text"] = df["metric_name"].astype(str)
    df = df.rename(columns={"score":"label"})
else:
    # Expected columns: metric_text, combined_text and label (or score).
    df = pd.read_csv("augmented_train.csv")
    if "score" in df.columns and "label" not in df.columns:
        df = df.rename(columns={"score": "label"})

print("Loaded train rows:", len(df))
# keep only first N if debugging
# df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Test records used for the final inference pass.
import json
with open("test_data.json","r",encoding="utf8") as f:
    test_raw = json.load(f)
test_df = pd.DataFrame(test_raw)


def combine_all(row):
    """Concatenate system prompt, user prompt and response; falsy fields become ''."""
    def _field(key):
        value = row.get(key)
        return str(value) if value else ""
    return _field("system_prompt") + " [SYS] " + _field("user_prompt") + " [USR] " + _field("response") + " [RES]"


test_df["combined_text"] = test_df.apply(combine_all, axis=1)
test_df["metric_text"] = test_df["metric_name"].astype(str)
print("Loaded test rows:", len(test_df))

# -----------------------
# Helper: mean-pool to get sentence embedding
# -----------------------
def mean_pool(last_hidden_state, attention_mask):
    """Average token vectors over the positions where ``attention_mask`` is non-zero.

    Args:
        last_hidden_state: tensor of shape (B, L, D).
        attention_mask: tensor of shape (B, L); zeros mark positions to ignore.

    Returns:
        Tensor of shape (B, D). A row whose mask is all zeros yields zeros, because
        the token count is clamped to 1e-9 instead of dividing by zero.
    """
    weights = attention_mask[..., None].expand_as(last_hidden_state).float()
    token_total = (last_hidden_state * weights).sum(dim=1)
    token_count = weights.sum(dim=1).clamp(min=1e-9)
    return token_total / token_count

# -----------------------
# Encode helper (batched)
# -----------------------
@torch.no_grad()
def encode_texts(texts, batch_size=32, prefix_tokenize=None):
    """Embed ``texts`` with the global ``tokenizer`` / ``model`` using masked mean pooling.

    Args:
        texts: list of strings to encode.
        batch_size: number of texts per forward pass.
        prefix_tokenize: currently unused; kept so existing callers keep working.

    Returns:
        ``numpy`` array with one row per input text (rows follow the input order).
        Inputs longer than ``MAX_LEN`` tokens are truncated.
    """
    embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding", ncols=80):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt")
        input_ids = enc["input_ids"].to(device)
        attn = enc["attention_mask"].to(device)
        # Only the final layer is pooled, so do not request every layer's hidden
        # states: they were never read and only inflate memory use per batch.
        out = model(input_ids=input_ids, attention_mask=attn, return_dict=True)
        pooled = mean_pool(out.last_hidden_state, attn)  # (B, D)
        embs.append(pooled.cpu().numpy())
    return np.vstack(embs)

# -----------------------
# Build text lists to encode
# For metric_text we can encode the metric string alone
# For combined_text we encode as-is (system + user + response)
# -----------------------
# Text lists to embed: the metric name alone, and the combined prompt/response text.
train_metric_texts = list(df["metric_text"].astype(str))
train_combined_texts = list(df["combined_text"].astype(str))
test_metric_texts = list(test_df["metric_text"].astype(str))
test_combined_texts = list(test_df["combined_text"].astype(str))

# Encode each list in turn (train metric, train text, test metric, test text).
print("Encoding train metric_texts...")
train_metric_embs = encode_texts(train_metric_texts, batch_size=ENC_BATCH)
print("Encoding train combined_texts...")
train_text_embs = encode_texts(train_combined_texts, batch_size=ENC_BATCH)
print("Encoding test metric_texts...")
test_metric_embs = encode_texts(test_metric_texts, batch_size=ENC_BATCH)
print("Encoding test combined_texts...")
test_text_embs = encode_texts(test_combined_texts, batch_size=ENC_BATCH)

print("Shapes:", train_metric_embs.shape, train_text_embs.shape, test_metric_embs.shape, test_text_embs.shape)

# Cache the embeddings on disk for reuse.
for _path, _array in (
    ("train_metric_gte.npy", train_metric_embs),
    ("train_text_gte.npy", train_text_embs),
    ("test_metric_gte.npy", test_metric_embs),
    ("test_text_gte.npy", test_text_embs),
):
    np.save(_path, _array)
print("Saved GTE embeddings to disk.")

# -----------------------
# Build features (concat, absdiff, prod, cos) same as earlier pipeline
# -----------------------
def build_features(metric_embs, text_embs):
    """Build the pairwise feature matrix for aligned metric/text embeddings.

    Args:
        metric_embs: 2-D array of metric embeddings.
        text_embs: 2-D array of text embeddings with the same shape.

    Returns:
        float32 array whose columns are
        ``[metric | text | |metric - text| | metric * text | cosine]``.

    Raises:
        AssertionError: if the two inputs differ in shape.
    """
    assert metric_embs.shape == text_embs.shape
    # Cosine similarity per row; the epsilon protects all-zero rows.
    scale = np.linalg.norm(metric_embs, axis=1) * np.linalg.norm(text_embs, axis=1) + 1e-9
    cos = ((metric_embs * text_embs).sum(axis=1) / scale)[:, None].astype(np.float32)
    pieces = [
        metric_embs.astype(np.float32),
        text_embs.astype(np.float32),
        np.abs(metric_embs - text_embs).astype(np.float32),
        (metric_embs * text_embs).astype(np.float32),
        cos,
    ]
    return np.concatenate(pieces, axis=1).astype(np.float32)

print("Building train features...")
X_gte = build_features(train_metric_embs, train_text_embs)
print("Building test features...")
X_test_gte = build_features(test_metric_embs, test_text_embs)

# Float targets, plus integer classes 0..10 (rounded, then clamped) for the classifier.
y = df["label"].values.astype(np.float32)
y_int = np.clip(np.rint(y).astype(int), 0, 10)

print("Final shapes -> X:", X_gte.shape, "X_test:", X_test_gte.shape, "y:", y_int.shape)
for _path, _array in (
    ("X_gte.npy", X_gte),
    ("X_test_gte.npy", X_test_gte),
    ("y_int_gte.npy", y_int),
):
    np.save(_path, _array)
print("Saved feature matrices.")

# -----------------------
# Train NLL-MLP classifier on new GTE features (reuse model structure from before)
# -----------------------
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# Fixed seed for Python, NumPy and PyTorch; also reused as the KFold random_state.
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

class EmbClsDataset(Dataset):
    """In-memory dataset of feature rows with optional integer class labels.

    Features are stored as a float32 tensor and labels (when given) as an int64
    tensor. Indexing yields ``(features, label)`` when labels exist and just
    ``features`` otherwise (e.g. for the unlabeled test set).
    """

    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        if y is None:
            self.y = None
        else:
            self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        return features if self.y is None else (features, self.y[idx])

class NLL_MLP(nn.Module):
    """Feed-forward classifier producing 11 logits (one per integer score 0..10).

    Layout: two ``Linear -> GELU -> Dropout`` blocks (``in_dim -> hidden1 -> hidden2``),
    then ``Linear(hidden2, 128) -> GELU -> Linear(128, 11)``.
    """

    def __init__(self, in_dim, hidden1=1024, hidden2=512, dropout=0.2):
        super().__init__()
        stack = []
        for fan_in, fan_out in ((in_dim, hidden1), (hidden1, hidden2)):
            stack += [nn.Linear(fan_in, fan_out), nn.GELU(), nn.Dropout(dropout)]
        # Narrow head ending in the 11 class logits.
        stack += [nn.Linear(hidden2, 128), nn.GELU(), nn.Linear(128, 11)]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        logits = self.net(x)
        return logits

# training params (tune)
NFOLDS = 5     # number of cross-validation folds
EPOCHS = 12    # maximum epochs per fold (also T_max of the cosine LR schedule)
BATCH = 256    # mini-batch size for the train/val/test loaders
LR = 1e-3      # AdamW learning rate
WD = 1e-5      # AdamW weight decay
PATIENCE = 3   # epochs without val-RMSE improvement before early stopping

# Shuffled K-fold split over the rows of X_gte, seeded for reproducibility.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
# Out-of-fold class probabilities: one row per training sample, 11 classes.
oof_probs = np.zeros((len(X_gte), 11), dtype=np.float32)
# Test-set class probabilities, kept separately for every fold's model.
test_probs_folds = np.zeros((NFOLDS, X_test_gte.shape[0], 11), dtype=np.float32)

def _predict_probs(net, loader):
    """Return softmax class probabilities for every row served by ``loader``.

    ``loader`` may yield ``(features, labels)`` pairs or bare feature batches;
    labels are ignored. The caller is responsible for putting ``net`` in eval mode.
    """
    chunks = []
    with torch.no_grad():
        for batch in loader:
            xb = batch[0] if isinstance(batch, (list, tuple)) else batch
            logits = net(xb.to(device))
            chunks.append(torch.softmax(logits, dim=1).cpu().numpy())
    return np.concatenate(chunks, axis=0)


# Pinned host memory only helps host->GPU copies; on CPU it just triggers a warning.
PIN_MEMORY = torch.device(device).type == "cuda"

# The test loader does not depend on the fold, so build it once.
test_ds = EmbClsDataset(X_test_gte)
test_dl = DataLoader(test_ds, batch_size=BATCH, shuffle=False, pin_memory=PIN_MEMORY)

# Class index k stands for score k; used to turn probabilities into an expected score.
class_values = np.arange(11)[None, :]

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_gte)):
    print(f"\n=== Fold {fold} ===")
    X_tr, X_val = X_gte[tr_idx], X_gte[val_idx]
    y_tr, y_val = y_int[tr_idx], y_int[val_idx]

    train_ds = EmbClsDataset(X_tr, y_tr)
    val_ds   = EmbClsDataset(X_val, y_val)

    train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True, pin_memory=PIN_MEMORY)
    val_dl   = DataLoader(val_ds, batch_size=BATCH, shuffle=False, pin_memory=PIN_MEMORY)

    # Named model_cls so the text encoder bound to `model` is left untouched.
    model_cls = NLL_MLP(in_dim=X_gte.shape[1]).to(device)
    optimizer = torch.optim.AdamW(model_cls.parameters(), lr=LR, weight_decay=WD)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
    criterion = nn.CrossEntropyLoss()

    best_rmse = 1e9
    best_state = None
    patience = 0

    for ep in range(1, EPOCHS+1):
        model_cls.train()
        running_loss = 0.0
        t0 = time.time()
        for xb, yb in train_dl:
            xb = xb.to(device); yb = yb.to(device)
            optimizer.zero_grad()
            logits = model_cls(xb)
            loss = criterion(logits, yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model_cls.parameters(), 1.0)
            optimizer.step()
            running_loss += loss.item() * xb.size(0)

        # validation: RMSE between the true score and the expected score under
        # the predicted class distribution
        model_cls.eval()
        val_probs = _predict_probs(model_cls, val_dl)
        val_exp = (val_probs * class_values).sum(axis=1)
        val_rmse = np.sqrt(mean_squared_error(y[val_idx], val_exp))

        scheduler.step()
        print(f"Epoch {ep}/{EPOCHS} | train_loss={running_loss/len(train_ds):.4f} | val_RMSE={val_rmse:.4f} | time={time.time()-t0:.1f}s")

        if val_rmse < best_rmse - 1e-5:
            best_rmse = val_rmse
            # state_dict() returns references to the live parameter tensors, which
            # later optimizer steps keep mutating. Clone them so the snapshot
            # really is the best epoch's weights rather than the last epoch's.
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model_cls.state_dict().items()
            }
            patience = 0
        else:
            patience += 1
            if patience >= PATIENCE:
                print("Early stopping")
                break

    # load best (best_state stays None only if no epoch ran, e.g. EPOCHS == 0)
    if best_state is not None:
        model_cls.load_state_dict(best_state)
    model_cls.eval()

    # OOF probabilities from the best checkpoint
    oof_probs[val_idx] = _predict_probs(model_cls, val_dl)

    # test probabilities for this fold
    test_probs_folds[fold] = _predict_probs(model_cls, test_dl)

    print(f"Fold {fold} best val RMSE: {best_rmse:.4f}")

# Evaluate OOF
# Class index k stands for score k, so the expected score is sum_k k * p_k.
score_grid = np.arange(11)[None, :]

oof_exp = (oof_probs * score_grid).sum(axis=1)
oof_rmse = np.sqrt(mean_squared_error(y, oof_exp))
print("\nOOF RMSE (raw):", oof_rmse)

# Linear calibration y ~ a * expected + b, fitted on the out-of-fold predictions.
oof_feat = oof_exp.reshape(-1, 1)
cal = LinearRegression().fit(oof_feat, y)
oof_cal = cal.predict(oof_feat)
oof_cal_rmse = np.sqrt(mean_squared_error(y, oof_cal))
print("OOF RMSE (calibrated):", oof_cal_rmse)
print("Calibration params: a=", cal.coef_[0], "b=", cal.intercept_)

# Test prediction: average fold probabilities -> expected score -> calibrate -> clip to [0, 10].
test_probs_mean = test_probs_folds.mean(axis=0)
test_exp = (test_probs_mean * score_grid).sum(axis=1)
test_exp_cal = np.clip(cal.predict(test_exp.reshape(-1, 1)), 0, 10)

# Persist artifacts (note: the saved expected scores are the *uncalibrated* ones).
for _path, _array in (
    ("oof_probs_gte.npy", oof_probs),
    ("test_probs_gte.npy", test_probs_mean),
    ("oof_gte.npy", oof_exp),
    ("test_gte.npy", test_exp),
):
    np.save(_path, _array)

# Submission file; IDs are 1-based row positions.
test_df["ID"] = np.arange(1, len(test_exp_cal)+1)
sub = pd.DataFrame({"ID": test_df["ID"], "score": test_exp_cal})
sub.to_csv("submission_gte_nll_mlp.csv", index=False)
print("\nSaved submission_gte_nll_mlp.csv")
print("OOF RMSE final (calibrated):", oof_cal_rmse)


GTE multilingual variant (Alibaba-NLP/gte-multilingual-base)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Device name as a plain string ("cuda" when a GPU is available, else "cpu").
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer and encoder for the multilingual GTE checkpoint; the model is moved
# to `device` right after loading.
tok = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-base")
model = AutoModel.from_pretrained(
    "Alibaba-NLP/gte-multilingual-base",
    trust_remote_code=True
).to(device)

print("Loaded multilingual GTE!")


In [None]:
import torch.nn.functional as F

def get_gte_embedding(texts, batch_size=32):
    """Embed ``texts`` with the global ``tok`` / ``model`` using CLS pooling.

    Each text is truncated to 512 tokens, the first-token (CLS) vector of the last
    hidden layer is taken and L2-normalised.

    Args:
        texts: list of strings to encode.
        batch_size: number of texts per forward pass.

    Returns:
        ``numpy`` array with one unit-length row per input text, in input order.
    """
    chunks = []
    for start in range(0, len(texts), batch_size):
        encoded = tok(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(device)

        with torch.no_grad():
            cls_vec = model(**encoded).last_hidden_state[:, 0, :]   # CLS pooling
            unit_vec = F.normalize(cls_vec, p=2, dim=1)

        chunks.append(unit_vec.cpu().numpy())

    return np.vstack(chunks)


Start of the new pipeline (the cells below are numbered Cell 1, Cell 2, ...).

In [None]:
# Cell 1 — imports and device
import os, time, math, random
import numpy as np, pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel

# Report the PyTorch version and pick the GPU when one is available.
print("Torch:", torch.__version__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


In [None]:
# Cell 2 — load multilingual GTE (Alibaba)
MODEL_NAME = "Alibaba-NLP/gte-multilingual-base"   # multilingual model (recommended)
print("Loading tokenizer and model:", MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# trust_remote_code may be required for some Alibaba models; include if HF asks for it
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True).to(device)
# Inference only: switch to eval mode (disables dropout).
model.eval()
print("Loaded model ->", MODEL_NAME)


In [None]:
# Cell 3 — load raw data and build combined_text / metric_text
import json

# Load train_data.json and test_data.json (assumes files in working dir)
def _read_json(path):
    """Return the parsed contents of the UTF-8 encoded JSON file at ``path``."""
    with open(path,"r",encoding="utf8") as handle:
        return json.load(handle)


train_raw = _read_json("train_data.json")
test_raw = _read_json("test_data.json")

train_df, test_df = pd.DataFrame(train_raw), pd.DataFrame(test_raw)


def combine_all(row):
    """Concatenate system prompt, user prompt and response; falsy fields become ''."""
    def _field(key):
        value = row.get(key)
        return str(value) if value else ""
    return _field("system_prompt") + " [SYS] " + _field("user_prompt") + " [USR] " + _field("response") + " [RES]"


# Derived text columns, filled for train first and then test (same order as before).
for _frame in (train_df, test_df):
    _frame["combined_text"] = _frame.apply(combine_all, axis=1)
    _frame["metric_text"] = _frame["metric_name"].astype(str)

print("Train shape:", train_df.shape, "Test shape:", test_df.shape)
train_df.head(2)


In [None]:
# Cell 4 — pooling + batched encoder helper
MAX_LEN = 512    # default max token length passed to the tokenizer (longer inputs are truncated)
ENC_BATCH = 32   # reduce to 16 or 8 if OOM

def mean_pool(last_hidden_state, attention_mask):
    """Average token vectors over the positions where ``attention_mask`` is non-zero.

    Args:
        last_hidden_state: tensor of shape (B, L, D).
        attention_mask: tensor of shape (B, L); zeros mark positions to ignore.

    Returns:
        Tensor of shape (B, D). A row whose mask is all zeros yields zeros, because
        the token count is clamped to 1e-9 instead of dividing by zero.
    """
    weights = attention_mask[..., None].expand_as(last_hidden_state).float()
    token_total = (last_hidden_state * weights).sum(dim=1)
    token_count = weights.sum(dim=1).clamp(min=1e-9)
    return token_total / token_count

@torch.no_grad()
def encode_texts(texts, desc="Encode", batch_size=ENC_BATCH, max_len=MAX_LEN):
    """Embed ``texts`` with the global ``tokenizer`` / ``model``.

    Each batch is tokenised (truncated to ``max_len``), mean-pooled over the
    attended tokens and L2-normalised.

    Args:
        texts: list of strings to encode.
        desc: label shown on the progress bar.
        batch_size: number of texts per forward pass.
        max_len: maximum token length per text.

    Returns:
        ``numpy`` array with one unit-length row per input text, in input order.
    """
    chunks = []
    for start in tqdm(range(0, len(texts), batch_size), desc=desc):
        encoded = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
        )
        token_ids = encoded["input_ids"].to(device)
        attn = encoded["attention_mask"].to(device)
        outputs = model(input_ids=token_ids, attention_mask=attn, return_dict=True)
        # (B, L, D) -> (B, D), then scale every row to unit L2 norm.
        unit = F.normalize(mean_pool(outputs.last_hidden_state, attn), p=2, dim=1)
        chunks.append(unit.cpu().numpy())
    return np.vstack(chunks)


In [None]:
# Cell 5 — encode train/test metric and combined texts
# Pull the text columns out of each split as plain lists of str
train_metric_texts, train_combined_texts = (
    train_df[column].astype(str).tolist() for column in ("metric_text", "combined_text")
)
test_metric_texts, test_combined_texts = (
    test_df[column].astype(str).tolist() for column in ("metric_text", "combined_text")
)

# (banner to print, texts to embed, progress-bar label) — processed in this order
encode_jobs = (
    ("Encoding train metric_texts...", train_metric_texts, "train_metric"),
    ("Encoding train combined_texts...", train_combined_texts, "train_text"),
    ("Encoding test metric_texts...", test_metric_texts, "test_metric"),
    ("Encoding test combined_texts...", test_combined_texts, "test_text"),
)
encoded = []
for banner, job_texts, bar_label in encode_jobs:
    print(banner)
    encoded.append(encode_texts(job_texts, desc=bar_label))
train_metric_embs, train_text_embs, test_metric_embs, test_text_embs = encoded

print("Shapes:", train_metric_embs.shape, train_text_embs.shape, test_metric_embs.shape, test_text_embs.shape)

# Persist the embeddings so later cells can reload them without re-encoding
for out_path, emb in (
    ("train_metric_gte_m.npy", train_metric_embs),
    ("train_text_gte_m.npy", train_text_embs),
    ("test_metric_gte_m.npy", test_metric_embs),
    ("test_text_gte_m.npy", test_text_embs),
):
    np.save(out_path, emb)
print("Saved GTE multilingual embeddings.")


In [None]:
# Cell 6 — negative generation (same logic as your earlier code)
# Fixed-seed generator: every permutation / label / noise draw below comes from it,
# so the results depend on the order of the rng calls in this cell.
rng = np.random.default_rng(42)

# Reload the saved train embeddings; labels come from the in-memory train_df.
# NOTE(review): assumes train_df rows are in the same order as the saved arrays — confirm.
train_metric = np.load("train_metric_gte_m.npy")
train_text   = np.load("train_text_gte_m.npy")
y_real       = train_df["score"].values.astype(np.float32)
N = len(train_metric)
print("N =", N)

# 1) Shuffle-based negatives (m same, text permuted)
#    Labels are random integers in {0, 1, 2} (upper bound 3 is exclusive).
#    NOTE(review): a random permutation can map a row to itself, in which case a
#    genuine pair receives a random low label.
perm = rng.permutation(N)
neg_metric_1 = train_metric
neg_text_1   = train_text[perm]
neg_y_1      = rng.integers(0, 3, size=N)

# 2) Noise-corrupted negatives (text + gaussian noise)
#    Zero-mean Gaussian noise with scale 0.6 is added to every text-embedding entry.
noise = rng.normal(scale=0.6, size=train_text.shape).astype(np.float32)
neg_metric_2 = train_metric
neg_text_2   = (train_text + noise).astype(np.float32)
neg_y_2      = rng.integers(0, 3, size=N)

# 3) Metric swap negatives
#    Text stays in place; the metric embedding is taken from a permuted row.
perm2 = rng.permutation(N)
neg_metric_3 = train_metric[perm2]
neg_text_3   = train_text
neg_y_3      = rng.integers(0, 3, size=N)

# Combine everything
# Real rows come first (indices 0..N-1), followed by the three negative sets,
# so the stacked arrays have 4*N rows.
m_all = np.vstack([train_metric, neg_metric_1, neg_metric_2, neg_metric_3]).astype(np.float32)
t_all = np.vstack([train_text,   neg_text_1,   neg_text_2,   neg_text_3]).astype(np.float32)
y_all = np.concatenate([y_real,  neg_y_1,      neg_y_2,      neg_y_3]).astype(np.float32)

print("Combined shapes ->", m_all.shape, t_all.shape, y_all.shape)
# Persist for the feature-building cell
np.save("m_all_gte.npy", m_all)
np.save("t_all_gte.npy", t_all)
np.save("y_all_gte.npy", y_all)
print("Saved m_all_gte.npy, t_all_gte.npy, y_all_gte.npy")


In [None]:
# Cell 7 — build features exactly as before (concat, absdiff, prod, cos)
def build_pair_features(metric_emb, text_emb):
    """Build the pairwise feature matrix for (metric, text) embedding pairs.

    The train and test features must be constructed identically, so both go
    through this single helper instead of two copy-pasted code paths.

    Args:
        metric_emb: array of shape (N, D) with metric embeddings.
        text_emb: array of shape (N, D) with text embeddings, row-aligned with
            `metric_emb`.

    Returns:
        float32 array of shape (N, 4*D + 1): [metric, text, |metric - text|,
        metric * text, cosine similarity].

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    if metric_emb.shape != text_emb.shape:
        raise ValueError(
            f"metric/text embedding shapes differ: {metric_emb.shape} vs {text_emb.shape}"
        )
    # cosine similarity; the epsilon keeps the division finite for zero vectors
    dot = np.sum(metric_emb * text_emb, axis=1)
    norms = (np.linalg.norm(metric_emb, axis=1) * np.linalg.norm(text_emb, axis=1)) + 1e-9
    cos = (dot / norms).reshape(-1, 1).astype(np.float32)

    absdiff = np.abs(metric_emb - text_emb).astype(np.float32)
    prod = (metric_emb * text_emb).astype(np.float32)
    return np.hstack([
        metric_emb.astype(np.float32),
        text_emb.astype(np.float32),
        absdiff,
        prod,
        cos,
    ]).astype(np.float32)


# Training features from the stacked real + negative pairs
m_all = np.load("m_all_gte.npy")
t_all = np.load("t_all_gte.npy")
y_all = np.load("y_all_gte.npy")

X_all = build_pair_features(m_all, t_all)
if len(X_all) != len(y_all):
    raise ValueError(f"feature/label row mismatch: {len(X_all)} vs {len(y_all)}")
print("X_all shape:", X_all.shape, "y_all shape:", y_all.shape)

# Build X_test from test embeddings
test_metric = np.load("test_metric_gte_m.npy")
test_text   = np.load("test_text_gte_m.npy")
X_test = build_pair_features(test_metric, test_text)
if X_test.shape[1] != X_all.shape[1]:
    raise ValueError(f"train/test feature width mismatch: {X_all.shape[1]} vs {X_test.shape[1]}")
print("X_test shape:", X_test.shape)

# Save
np.save("X_all_gte.npy", X_all)
np.save("y_all_gte.npy", y_all)
np.save("X_test_gte.npy", X_test)
print("Saved X_all_gte.npy, y_all_gte.npy, X_test_gte.npy")


In [None]:
# Cell 8 — train NLL MLP on X_all_gte.npy features (11-class classification)
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# hyperparams (tune if needed)
SEED = 42        # seed for the Python, NumPy and PyTorch RNGs and for KFold
NFOLDS = 5       # number of KFold splits
EPOCHS = 12      # maximum epochs per fold
BATCH = 256      # DataLoader batch size
LR = 1e-3        # AdamW learning rate
WD = 1e-5        # AdamW weight decay
PATIENCE = 3     # epochs without val-RMSE improvement before early stopping

for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# Reload the cached feature matrices / labels as float32
X, y, X_test = (
    np.load(npy_path).astype(np.float32)
    for npy_path in ("X_all_gte.npy", "y_all_gte.npy", "X_test_gte.npy")
)

# ensure labels are integer 0..10 (round to nearest, then clamp)
y_int = np.clip(np.rint(y).astype(int), 0, 10)

print("Shapes -> X:", X.shape, "y:", y_int.shape, "X_test:", X_test.shape)

class EmbClsDataset(Dataset):
    """Tensor-backed dataset of feature rows with optional integer class labels.

    Features are stored as a float32 tensor and labels (when given) as a long
    tensor. Indexing returns `(features, label)` when labels exist, otherwise
    just `features`.
    """

    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long) if y is not None else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        if self.y is not None:
            return features, self.y[idx]
        return features

class NLL_MLP(nn.Module):
    """Three-hidden-layer GELU MLP that emits logits over 11 classes.

    Args:
        in_dim: number of input features.
        hidden1: width of the first hidden layer.
        hidden2: width of the second hidden layer.
        dropout: dropout probability applied after the first two activations.
    """

    def __init__(self, in_dim, hidden1=1024, hidden2=512, dropout=0.2):
        super().__init__()
        # Layers are created in forward order so Sequential indices stay the same.
        layers = [
            nn.Linear(in_dim, hidden1), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(hidden1, hidden2), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(hidden2, 128), nn.GELU(),
            nn.Linear(128, 11),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return raw logits of shape (batch, 11)."""
        logits = self.net(x)
        return logits

# K-fold training of NLL_MLP with early stopping on validation RMSE.
# Outputs:
#   oof_probs        — (len(X), 11) class probabilities predicted for each row while held out
#   test_probs_folds — (NFOLDS, len(X_test), 11) test probabilities from each fold's best model
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
oof_probs = np.zeros((len(X), 11), dtype=np.float32)
test_probs_folds = np.zeros((NFOLDS, X_test.shape[0], 11), dtype=np.float32)

# The unlabeled test loader is identical for every fold, so build it once.
test_ds = EmbClsDataset(X_test)
test_dl = DataLoader(test_ds, batch_size=BATCH, shuffle=False, pin_memory=True)

for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n=== Fold {fold} ===")
    X_tr, X_val = X[tr_idx], X[val_idx]
    y_tr, y_val = y_int[tr_idx], y_int[val_idx]

    train_ds = EmbClsDataset(X_tr, y_tr)
    val_ds   = EmbClsDataset(X_val, y_val)

    train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True, pin_memory=True)
    val_dl   = DataLoader(val_ds, batch_size=BATCH, shuffle=False, pin_memory=True)

    # Fresh model / optimizer / schedule per fold
    model_cls = NLL_MLP(in_dim=X.shape[1]).to(device)
    optimizer = torch.optim.AdamW(model_cls.parameters(), lr=LR, weight_decay=WD)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
    criterion = nn.CrossEntropyLoss()

    best_rmse = 1e9; best_state = None; patience = 0

    for ep in range(1, EPOCHS+1):
        model_cls.train()
        running_loss = 0.0
        t0 = time.time()
        for xb, yb in train_dl:
            xb = xb.to(device); yb = yb.to(device)
            optimizer.zero_grad()
            logits = model_cls(xb)
            loss = criterion(logits, yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model_cls.parameters(), 1.0)
            optimizer.step()
            running_loss += loss.item() * xb.size(0)

        # validation: RMSE between the continuous label and the expected class E[k]
        model_cls.eval()
        val_probs = []
        with torch.no_grad():
            for xb, _ in val_dl:
                xb = xb.to(device)
                logits = model_cls(xb)
                probs = torch.softmax(logits, dim=1).cpu().numpy()
                val_probs.append(probs)
        val_probs = np.concatenate(val_probs, axis=0)
        val_exp = (val_probs * np.arange(11)[None,:]).sum(axis=1)
        val_rmse = np.sqrt(mean_squared_error(y[val_idx], val_exp))

        scheduler.step()
        print(f"Epoch {ep}/{EPOCHS} | train_loss={running_loss/len(train_ds):.4f} | val_RMSE={val_rmse:.4f} | time={time.time()-t0:.1f}s")

        if val_rmse < best_rmse - 1e-5:
            best_rmse = val_rmse
            # state_dict() returns live references to the parameter tensors, so each
            # tensor is cloned here; without the copy, later optimizer steps would
            # silently overwrite the "best" weights with the most recent ones.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model_cls.state_dict().items()
            }
            patience = 0
        else:
            patience += 1
            if patience >= PATIENCE:
                print("Early stopping")
                break

    # load best state
    if best_state is None:
        # Happens only if validation RMSE never beat the initial sentinel (e.g. NaN loss).
        raise RuntimeError(
            f"Fold {fold}: validation RMSE never improved; no best checkpoint to restore"
        )
    model_cls.load_state_dict(best_state)
    model_cls.to(device)
    model_cls.eval()

    # OOF probs from the restored best weights
    val_probs = []
    with torch.no_grad():
        for xb, _ in val_dl:
            xb = xb.to(device)
            logits = model_cls(xb)
            val_probs.append(torch.softmax(logits, dim=1).cpu().numpy())
    val_probs = np.concatenate(val_probs, axis=0)
    oof_probs[val_idx] = val_probs

    # fold test probs
    fold_test_probs = []
    with torch.no_grad():
        for xb in test_dl:
            xb = xb.to(device)
            logits = model_cls(xb)
            fold_test_probs.append(torch.softmax(logits, dim=1).cpu().numpy())
    fold_test_probs = np.concatenate(fold_test_probs, axis=0)
    test_probs_folds[fold] = fold_test_probs

    print(f"Fold {fold} best val RMSE: {best_rmse:.4f}")

# Evaluate OOF: expected class value E[k] under the predicted distribution
class_values = np.arange(11)[None, :]
oof_exp = (oof_probs * class_values).sum(axis=1)
oof_rmse = np.sqrt(mean_squared_error(y, oof_exp))
print("\nOOF RMSE (raw):", oof_rmse)

# Calibrate (linear map fitted on OOF expectations) and final test preds
oof_column = oof_exp.reshape(-1, 1)
cal = LinearRegression()
cal.fit(oof_column, y)
oof_cal = cal.predict(oof_column)
oof_rmse_cal = np.sqrt(mean_squared_error(y, oof_cal))
print("OOF RMSE (calibrated):", oof_rmse_cal)
print("Calibration params: a=", cal.coef_[0], "b=", cal.intercept_)

# Average the per-fold test probabilities, take E[k], calibrate, clamp to [0, 10]
test_probs_mean = test_probs_folds.mean(axis=0)
test_exp = (test_probs_mean * class_values).sum(axis=1)
test_exp_cal = np.clip(cal.predict(test_exp.reshape(-1, 1)), 0, 10)

# Save outputs + submission (the saved expectations are the uncalibrated ones)
for out_path, payload in (
    ("oof_probs_gte_m.npy", oof_probs),
    ("test_probs_gte_m.npy", test_probs_mean),
    ("oof_gte_m.npy", oof_exp),
    ("test_gte_m.npy", test_exp),
):
    np.save(out_path, payload)

test_df["ID"] = np.arange(1, len(test_exp_cal)+1)   # 1-based row ids
sub = pd.DataFrame({"ID": test_df["ID"], "score": test_exp_cal})
sub.to_csv("submission_gte_multilingual_nll_mlp.csv", index=False)
print("\nSaved submission_gte_multilingual_nll_mlp.csv")
print("OOF RMSE final (calibrated):", oof_rmse_cal)


END HERE

In [None]:
# STEP 5 — Train MLP with K-Fold CV, get OOF, calibrate, test inference

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import pandas as pd
from tqdm import tqdm

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# Load training features
X = np.load("X_all.npy").astype(np.float32)
y = np.load("y_all.npy").astype(np.float32)

# Load test metric/text embeddings
test_metric = np.load("test_metric_embs.npy")
test_text   = np.load("test_text_embs.npy")

# ---- build test features: [metric, text, |metric - text|, metric * text, cosine] ----
# cosine similarity per row; the epsilon keeps the division finite for zero vectors
dot = (test_metric * test_text).sum(axis=1)
norms = 1e-9 + np.linalg.norm(test_metric, axis=1) * np.linalg.norm(test_text, axis=1)
cos_test = (dot / norms).astype(np.float32)[:, None]

absdiff_test = np.absolute(test_metric - test_text).astype(np.float32)
prod_test    = np.multiply(test_metric, test_text).astype(np.float32)
concat_test  = np.concatenate(
    [test_metric.astype(np.float32), test_text.astype(np.float32)], axis=1
)

X_test = np.concatenate(
    [concat_test, absdiff_test, prod_test, cos_test], axis=1
).astype(np.float32)


# -------------------------------
# PyTorch Dataset
# -------------------------------
class EmbDataset(Dataset):
    """Thin index-aligned wrapper pairing feature rows with their targets.

    `X` and `y` are stored as given (no conversion); item `i` is `(X[i], y[i])`.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features, target = self.X[idx], self.y[idx]
        return features, target


# -------------------------------
# MLP Model definition
# -------------------------------
class MLP(nn.Module):
    """ReLU regression MLP (in_dim → 1024 → 512 → 128 → 1) with dropout 0.2.

    `forward` returns one scalar per input row, i.e. a tensor of shape (batch,).
    """

    def __init__(self, in_dim):
        super().__init__()
        # Layers are created in forward order so Sequential indices stay the same.
        layers = [
            nn.Linear(in_dim, 1024), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(1024, 512), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(512, 128), nn.ReLU(),
            nn.Linear(128, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        # drop the trailing singleton dimension: (batch, 1) -> (batch,)
        return out.squeeze(-1)


# -------------------------------
# Training hyperparameters
# -------------------------------
EPOCHS = 20    # epochs per fold (no early stopping in this cell)
BATCH = 256    # batch size for the DataLoaders and for test inference
LR = 1e-3      # AdamW learning rate
NFOLDS = 5     # number of KFold splits

kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

# oof[i]        — prediction for row i made by the fold model that held it out
# test_preds[k] — test predictions from fold k's best checkpoint
oof = np.zeros(len(X), dtype=np.float32)
test_preds = np.zeros((NFOLDS, len(X_test)), dtype=np.float32)

fold = 0

for train_idx, val_idx in kf.split(X):
    print(f"\n===== Fold {fold} =====")

    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    train_ds = EmbDataset(X_tr, y_tr)
    val_ds   = EmbDataset(X_val, y_val)

    train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True)
    val_dl   = DataLoader(val_ds, batch_size=BATCH, shuffle=False)

    # NOTE(review): this rebinds the notebook-level name `model`; if `model` is also
    # the sentence encoder used by an encoding helper, that helper breaks after this
    # cell runs — confirm and consider a distinct name.
    model = MLP(in_dim=X.shape[1]).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-5)
    criterion = nn.MSELoss()

    # Sentinel larger than any plausible RMSE so the first epoch is always saved
    best_rmse = 999

    for epoch in range(EPOCHS):
        model.train()
        running_loss = 0

        for xb, yb in train_dl:
            xb, yb = xb.to(device), yb.to(device)

            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

            # accumulate the sum of per-sample losses (batch mean * batch size)
            running_loss += loss.item() * xb.size(0)

        # validation
        model.eval()
        val_preds_list = []
        with torch.no_grad():
            for xb, yb in val_dl:
                xb = xb.to(device)
                p = model(xb).detach().cpu().numpy()
                val_preds_list.append(p)

        val_preds = np.concatenate(val_preds_list)
        val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))

        print(f"Epoch {epoch+1}/{EPOCHS} - TrainLoss: {running_loss/len(train_ds):.4f}  ValRMSE: {val_rmse:.4f}")

        # Checkpoint to disk whenever validation RMSE improves.
        # NOTE(review): if val_rmse is never below the sentinel (e.g. NaN), nothing is
        # written and the load below picks up a stale file from an earlier run or fails.
        if val_rmse < best_rmse:
            best_rmse = val_rmse
            torch.save(model.state_dict(), f"mlp_fold{fold}.pt")

    # load best model (the on-disk checkpoint with the lowest validation RMSE)
    model.load_state_dict(torch.load(f"mlp_fold{fold}.pt"))
    model.eval()

    # generate OOF preds
    val_preds_list = []
    with torch.no_grad():
        for xb, yb in val_dl:
            xb = xb.to(device)
            p = model(xb).detach().cpu().numpy()
            val_preds_list.append(p)

    # val_dl is not shuffled, so predictions line up with val_idx
    oof[val_idx] = np.concatenate(val_preds_list)

    # test predictions (manual batching straight from the numpy array)
    test_pred_list = []
    with torch.no_grad():
        for i in range(0, len(X_test), BATCH):
            xb = torch.from_numpy(X_test[i:i+BATCH]).to(device)
            p = model(xb).detach().cpu().numpy()
            test_pred_list.append(p)

    test_preds[fold] = np.concatenate(test_pred_list)

    print(f"Fold {fold} best RMSE: {best_rmse:.4f}")

    fold += 1


# -------------------------------
# Calibration (Label shift fix!)
# -------------------------------
oof_rmse = np.sqrt(mean_squared_error(y, oof))
print("\nOOF RMSE before calibration:", oof_rmse)

# Fit the linear map y ≈ a * prediction + b on the out-of-fold predictions
oof_column = oof.reshape(-1, 1)
lr = LinearRegression()
lr.fit(oof_column, y)
a, b = float(lr.coef_[0]), float(lr.intercept_)
print("Calibration: y ≈ a*x + b =", a, b)

oof_cal = lr.predict(oof_column)
oof_cal_rmse = np.sqrt(mean_squared_error(y, oof_cal))
print("OOF RMSE after calibration:", oof_cal_rmse)


# -------------------------------
# Final test predictions: fold average -> calibration -> clamp to [0, 10]
# -------------------------------
test_pred_raw = test_preds.mean(axis=0)
test_pred_cal = np.clip(lr.predict(test_pred_raw.reshape(-1, 1)), 0, 10).reshape(-1)


# -------------------------------
# Save submission
# -------------------------------

# test_df must already be in memory (it was used to create the embeddings).
# IDs run 1..N to match sample_submission.csv.
test_df["ID"] = np.arange(1, len(test_df) + 1)

sub = pd.DataFrame({"ID": test_df["ID"], "score": test_pred_cal})

sub.to_csv("submission_mlp_calibrated.csv", index=False)
print("\nSaved submission_mlp_calibrated.csv")


In [None]:
# ======================================================
# STEP 5 — Train Strong MLP with K-Fold CV + Calibration
# ======================================================

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import pandas as pd
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# ======================================================
# Load train features + labels (cast to float32)
# ======================================================
X, y = (np.load(npy_path).astype(np.float32) for npy_path in ("X_all.npy", "y_all.npy"))

# ======================================================
# Load test embeddings + build test features
#   layout: [metric, text, |metric - text|, metric * text, cosine]
# ======================================================
test_metric = np.load("test_metric_embs.npy")
test_text   = np.load("test_text_embs.npy")

# cosine similarity per row; the epsilon keeps the division finite for zero vectors
dot = (test_metric * test_text).sum(axis=1)
norms = 1e-9 + np.linalg.norm(test_metric, axis=1) * np.linalg.norm(test_text, axis=1)
cos_test = (dot / norms).astype(np.float32)[:, None]

absdiff_test = np.absolute(test_metric - test_text).astype(np.float32)
prod_test    = np.multiply(test_metric, test_text).astype(np.float32)
concat_test  = np.concatenate(
    [test_metric.astype(np.float32), test_text.astype(np.float32)], axis=1
)

X_test = np.concatenate(
    [concat_test, absdiff_test, prod_test, cos_test], axis=1
).astype(np.float32)


# ======================================================
# Dataset Class
# ======================================================
class EmbDataset(Dataset):
    """Index-aligned (features, target) pairs; inputs are kept exactly as passed in."""

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        n_rows = len(self.X)
        return n_rows

    def __getitem__(self, idx):
        row = self.X[idx]
        target = self.y[idx]
        return row, target


# ======================================================
# Strong MLP (Residual + LayerNorm + GELU)
# ======================================================
class ResBlock(nn.Module):
    """Pre-norm residual block: x + lin2(drop(act(lin1(norm(x))))).

    Args:
        dim: feature width (input and output widths are equal).
        drop: dropout probability applied after the activation.
    """

    def __init__(self, dim, drop=0.15):
        super().__init__()
        # Sub-modules are registered in the same order as before so parameter
        # initialisation and state_dict key order are unchanged.
        self.lin1 = nn.Linear(dim, dim)
        self.act = nn.GELU()
        self.lin2 = nn.Linear(dim, dim)
        self.norm = nn.LayerNorm(dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        branch = self.lin1(self.norm(x))
        branch = self.drop(self.act(branch))
        return x + self.lin2(branch)


class StrongMLP(nn.Module):
    """Residual regression MLP: input projection → 6 ResBlocks → LayerNorm head.

    The hidden width is fixed at 512. `forward` returns one scalar per input
    row, i.e. a tensor of shape (batch,).
    """

    def __init__(self, in_dim):
        super().__init__()
        hidden = 512
        n_blocks = 6

        # project the raw features to the hidden width
        self.input = nn.Sequential(
            nn.Linear(in_dim, hidden), nn.GELU(), nn.Dropout(0.1)
        )

        # stack of residual blocks, all at the hidden width
        self.blocks = nn.Sequential(*[ResBlock(hidden) for _ in range(n_blocks)])

        # normalise, narrow to 128, then regress a single value
        self.output = nn.Sequential(
            nn.LayerNorm(hidden),
            nn.Linear(hidden, 128), nn.GELU(), nn.Dropout(0.1),
            nn.Linear(128, 1),
        )

    def forward(self, x):
        hidden_repr = self.blocks(self.input(x))
        # drop the trailing singleton dimension: (batch, 1) -> (batch,)
        return self.output(hidden_repr).squeeze(-1)


# ======================================================
# Training Hyperparameters
# ======================================================
EPOCHS = 30        # deeper model → more epochs
BATCH = 256        # batch size for the DataLoaders and for test inference
LR = 2e-4          # smaller LR → more stable
NFOLDS = 5         # number of KFold splits

kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

# oof[i]        — prediction for row i made by the fold model that held it out
# test_preds[k] — test predictions from fold k's best checkpoint
oof = np.zeros(len(X), dtype=np.float32)
test_preds = np.zeros((NFOLDS, len(X_test)), dtype=np.float32)

fold = 0

# ======================================================
# K-Fold Training Loop
# ======================================================
for train_idx, val_idx in kf.split(X):
    print(f"\n================ Fold {fold} ================")

    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    train_ds = EmbDataset(X_tr, y_tr)
    val_ds   = EmbDataset(X_val, y_val)

    train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True)
    val_dl   = DataLoader(val_ds, batch_size=BATCH, shuffle=False)

    # NOTE(review): this rebinds the notebook-level name `model`; if `model` is also
    # the sentence encoder used by an encoding helper, that helper breaks after this
    # cell runs — confirm and consider a distinct name.
    model = StrongMLP(in_dim=X.shape[1]).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
    criterion = nn.MSELoss()

    # Sentinel larger than any plausible RMSE so the first epoch is always saved
    best_rmse = 999

    # ------------------ Train Epochs ------------------
    for epoch in range(EPOCHS):
        model.train()
        train_loss = 0

        for xb, yb in train_dl:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()

            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

            # accumulate the sum of per-sample losses (batch mean * batch size)
            train_loss += loss.item() * xb.size(0)

        # ------------------ Validation ------------------
        model.eval()
        val_list = []
        with torch.no_grad():
            for xb, yb in val_dl:
                xb = xb.to(device)
                output = model(xb).detach().cpu().numpy()
                val_list.append(output)

        val_preds = np.concatenate(val_list)
        val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))

        print(f"Epoch {epoch+1}/{EPOCHS}  "
              f"TrainLoss={train_loss/len(train_ds):.4f}  "
              f"ValRMSE={val_rmse:.4f}")

        # Checkpoint to disk whenever validation RMSE improves.
        # NOTE(review): if val_rmse is never below the sentinel (e.g. NaN), nothing is
        # written and the load below picks up a stale file from an earlier run or fails.
        if val_rmse < best_rmse:
            best_rmse = val_rmse
            torch.save(model.state_dict(), f"strong_mlp_fold{fold}.pt")

    print(f"Fold {fold} BEST RMSE = {best_rmse:.4f}")

    # Load best model (the on-disk checkpoint with the lowest validation RMSE)
    model.load_state_dict(torch.load(f"strong_mlp_fold{fold}.pt"))
    model.eval()

    # -------- OOF predictions --------
    val_list = []
    with torch.no_grad():
        for xb, yb in val_dl:
            xb = xb.to(device)
            out = model(xb).detach().cpu().numpy()
            val_list.append(out)

    # val_dl is not shuffled, so predictions line up with val_idx
    oof[val_idx] = np.concatenate(val_list)

    # -------- Test predictions --------
    # manual batching straight from the numpy array
    test_list = []
    with torch.no_grad():
        for i in range(0, len(X_test), BATCH):
            xb = torch.from_numpy(X_test[i:i+BATCH]).to(device)
            out = model(xb).detach().cpu().numpy()
            test_list.append(out)

    test_preds[fold] = np.concatenate(test_list)

    fold += 1


# ======================================================
# Calibration
# ======================================================
oof_rmse_raw = np.sqrt(mean_squared_error(y, oof))
print("\nOOF RMSE (raw):", oof_rmse_raw)

# Linear calibration y ≈ a * prediction + b, fitted on the out-of-fold predictions.
# NOTE(review): if X_all/y_all contain synthetic negatives, the calibration (and the
# OOF RMSE) are computed on them as well — confirm that is intended.
lr = LinearRegression().fit(oof.reshape(-1,1), y)
a = float(lr.coef_[0]); b = float(lr.intercept_)
print("Calibration params:  a=", a, " b=", b)

oof_cal = lr.predict(oof.reshape(-1,1))
oof_rmse_cal = np.sqrt(mean_squared_error(y, oof_cal))
print("OOF RMSE (calibrated):", oof_rmse_cal)


# ======================================================
# Final Test Prediction
# ======================================================
# Named `test_pred_raw` rather than `test_raw`: `test_raw` is the raw test JSON
# loaded by the data-loading cells, and rebinding it to an array of predictions
# would corrupt any later `pd.DataFrame(test_raw)` that runs without reloading.
test_pred_raw = test_preds.mean(axis=0)            # average over folds
test_cal = lr.predict(test_pred_raw.reshape(-1,1))
test_cal = np.clip(test_cal, 0, 10).reshape(-1)    # keep scores inside [0, 10]


# ======================================================
# Save Submission
# ======================================================
# test_df must already be in memory; IDs run 1..N in row order.
test_df["ID"] = np.arange(1, len(test_df)+1)

sub = pd.DataFrame({
    "ID": test_df["ID"],
    "score": test_cal
})

sub.to_csv("submission_strongmlp_calibrated.csv", index=False)
print("\nSaved submission_strongmlp_calibrated.csv")


In [None]:
# Full hybrid training: CE + Pairwise RankNet + SWA + EMA (KFold)
# Paste this whole cell and run (assumes X_all.npy, y_all.npy, test_metric_embs.npy, test_text_embs.npy exist)

import os, time, math, random
import numpy as np, pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from torch.optim.swa_utils import AveragedModel, SWALR

# -----------------------
# Config / hyperparams
# -----------------------
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("Device:", device)

SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

NFOLDS = 5
EPOCHS = 18               # you can increase (SWA helps)
BATCH = 256
LR = 3e-4
WEIGHT_DECAY = 1e-5
PAIRWISE_WEIGHT = 0.5     # relative weight for pairwise loss vs CE
SWA_START_FRAC = 0.7      # start SWA after this fraction of epochs
EMA_DECAY = 0.999         # EMA decay for parameter averaging

NUM_CLASSES = 11          # integer score classes 0..10

# -----------------------
# Load data (features already computed)
# -----------------------
X = np.load("X_all.npy").astype(np.float32)
y = np.load("y_all.npy").astype(np.float32)   # continuous labels 0..10
# Integer class targets for CE: round to nearest, then clamp into [0, NUM_CLASSES-1]
y_int = np.clip(np.rint(y).astype(int), 0, NUM_CLASSES-1)

# Test features: [metric, text, |metric - text|, metric * text, cosine]
test_metric = np.load("test_metric_embs.npy")
test_text   = np.load("test_text_embs.npy")

# cosine similarity per row; the epsilon keeps the division finite for zero vectors
dot = (test_metric * test_text).sum(axis=1)
norms = 1e-9 + np.linalg.norm(test_metric, axis=1) * np.linalg.norm(test_text, axis=1)
cos_test = (dot / norms).astype(np.float32)[:, None]
absdiff_test = np.absolute(test_metric - test_text).astype(np.float32)
prod_test = np.multiply(test_metric, test_text).astype(np.float32)
concat_test = np.concatenate(
    [test_metric.astype(np.float32), test_text.astype(np.float32)], axis=1
)
X_test = np.concatenate(
    [concat_test, absdiff_test, prod_test, cos_test], axis=1
).astype(np.float32)

print("Shapes -> X:", X.shape, "y:", y_int.shape, "X_test:", X_test.shape)

# -----------------------
# Dataset
# -----------------------
class EmbDataset(Dataset):
    """Feature rows with optional targets; inputs are stored exactly as given.

    Indexing yields `(X[i], y[i])` when targets were supplied, else `X[i]` alone.
    """

    def __init__(self, X, y=None):
        self.X, self.y = X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        if self.y is not None:
            return features, self.y[idx]
        return features

# -----------------------
# Model: shallow expressive MLP
# -----------------------
class ShallowMLP(nn.Module):
    """Two-layer LayerNorm/GELU MLP followed by a linear head over NUM_CLASSES logits.

    Args:
        in_dim: number of input features.
        hidden: width of the first layer; the second layer is `hidden // 2` wide.
        dropout: dropout probability applied after each activation.
    """

    def __init__(self, in_dim, hidden=512, dropout=0.12):
        super().__init__()
        half = hidden//2
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden), nn.LayerNorm(hidden), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(hidden, half), nn.LayerNorm(half), nn.GELU(), nn.Dropout(dropout),
        )
        self.head = nn.Linear(half, NUM_CLASSES)  # one logit per class

    def forward(self, x):
        """Return raw class logits of shape (batch, NUM_CLASSES)."""
        return self.head(self.net(x))

# -----------------------
# Pairwise RankNet loss helper
#   Uses expected score (E[y] from softmax) as scalar "s".
#   For pairs (i,j) create label t = 1 if y_i > y_j else 0.
#   Loss = BCEWithLogitsLoss(s_i - s_j, t)
# -----------------------
bce_logits = nn.BCEWithLogitsLoss(reduction='mean')

def pairwise_rank_loss_from_logits(logits, targets, max_pairs=1024):
    """RankNet-style pairwise loss on the expected class score.

    Each sample's scalar score is s = E[class] under softmax(logits). Random pairs
    (i, j) with different targets are drawn from the batch and scored with
    BCE-with-logits on (s_i - s_j) against the label [y_i > y_j].

    The scores are computed with autograd enabled. Computing them under
    `torch.no_grad()` on detached logits would make the loss a constant with
    respect to the model, so the ranking term would never influence training.

    Args:
        logits: tensor (B, C) of class logits.
        targets: tensor (B,) of integer classes.
        max_pairs: upper bound on the number of pairs entering the loss.

    Returns:
        Scalar tensor attached to the autograd graph of `logits`. It equals 0 when
        the batch has fewer than two rows or no sampled pair has differing targets.
    """
    # Graph-connected zero, so callers can always call .backward() on the result.
    zero = logits.sum() * 0.0

    B = logits.shape[0]
    if B < 2:
        return zero

    # expected scalar scores (E[class]); gradient flows back through the softmax
    classes = torch.arange(0, logits.shape[1], dtype=torch.float32, device=logits.device)
    exp_scores = (torch.softmax(logits, dim=1) * classes[None, :]).sum(dim=1)  # (B,)

    # never ask for more pairs than the batch can form
    max_pairs = min(max_pairs, B*(B-1)//2)

    # Random pairing: two independent permutations of the batch give B candidate pairs
    i_idx = torch.randperm(B, device=logits.device)
    j_idx = torch.randperm(B, device=logits.device)
    y_i = targets[i_idx]
    y_j = targets[j_idx]

    # Keep only pairs whose targets differ (this also drops self-pairs)
    mask = (y_i != y_j)
    if not mask.any():
        return zero

    s_diff = exp_scores[i_idx][mask] - exp_scores[j_idx][mask]   # logit of P(i ranks above j)
    t = (y_i[mask] > y_j[mask]).float()                          # 1 if i should rank above j
    if s_diff.numel() > max_pairs:
        keep = torch.randperm(s_diff.numel(), device=logits.device)[:max_pairs]
        s_diff = s_diff[keep]
        t = t[keep]
    return bce_logits(s_diff, t)

# -----------------------
# KFold loop with SWA + EMA
# -----------------------
# K-fold training of ShallowMLP with CE + pairwise loss.
# Per fold two weight averages are maintained next to the live model:
#   * SWA  - updated once per epoch, only for epochs > swa_start
#   * EMA  - shadow copy on CPU, updated after every optimizer step
# OOF and test probabilities are produced with the SWA weights when at least
# one SWA update happened; otherwise the best-validation weights are used.
# (An AveragedModel that was never updated still holds a copy of the *initial*
# weights, so it must not be used for prediction in that case.)
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

oof_probs = np.zeros((len(X), NUM_CLASSES), dtype=np.float32)
test_probs_folds = np.zeros((NFOLDS, X_test.shape[0], NUM_CLASSES), dtype=np.float32)

for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n=== Fold {fold} ===")

    X_tr, X_val = X[tr_idx], X[val_idx]
    y_tr, y_val = y_int[tr_idx], y_int[val_idx]

    train_ds = EmbDataset(X_tr, y_tr)
    val_ds = EmbDataset(X_val, y_val)
    test_ds = EmbDataset(X_test)

    train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True, pin_memory=True)
    val_dl = DataLoader(val_ds, batch_size=BATCH, shuffle=False, pin_memory=True)
    test_dl = DataLoader(test_ds, batch_size=BATCH, shuffle=False, pin_memory=True)

    model = ShallowMLP(in_dim=X.shape[1]).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

    # SWA setup
    swa_start = int(EPOCHS * SWA_START_FRAC)
    swa_model = AveragedModel(model)
    swa_updates = 0  # how many times the SWA average was actually updated

    # EMA shadow weights (kept on CPU)
    ema_shadow = {name: param.detach().cpu().clone() for name, param in model.named_parameters()}
    ema_n = 0

    # criterion CE
    ce_loss = nn.CrossEntropyLoss()

    best_val_rmse = 1e9
    best_state = None
    patience = 0
    EARLY_STOP = 4

    # LR scheduler: cosine annealing
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

    for ep in range(1, EPOCHS+1):
        model.train()
        t0 = time.time()
        total_loss = 0.0
        total_ce = 0.0
        total_pair = 0.0
        n_samples = 0

        for xb, yb in train_dl:
            xb = xb.to(device); yb = yb.to(device)
            optimizer.zero_grad()
            logits = model(xb)                       # (B, C)
            loss_ce = ce_loss(logits, yb)
            loss_pair = pairwise_rank_loss_from_logits(logits, yb, max_pairs=512)
            loss = loss_ce + PAIRWISE_WEIGHT * loss_pair
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # update EMA shadow after every optimizer step
            ema_n += 1
            with torch.no_grad():
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        s = ema_shadow[name]
                        cur = param.detach().cpu()
                        ema_shadow[name] = EMA_DECAY * s + (1.0 - EMA_DECAY) * cur

            # running sums are weighted by batch size so the epoch averages are per-sample
            total_loss += float(loss.item()) * xb.size(0)
            total_ce += float(loss_ce.item()) * xb.size(0)
            total_pair += float(loss_pair.item()) * xb.size(0)
            n_samples += xb.size(0)

        scheduler.step()
        # fold the current weights into the SWA average once the SWA phase has begun
        if ep > swa_start:
            swa_model.update_parameters(model)
            swa_updates += 1

        # validation - use current model for validation
        model.eval()
        val_probs_list = []
        with torch.no_grad():
            for xb, yb in val_dl:
                xb = xb.to(device)
                logits = model(xb)
                probs = torch.softmax(logits, dim=1).cpu().numpy()
                val_probs_list.append(probs)
        val_probs = np.concatenate(val_probs_list, axis=0)
        # expected class index under the predicted distribution, scored against `y`
        val_exp = (val_probs * np.arange(NUM_CLASSES)[None,:]).sum(axis=1)
        val_rmse = np.sqrt(mean_squared_error(y[val_idx], val_exp))

        avg_loss = total_loss / n_samples
        print(f"Epoch {ep}/{EPOCHS} | loss={avg_loss:.4f} (CE={total_ce/n_samples:.4f} Pair={total_pair/n_samples:.4f}) | val_RMSE={val_rmse:.4f} | time={time.time()-t0:.1f}s")

        # early save best model by val_rmse
        if val_rmse + 1e-6 < best_val_rmse:
            best_val_rmse = val_rmse
            best_state = {k:v.cpu().clone() for k,v in model.state_dict().items()}
            patience = 0
        else:
            patience += 1
            if patience >= EARLY_STOP and ep > 6:
                print("Early stopping triggered")
                break

    # Finished epochs for this fold
    print(f">>> Fold {fold} training done. Best val RMSE: {best_val_rmse:.4f}")

    # Pick the weights used for OOF/test prediction. SWA is only meaningful if it
    # was updated at least once (early stopping can end training before swa_start).
    if swa_updates > 0:
        final_state, final_src = swa_model.module.state_dict(), "SWA"
    elif best_state is not None:
        final_state, final_src = best_state, "best-epoch"
    else:
        # no epoch ever improved on the initial best_val_rmse (e.g. NaN RMSE)
        final_state, final_src = model.state_dict(), "last-epoch"
    print(f"Fold {fold}: predicting with {final_src} weights (swa_updates={swa_updates})")

    eval_model = ShallowMLP(in_dim=X.shape[1]).to(device)
    eval_model.load_state_dict(final_state)
    # The model uses LayerNorm only, so no BatchNorm statistics need refreshing.

    # ALSO prepare EMA-evaluated model by copying shadow into a model
    # (built for inspection; it is not used for the predictions below)
    ema_model = ShallowMLP(in_dim=X.shape[1]).to(device)
    ema_state = ema_model.state_dict()
    for name in ema_state.keys():
        if name in ema_shadow:
            ema_state[name] = ema_shadow[name].to(device)
    ema_model.load_state_dict(ema_state)

    # OOF prediction with eval_model
    eval_model.eval()
    val_probs_list = []
    with torch.no_grad():
        for xb, yb in val_dl:
            xb = xb.to(device)
            logits = eval_model(xb)
            probs = torch.softmax(logits, dim=1).cpu().numpy()
            val_probs_list.append(probs)
    val_probs = np.concatenate(val_probs_list, axis=0)
    oof_probs[val_idx] = val_probs

    # Test prediction for this fold using eval_model
    test_fold_list = []
    with torch.no_grad():
        for xb in test_dl:
            # batches are already float32 tensors (X_test is float32)
            xb = xb.to(device)
            logits = eval_model(xb)
            probs = torch.softmax(logits, dim=1).cpu().numpy()
            test_fold_list.append(probs)
    test_fold_preds = np.concatenate(test_fold_list, axis=0)
    test_probs_folds[fold] = test_fold_preds

    # Save fold model for safety
    torch.save(eval_model.state_dict(), f"ranknet_swa_fold{fold}.pt")
    print(f"Saved ranknet_swa_fold{fold}.pt")

# -----------------------
# OOF eval and calibration
# -----------------------
# Collapse class probabilities to a scalar score: the expected class index.
oof_exp = (oof_probs * np.arange(NUM_CLASSES).reshape(1, -1)).sum(axis=1)
oof_rmse_raw = np.sqrt(mean_squared_error(y, oof_exp))
print("\nOOF RMSE (raw):", oof_rmse_raw)

# Linear calibration (score -> target) fitted on the out-of-fold predictions
cal = LinearRegression()
cal.fit(oof_exp.reshape(-1, 1), y)
oof_cal = cal.predict(oof_exp.reshape(-1, 1))
oof_rmse_cal = np.sqrt(mean_squared_error(y, oof_cal))
print("OOF RMSE (calibrated):", oof_rmse_cal)
print("Calibration params: a=", cal.coef_[0], "b=", cal.intercept_)

# -----------------------
# Final test predictions
# -----------------------
# Average fold probabilities, take the expected class, calibrate, clip to [0, 10].
test_probs_mean = test_probs_folds.mean(axis=0)
test_exp = (test_probs_mean * np.arange(NUM_CLASSES).reshape(1, -1)).sum(axis=1)
test_exp_cal = np.clip(cal.predict(test_exp.reshape(-1, 1)), 0, 10).reshape(-1)

# Save submission
# Reload the test frame only if an earlier cell has not left it in memory.
if "test_df" not in globals():
    import json
    with open("test_data.json","r",encoding="utf8") as f:
        test_raw = json.load(f)
    test_df = pd.DataFrame(test_raw)

# IDs are assigned positionally, starting at 1.
test_df["ID"] = np.arange(1, len(test_exp_cal)+1)
sub = pd.DataFrame({"ID": test_df["ID"], "score": test_exp_cal})
sub.to_csv("submission_ranknet_swa_ema.csv", index=False)
print("\nSaved submission_ranknet_swa_ema.csv")
print("OOF RMSE final (calibrated):", oof_rmse_cal)


In [None]:
# FULL: Train MSE + KL(hist) with SWA+EMA, quantile map postprocess, save submission
# Paste and run in the notebook where your embeddings/features exist.

import os, time, math, random
import numpy as np, pandas as pd
from tqdm import tqdm
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from torch.optim.swa_utils import AveragedModel, SWALR

# ----------------- CONFIG -----------------
# Reproducibility: seed Python, NumPy and PyTorch with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is visible, otherwise fall back to CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", DEVICE)

# Cross-validation / optimisation settings
NFOLDS = 5
EPOCHS = 22           # increase if you have time
BATCH = 256
LR = 2.5e-4
WEIGHT_DECAY = 1e-5

# Histogram-matching loss settings
NUM_BINS = 200        # histogram resolution
SIGMA_BIN = 0.15      # RBF width for soft histogram
LAMBDA_KL = 0.06      # weight of KL term vs MSE (tune!)

# Weight-averaging settings
SWA_START_FRAC = 0.7  # start SWA after this fraction of epochs
EMA_DECAY = 0.999

# ----------------- Load features -----------------
# Load the precomputed training matrix and targets from disk.
X = np.load("X_all.npy").astype(np.float32)   # (N, D)
y = np.load("y_all.npy").astype(np.float32)   # continuous 0..10

# Test features are rebuilt from the saved embeddings.
# Column layout: [metric | text | |metric - text| | metric * text | cosine].
test_metric = np.load("test_metric_embs.npy")
test_text = np.load("test_text_embs.npy")

# Row-wise cosine similarity; the 1e-9 keeps the division finite for zero-norm rows.
dot = (test_metric * test_text).sum(axis=1)
norms = np.linalg.norm(test_metric, axis=1) * np.linalg.norm(test_text, axis=1) + 1e-9
cos_test = (dot / norms).astype(np.float32)[:, None]

absdiff_test = np.abs(test_metric - test_text).astype(np.float32)
prod_test = (test_metric * test_text).astype(np.float32)
concat_test = np.concatenate(
    [test_metric.astype(np.float32), test_text.astype(np.float32)], axis=1
)

X_test = np.concatenate(
    [concat_test, absdiff_test, prod_test, cos_test], axis=1
).astype(np.float32)

print("X", X.shape, "y", y.shape, "X_test", X_test.shape)

# ----------------- Target histogram (friend-like multimodal) -----------------
# If you have an explicit target PDF from your friend, replace the construction below.
def build_target_pdf(num_bins=NUM_BINS):
    """Build a hand-tuned, multimodal target distribution on [0, 10].

    The density is a sum of three Gaussian bumps (a narrow one at 0.9, a wider
    one at 8.6 and a small broad one at 4.0), floored at 1e-12 and normalised
    to sum to 1 over the bins.

    Args:
        num_bins: number of equal-width bins covering [0, 10].

    Returns:
        (centers, pdf): two float32 arrays of length `num_bins`.
    """
    edges = np.linspace(0, 10, num_bins + 1)
    centers = 0.5 * (edges[:-1] + edges[1:])

    def bump(mu, width):
        # Unnormalised Gaussian evaluated at the bin centres.
        return np.exp(-0.5 * ((centers - mu) / width) ** 2)

    # Amplitudes/widths are tuning knobs - adjust to match the desired shape.
    pdf = bump(0.9, 0.25) + bump(8.6, 0.6) + 0.08 * bump(4.0, 1.3)
    pdf = np.maximum(pdf, 1e-12)   # keep every bin strictly positive
    pdf = pdf / pdf.sum()
    return centers.astype(np.float32), pdf.astype(np.float32)

# Module-level copies: NumPy versions for post-processing, tensors for the loss.
BIN_CENTERS, TARGET_PDF = build_target_pdf(NUM_BINS)
TARGET_PDF_T = torch.tensor(TARGET_PDF, dtype=torch.float32, device=DEVICE)
BIN_CENTERS_T = torch.tensor(BIN_CENTERS, dtype=torch.float32, device=DEVICE)

# ----------------- Helpers: soft histogram + KL loss -----------------
def soft_histogram_torch(preds, bin_centers_t, sigma=SIGMA_BIN):
    """Differentiable histogram of `preds` using a Gaussian kernel per bin.

    Args:
        preds: tensor of predictions, shape (B,).
        bin_centers_t: tensor of bin centres, shape (M,).
        sigma: kernel width, in the same units as `preds`.

    Returns:
        Tensor of shape (M,), normalised so its entries sum to ~1.
    """
    offsets = preds[:, None] - bin_centers_t[None, :]     # (B, M)
    weights = torch.exp(-0.5 * (offsets / sigma) ** 2)    # (B, M)
    mass = weights.sum(dim=0)                             # (M,)
    # The small constant guards against division by zero.
    return mass / (mass.sum() + 1e-12)

def kl_hist_loss_torch(preds, target_pdf_t=TARGET_PDF_T, bin_centers_t=BIN_CENTERS_T, sigma=SIGMA_BIN):
    """Divergence between the target PDF and the soft histogram of `preds`.

    `F.kl_div(input, target)` expects `input` as log-probabilities and computes
    sum(target * (log(target) - input)). With `input = log(hist)` this is
    KL(target || hist), i.e. the target distribution is the reference.

    Both tensors are 1-D with one entry per bin, so `reduction='batchmean'`
    divides the summed divergence by the number of bins rather than by the
    number of samples. The returned value is therefore KL / M; `LAMBDA_KL`
    multiplies this scaled value.

    Args:
        preds: tensor of predictions, shape (B,).
        target_pdf_t: target probabilities per bin, shape (M,).
        bin_centers_t: bin centres, shape (M,).
        sigma: kernel width passed to `soft_histogram_torch`.

    Returns:
        Scalar tensor.
    """
    log_hist = torch.log(soft_histogram_torch(preds, bin_centers_t, sigma) + 1e-12)
    return F.kl_div(log_hist, target_pdf_t, reduction='batchmean')

# ----------------- Dataset -----------------
class EmbDataset(Dataset):
    """Map-style dataset over a feature array with optional targets.

    Indexing yields `X[idx]` alone when no targets were supplied, otherwise
    the pair `(X[idx], y[idx])`.
    """

    def __init__(self, X, y=None):
        self.X, self.y = X, y

    def __len__(self):
        # Length is defined by the feature container only.
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        if self.y is not None:
            return features, self.y[idx]
        return features

# ----------------- Shallow MLP model (robust) -----------------
class RobustMLP(nn.Module):
    """MLP regressor: two (Linear -> LayerNorm -> GELU -> Dropout) stages and a
    final Linear layer with a single output.

    Args:
        in_dim: number of input features.
        hidden1: width of the first hidden stage.
        hidden2: width of the second hidden stage.
        drop: dropout probability used in both stages.

    All layers live in one `nn.Sequential` named `net`, in the order listed
    above, so parameter keys are `net.0.*`, `net.1.*`, `net.4.*`, ...
    """
    def __init__(self, in_dim, hidden1=768, hidden2=256, drop=0.12):
        super().__init__()
        widths = [in_dim, hidden1, hidden2]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers += [
                nn.Linear(fan_in, fan_out),
                nn.LayerNorm(fan_out),
                nn.GELU(),
                nn.Dropout(drop),
            ]
        layers.append(nn.Linear(hidden2, 1))  # regression output
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return one prediction per row; the trailing size-1 axis is dropped."""
        scores = self.net(x)
        return scores.squeeze(-1)

# ----------------- Training hyperparams -----------------
# K-fold training of RobustMLP with loss = MSE + LAMBDA_KL * histogram-KL.
# Per fold two weight averages are kept next to the live model:
#   * SWA - updated once per epoch, only for epochs > swa_start
#   * EMA - shadow copy on CPU, updated after every optimizer step
# `oof` receives the validation predictions of each fold; `test_preds_folds`
# holds one row of test predictions per fold.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
oof = np.zeros(len(X), dtype=np.float32)
test_preds_folds = np.zeros((NFOLDS, X_test.shape[0]), dtype=np.float32)

fold = 0
for tr_idx, val_idx in kf.split(X):
    print("\n========== Fold", fold, "==========")
    X_tr, X_val = X[tr_idx], X[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    train_ds = EmbDataset(X_tr, y_tr)
    val_ds   = EmbDataset(X_val, y_val)
    test_ds  = EmbDataset(X_test)

    # Only the training loader shuffles; val/test keep row order so predictions
    # can be written back by index.
    train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True, pin_memory=True)
    val_dl   = DataLoader(val_ds, batch_size=BATCH, shuffle=False, pin_memory=True)
    test_dl  = DataLoader(test_ds, batch_size=BATCH, shuffle=False, pin_memory=True)

    # Fresh model, optimizer and cosine LR schedule for every fold.
    model = RobustMLP(in_dim=X.shape[1]).to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=EPOCHS)

    # SWA & EMA setup
    swa_start = int(EPOCHS * SWA_START_FRAC)
    swa_model = AveragedModel(model)
    # Number of SWA updates performed; 0 means the SWA weights were never
    # averaged and must not be used for prediction (checked after training).
    swa_count = 0

    # EMA shadow: CPU copies of the parameters, initialised to the starting weights
    ema_shadow = {name: param.detach().cpu().clone() for name, param in model.named_parameters()}
    # Counts optimizer steps; incremented below but not read anywhere in this loop.
    ema_n = 0

    best_val_rmse = 1e9
    best_state = None
    patience = 0
    EARLY_STOP = 4

    mse_loss = nn.MSELoss()

    for ep in range(1, EPOCHS+1):
        model.train()
        t0 = time.time()
        total_loss = 0.0
        total_mse = 0.0
        total_kl = 0.0
        ns = 0

        for xb, yb in train_dl:
            xb = xb.to(DEVICE); yb = yb.to(DEVICE)
            opt.zero_grad()
            preds = model(xb)                 # (B,)
            loss_mse = mse_loss(preds, yb)
            # Pulls the distribution of this batch's predictions towards TARGET_PDF_T.
            loss_kl = kl_hist_loss_torch(preds, TARGET_PDF_T, BIN_CENTERS_T, SIGMA_BIN)
            loss = loss_mse + LAMBDA_KL * loss_kl
            loss.backward()
            # Clip the global gradient norm to 1.0 before stepping.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()

            # update EMA shadow (CPU): shadow = decay * shadow + (1 - decay) * current
            ema_n += 1
            with torch.no_grad():
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        s = ema_shadow[name]
                        cur = param.detach().cpu()
                        ema_shadow[name] = EMA_DECAY * s + (1.0 - EMA_DECAY) * cur

            # Running sums are weighted by batch size so the epoch averages are per-sample.
            total_loss += float(loss.item()) * xb.size(0)
            total_mse += float(loss_mse.item()) * xb.size(0)
            total_kl += float(loss_kl.item()) * xb.size(0)
            ns += xb.size(0)

        # The LR schedule advances once per epoch.
        scheduler.step()

        # update SWA (only for epochs after swa_start)
        if ep > swa_start:
            swa_model.update_parameters(model)
            swa_count += 1

        # validation using current (non-SWA) model
        model.eval()
        preds_val_list = []
        with torch.no_grad():
            for xb, yb in val_dl:
                xb = xb.to(DEVICE)
                p = model(xb).detach().cpu().numpy()
                preds_val_list.append(p)
        preds_val = np.concatenate(preds_val_list, axis=0)
        val_rmse = np.sqrt(mean_squared_error(y_val, preds_val))

        avg_loss = total_loss / ns
        print(f"Epoch {ep}/{EPOCHS} | loss={avg_loss:.5f} (mse={total_mse/ns:.5f}, kl={total_kl/ns:.5f}) | val_RMSE={val_rmse:.4f} | time={time.time()-t0:.1f}s")

        # early save best non-SWA model by val_rmse
        if val_rmse + 1e-8 < best_val_rmse:
            best_val_rmse = val_rmse
            best_state = {k:v.cpu().clone() for k,v in model.state_dict().items()}
            patience = 0
        else:
            patience += 1
            # Stop after EARLY_STOP epochs without improvement, but never before epoch 7.
            # This can end training before swa_start, leaving swa_count at 0.
            if patience >= EARLY_STOP and ep > 6:
                print("Early stopping triggered")
                break

    print(f"Fold {fold} finished. best_val_rmse={best_val_rmse:.4f} swa_count={swa_count}")

    # choose final eval model: SWA if available else best_state
    # NOTE: SWA weights are preferred whenever at least one SWA update happened;
    # their validation RMSE is not compared against best_val_rmse.
    if swa_count > 0:
        eval_model = RobustMLP(in_dim=X.shape[1]).to(DEVICE)
        swa_state = swa_model.module.state_dict() if hasattr(swa_model, "module") else swa_model.state_dict()
        eval_model.load_state_dict(swa_state)
    else:
        eval_model = RobustMLP(in_dim=X.shape[1]).to(DEVICE)
        eval_model.load_state_dict(best_state)

    # Also build EMA model by copying shadow params
    # (it is saved to disk below but not used for the OOF/test predictions)
    ema_model = RobustMLP(in_dim=X.shape[1]).to(DEVICE)
    ema_state = ema_model.state_dict()
    for n in ema_state.keys():
        if n in ema_shadow:
            ema_state[n] = ema_shadow[n].to(DEVICE)
    ema_model.load_state_dict(ema_state)

    # Use eval_model for OOF predictions (you can try ema_model instead)
    eval_model.eval()
    val_preds_list = []
    with torch.no_grad():
        for xb, yb in val_dl:
            xb = xb.to(DEVICE)
            p = eval_model(xb).detach().cpu().numpy()
            val_preds_list.append(p)
    val_preds = np.concatenate(val_preds_list, axis=0)
    oof[val_idx] = val_preds

    # test preds by fold
    test_fold_list = []
    with torch.no_grad():
        for xb in test_dl:
            xb = xb.to(DEVICE)
            p = eval_model(xb).detach().cpu().numpy()
            test_fold_list.append(p)
    test_fold_preds = np.concatenate(test_fold_list, axis=0)
    test_preds_folds[fold] = test_fold_preds

    # save fold models
    torch.save(eval_model.state_dict(), f"robustmlp_fold{fold}.pt")
    torch.save(ema_model.state_dict(), f"robustmlp_ema_fold{fold}.pt")
    print(f"Saved robustmlp_fold{fold}.pt and ema version.")

    fold += 1

# ----------------- OOF eval + calibration -----------------
# Score the stitched out-of-fold predictions against the targets.
oof_rmse_raw = np.sqrt(mean_squared_error(y, oof))
print("\nOOF RMSE (raw):", oof_rmse_raw)

# Linear calibration (prediction -> target), fitted on the OOF predictions
lr_cal = LinearRegression()
lr_cal.fit(oof.reshape(-1, 1), y)
oof_cal = lr_cal.predict(oof.reshape(-1, 1))
oof_rmse_cal = np.sqrt(mean_squared_error(y, oof_cal))
print("OOF RMSE (calibrated):", oof_rmse_cal)
print("Calibration params: a=", lr_cal.coef_[0], "b=", lr_cal.intercept_)

# ----------------- Quantile / histogram post-processing (map test preds to target) -----------------
# Draw bin centres with probability TARGET_PDF; the empirical quantiles of this
# sample serve as the target quantiles for the mapping below.
n_sample_map = 100_000
sampled_vals = np.random.choice(BIN_CENTERS, size=n_sample_map, p=TARGET_PDF)

def quantile_match_array(preds_raw, sampled_vals):
    """Replace each prediction by the reference value at the same quantile.

    Args:
        preds_raw: numpy array of shape (N,) with raw predictions.
        sampled_vals: numpy array of reference samples defining the target
            distribution.

    Returns:
        Array of shape (N,): element i is the q_i-quantile of `sampled_vals`,
        where q_i = rank(preds_raw[i]) / (N - 1) and ranks start at 0.
        Tied predictions receive distinct ranks (in argsort order), so they
        can map to different values.
    """
    n = len(preds_raw)
    # The inverse of the sorting permutation is the rank of every element.
    ranks = np.argsort(np.argsort(preds_raw))
    # The 1e-12 avoids 0/0 when there is a single prediction.
    q = ranks.astype(np.float32) / (n - 1 + 1e-12)
    return np.quantile(sampled_vals, q)

# Build final test prediction: average fold predictions -> calibrate -> quantile map -> clip
# Ensemble the folds, then apply the OOF-fitted linear calibration.
test_raw_mean = np.mean(test_preds_folds, axis=0)
test_cal = lr_cal.predict(test_raw_mean.reshape(-1, 1)).ravel()

# Apply quantile mapping (fold-level mapping can be done but here use global)
test_quant_mapped = quantile_match_array(test_cal, sampled_vals)
test_final = np.clip(test_quant_mapped, 0.0, 10.0)

# ----------------- Save submission -----------------
# Reload the test frame only if an earlier cell has not left it in memory.
if "test_df" not in globals():
    import json
    with open("test_data.json","r",encoding="utf8") as f:
        test_raw = json.load(f)
    test_df = pd.DataFrame(test_raw)

# IDs are assigned positionally, starting at 1.
test_df["ID"] = np.arange(1, len(test_final)+1)
sub = pd.DataFrame({"ID": test_df["ID"], "score": test_final})
sub.to_csv("submission_histKL_swa_ema_quantmap.csv", index=False)
print("\nSaved submission_histKL_swa_ema_quantmap.csv")

print("\nFinal stats:")
print("OOF RMSE raw:", oof_rmse_raw)
print("OOF RMSE calibrated:", oof_rmse_cal)


END ...