In [5]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import os
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from collections import Counter
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Create output directory
output_dir = Path('/kaggle/working/output')
output_dir.mkdir(parents=True, exist_ok=True)

# Load datasets
with open('/kaggle/input/da5401-2025-data-challenge/train_data.json', 'r') as f:
    train_data = json.load(f)

with open('/kaggle/input/da5401-2025-data-challenge/test_data.json', 'r') as f:
    test_data = json.load(f)

with open('/kaggle/input/da5401-2025-data-challenge/metric_names.json', 'r') as f:
    metric_names_list = json.load(f)

metric_embeddings = np.load('/kaggle/input/da5401-2025-data-challenge/metric_name_embeddings.npy')
sample_submission = pd.read_csv('//kaggle/input/da5401-2025-data-challenge/sample_submission.csv')

# Convert to DataFrames
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"✓ Training samples: {len(train_df)}")
print(f"✓ Test samples: {len(test_df)}")
print(f"✓ Metric embeddings shape: {metric_embeddings.shape}")
print(f"✓ Unique metrics: {train_df['metric_name'].nunique()}")

✓ Training samples: 5000
✓ Test samples: 3638
✓ Metric embeddings shape: (145, 768)
✓ Unique metrics: 145


In [6]:
print("\nANALYZING SCORE DISTRIBUTION")
print("="*80)
# Convert scores to numeric
train_df['score'] = pd.to_numeric(train_df['score'])

score_counts = Counter(train_df['score'])
print("\nScore Distribution:")
for score in sorted(score_counts.keys()):
    count = score_counts[score]
    percentage = (count / len(train_df)) * 100
    bar = '█' * int(percentage / 2)
    print(f"Score {int(score):2d}: {count:5d} ({percentage:5.1f}%) {bar}")

print(f"\nMean: {train_df['score'].mean():.2f}")
print(f"Median: {train_df['score'].median():.2f}")
print(f"Std: {train_df['score'].std():.2f}")

# Create sample weights to handle skew
score_weights = len(train_df) / (len(score_counts) * train_df['score'].value_counts())
train_df['sample_weight'] = train_df['score'].map(score_weights)

print(f"\n✓ Sample weights calculated (min: {train_df['sample_weight'].min():.3f}, max: {train_df['sample_weight'].max():.3f})")



ANALYZING SCORE DISTRIBUTION

Score Distribution:
Score  0:    13 (  0.3%) 
Score  1:     6 (  0.1%) 
Score  2:     5 (  0.1%) 
Score  3:     7 (  0.1%) 
Score  4:     3 (  0.1%) 
Score  5:     1 (  0.0%) 
Score  6:    45 (  0.9%) 
Score  7:    95 (  1.9%) 
Score  8:   259 (  5.2%) ██
Score  9:  3123 ( 62.5%) ███████████████████████████████
Score  9:     1 (  0.0%) 
Score 10:  1442 ( 28.8%) ██████████████

Mean: 9.12
Median: 9.00
Std: 0.94

✓ Sample weights calculated (min: 0.133, max: 416.667)


In [7]:
print("CREATING EMBEDDINGS WITH l3cube-pune/indic-sentence-similarity-sbert")
print("="*80)

# Load Indic Sentence Similarity model
print("\nl3cube-pune/indic-sentence-similarity-sbert model")
embedding_model = SentenceTransformer(
    "l3cube-pune/indic-sentence-similarity-sbert"
)

print("✓ Model loaded successfully")
print(f"  Model dimension: {embedding_model.get_sentence_embedding_dimension()}")
print(f"  Max sequence length: {embedding_model.max_seq_length}")

def encode_long_text_with_sliding_window(
    texts,
    model,
    max_length=512,
    stride=256,
    batch_size=32,
    show_progress=True,
    normalize=True
):

    from tqdm.auto import tqdm
    import torch

    tokenizer = model.tokenizer
    all_embeddings = []

    texts_iter = tqdm(texts, desc="Encoding texts") if show_progress else texts

    for idx, text in enumerate(texts_iter):
        try:
            if text is None or (isinstance(text, float) and np.isnan(text)):
                text = ""
            elif not isinstance(text, str):
                text = str(text)
                
            text = text.strip()

            if len(text) == 0:
                # Create zero embedding for empty text
                embedding_dim = model.get_sentence_embedding_dimension()
                embedding = np.zeros(embedding_dim)
                all_embeddings.append(embedding)
                continue

            # Tokenize the full text with truncation disabled
            tokens = tokenizer.encode(
                text,
                add_special_tokens=False,
                truncation=False
            )

            # If text is short enough, encode directly
            if len(tokens) <= max_length - 2:  
                embedding = model.encode(
                    [text],
                    batch_size=1,
                    show_progress_bar=False,
                    convert_to_numpy=True,
                    normalize_embeddings=normalize
                )[0]
                all_embeddings.append(embedding)
                continue

            # For long texts, use sliding window
            chunk_embeddings = []
            start_idx = 0

            while start_idx < len(tokens):
                # Extract chunk of tokens
                end_idx = min(start_idx + max_length - 2, len(tokens))
                chunk_tokens = tokens[start_idx:end_idx]

                # Decode back to text
                chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)

                # Encode chunk
                chunk_emb = model.encode(
                    [chunk_text],
                    batch_size=1,
                    show_progress_bar=False,
                    convert_to_numpy=True,
                    normalize_embeddings=False  
                )[0]

                chunk_embeddings.append(chunk_emb)

                # Move window
                if end_idx == len(tokens):
                    break
                start_idx += stride

            # Average all chunk embeddings
            if len(chunk_embeddings) > 0:
                final_embedding = np.mean(chunk_embeddings, axis=0)

                # Normalize if requested
                if normalize:
                    norm = np.linalg.norm(final_embedding)
                    if norm > 0:
                        final_embedding = final_embedding / norm

                all_embeddings.append(final_embedding)
            else:
                # Fallback- create zero embedding
                embedding_dim = model.get_sentence_embedding_dimension()
                all_embeddings.append(np.zeros(embedding_dim))

        except Exception as e:
            print(f"\n⚠️  Error encoding text at index {idx}: {str(e)}")
            print(f"   Text preview: {str(text)[:100]}...")
            # Create zero embedding as fallback
            embedding_dim = model.get_sentence_embedding_dimension()
            all_embeddings.append(np.zeros(embedding_dim))
            continue

    return np.array(all_embeddings)

# Handle NULL values
train_df['system_prompt'] = train_df['system_prompt'].fillna("").astype(str)
test_df['system_prompt'] = test_df['system_prompt'].fillna("").astype(str)

if 'user_prompt' in train_df.columns:
    train_df['prompt'] = train_df['user_prompt'].fillna("").astype(str)
    test_df['prompt'] = test_df['user_prompt'].fillna("").astype(str)
else:
    train_df['prompt'] = train_df['prompt'].fillna("").astype(str)
    test_df['prompt'] = test_df['prompt'].fillna("").astype(str)

train_df['response'] = train_df['response'].fillna("").astype(str)
test_df['response'] = test_df['response'].fillna("").astype(str)

# ENCODE TRAINING DATA WITH SLIDING WINDOW

print("\nEncoding training data with sliding window")
print("Prompts")
train_prompt_embp = encode_long_text_with_sliding_window(
    train_df['prompt'].tolist(),
    embedding_model,
    max_length=510,  # Slightly reduced to account for special tokens
    stride=256,
    batch_size=32,
    show_progress=True,
    normalize=True
)

print("System prompts")
train_system_embp = encode_long_text_with_sliding_window(
    train_df['system_prompt'].tolist(),
    embedding_model,
    max_length=510,
    stride=256,
    batch_size=32,
    show_progress=True,
    normalize=True
)

print("Responses")
train_response_embp = encode_long_text_with_sliding_window(
    train_df['response'].tolist(),
    embedding_model,
    max_length=510,
    stride=256,
    batch_size=32,
    show_progress=True,
    normalize=True
)

print("\nEncoding test data with sliding window")
print("Prompts")
test_prompt_embp = encode_long_text_with_sliding_window(
    test_df['prompt'].tolist(),
    embedding_model,
    max_length=510,
    stride=256,
    batch_size=32,
    show_progress=True,
    normalize=True
)

print("System prompts")
test_system_embp = encode_long_text_with_sliding_window(
    test_df['system_prompt'].tolist(),
    embedding_model,
    max_length=510,
    stride=256,
    batch_size=32,
    show_progress=True,
    normalize=True
)

print("Responses")
test_response_embp = encode_long_text_with_sliding_window(
    test_df['response'].tolist(),
    embedding_model,
    max_length=510,
    stride=256,
    batch_size=32,
    show_progress=True,
    normalize=True
)

np.save(output_dir / 'train_prompt_embp.npy', train_prompt_embp)
np.save(output_dir / 'train_system_embp.npy', train_system_embp)
np.save(output_dir / 'train_response_embp.npy', train_response_embp)
np.save(output_dir / 'test_prompt_embp.npy', test_prompt_embp)
np.save(output_dir / 'test_system_embp.npy', test_system_embp)
np.save(output_dir / 'test_response_embp.npy', test_response_embp)

print(f"\n✓ All embeddings saved to {output_dir}")
print(f"\nEmbedding Shapes:")
print(f"  Train prompt: {train_prompt_embp.shape}")
print(f"  Train system: {train_system_embp.shape}")
print(f"  Train response: {train_response_embp.shape}")
print(f"  Test prompt: {test_prompt_embp.shape}")
print(f"  Test system: {test_system_embp.shape}")
print(f"  Test response: {test_response_embp.shape}")

print("\nEMBEDDING GENERATION COMPLETE")
print("="*80)

CREATING EMBEDDINGS WITH l3cube-pune/indic-sentence-similarity-sbert

l3cube-pune/indic-sentence-similarity-sbert model
✓ Model loaded successfully
  Model dimension: 768
  Max sequence length: 512

Encoding training data with sliding window
Prompts


Encoding texts:   0%|          | 0/5000 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (726 > 512). Running this sequence through the model will result in indexing errors


System prompts


Encoding texts:   0%|          | 0/5000 [00:00<?, ?it/s]

Responses


Encoding texts:   0%|          | 0/5000 [00:00<?, ?it/s]


Encoding test data with sliding window
Prompts


Encoding texts:   0%|          | 0/3638 [00:00<?, ?it/s]

System prompts


Encoding texts:   0%|          | 0/3638 [00:00<?, ?it/s]

Responses


Encoding texts:   0%|          | 0/3638 [00:00<?, ?it/s]


✓ All embeddings saved to /kaggle/working/output

Embedding Shapes:
  Train prompt: (5000, 768)
  Train system: (5000, 768)
  Train response: (5000, 768)
  Test prompt: (3638, 768)
  Test system: (3638, 768)
  Test response: (3638, 768)

EMBEDDING GENERATION COMPLETE


In [8]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from collections import defaultdict
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

print("ORDINAL + XGBOOST ENSEMBLE")
print("="*80)

train_prompt_emb = np.load('/kaggle/input/puneemb/train_prompt_embp.npy')
train_system_emb = np.load('/kaggle/input/puneemb/train_system_embp.npy')
train_response_emb = np.load('/kaggle/input/puneemb/train_response_embp.npy')
test_prompt_emb = np.load('/kaggle/input/puneemb/test_prompt_embp.npy')
test_system_emb = np.load('/kaggle/input/puneemb/test_system_embp.npy')
test_response_emb = np.load('/kaggle/input/puneemb/test_response_embp.npy')

metric_embeddings_norm = metric_embeddings / np.linalg.norm(metric_embeddings, axis=1, keepdims=True)
metric_to_idx = {name: idx for idx, name in enumerate(metric_names_list)}

print("\nDATA AUGMENTATION")
print("="*80)

def generate_comprehensive_negatives(df, prompt_emb, system_emb, response_emb,
                                     num_negatives_per_positive):
    metric_groups = defaultdict(list)
    for idx, row in df.iterrows():
        metric_groups[row['metric_name']].append(idx)

    augmented_data = []
    augmented_prompt_emb = []
    augmented_system_emb = []
    augmented_response_emb = []

    for idx, row in df.iterrows():
        score = row['score']

        if score <= 3:
            replication_factor = 20
        elif score <= 5:
            replication_factor = 10
        elif score <= 7:
            replication_factor = 3
        else:
            replication_factor = 1

        for _ in range(replication_factor):
            augmented_data.append({
                'metric_name': row['metric_name'],
                'prompt': row['prompt'],
                'system_prompt': row['system_prompt'],
                'response': row['response'],
                'score': row['score'],
                'is_synthetic': False,
                'original_idx': idx
            })
            augmented_prompt_emb.append(prompt_emb[idx])
            augmented_system_emb.append(system_emb[idx])
            augmented_response_emb.append(response_emb[idx])

    high_score_indices = df[df['score'] >= 7].index.tolist()

    for high_idx in high_score_indices:
        high_row = df.loc[high_idx]
        high_metric = high_row['metric_name']

        other_metrics = [m for m in metric_groups.keys() if m != high_metric]
        if len(other_metrics) == 0:
            continue

        for _ in range(num_negatives_per_positive):
            neg_metric = np.random.choice(other_metrics)
            neg_idx = np.random.choice(metric_groups[neg_metric])
            neg_row = df.loc[neg_idx]

            synthetic_score = np.random.randint(0, 3)

            augmented_data.append({
                'metric_name': high_metric,
                'prompt': neg_row['prompt'],
                'system_prompt': neg_row['system_prompt'],
                'response': neg_row['response'],
                'score': synthetic_score,
                'is_synthetic': True,
                'original_idx': high_idx
            })

            augmented_prompt_emb.append(prompt_emb[neg_idx])
            augmented_system_emb.append(system_emb[neg_idx])
            augmented_response_emb.append(response_emb[neg_idx])

    augmented_df = pd.DataFrame(augmented_data)

    print(f"Augmented: {len(augmented_df)} samples")
    print(f"  Synthetic: {augmented_df['is_synthetic'].sum()}")

    return (augmented_df,
            np.array(augmented_prompt_emb),
            np.array(augmented_system_emb),
            np.array(augmented_response_emb))

train_aug_df, train_aug_prompt_emb, train_aug_system_emb, train_aug_response_emb = \
    generate_comprehensive_negatives(train_df, train_prompt_emb, train_system_emb,
                                    train_response_emb, num_negatives_per_positive=13)

ORDINAL + XGBOOST ENSEMBLE

DATA AUGMENTATION
Augmented: 69865 samples
  Synthetic: 63960


In [9]:
print("ORDINAL REGRESSION MODEL")
print("="*80)

class OrdinalRegressionDataset(Dataset):
    def __init__(self, metric_emb, prompt_emb, system_emb, response_emb, scores, weights=None):
        self.metric_emb = torch.FloatTensor(metric_emb)
        self.prompt_emb = torch.FloatTensor(prompt_emb)
        self.system_emb = torch.FloatTensor(system_emb)
        self.response_emb = torch.FloatTensor(response_emb)
        self.scores = torch.LongTensor(scores.astype(int))
        self.weights = torch.FloatTensor(weights) if weights is not None else torch.ones(len(scores))

    def __len__(self):
        return len(self.scores)

    def __getitem__(self, idx):
        return (self.metric_emb[idx], self.prompt_emb[idx],
                self.system_emb[idx], self.response_emb[idx],
                self.scores[idx], self.weights[idx])

class OrdinalWithThresholds(nn.Module):
    def __init__(self, input_dim=768, num_classes=11):
        super().__init__()

        self.feature_net = nn.Sequential(
            nn.Linear(input_dim * 4 + 10, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1)
        )

        initial_thresholds = torch.linspace(0.5, 9.5, num_classes - 1)
        self.thresholds = nn.Parameter(initial_thresholds)

    def forward(self, metric_emb, prompt_emb, system_emb, response_emb):
        sims = torch.stack([
            (metric_emb * prompt_emb).sum(dim=1),
            (metric_emb * system_emb).sum(dim=1),
            (metric_emb * response_emb).sum(dim=1),
            (prompt_emb * response_emb).sum(dim=1),
            (prompt_emb * system_emb).sum(dim=1),
            (system_emb * response_emb).sum(dim=1),
            (metric_emb * (prompt_emb + response_emb) / 2).sum(dim=1),
            (metric_emb * (prompt_emb + system_emb + response_emb) / 3).sum(dim=1),
            ((prompt_emb + response_emb) * metric_emb).sum(dim=1),
            ((system_emb + response_emb) * metric_emb).sum(dim=1)
        ], dim=1)

        combined = torch.cat([metric_emb, prompt_emb, system_emb, response_emb, sims], dim=1)
        continuous_score = self.feature_net(combined).squeeze()

        return continuous_score

    def get_class_probabilities(self, continuous_score):
        sorted_thresholds = torch.sort(self.thresholds)[0]
        cumulative_probs = torch.sigmoid(continuous_score.unsqueeze(1) - sorted_thresholds.unsqueeze(0))

        num_classes = len(sorted_thresholds) + 1
        class_probs = torch.zeros(continuous_score.size(0), num_classes, device=continuous_score.device)

        class_probs[:, 0] = 1 - cumulative_probs[:, 0]
        for i in range(1, num_classes - 1):
            class_probs[:, i] = cumulative_probs[:, i-1] - cumulative_probs[:, i]
        class_probs[:, -1] = cumulative_probs[:, -1]

        return class_probs

def ordinal_loss_with_weights(continuous_scores, targets, thresholds, weights):
    sorted_thresholds = torch.sort(thresholds)[0]
    batch_size = continuous_scores.size(0)
    num_thresholds = len(sorted_thresholds)

    cumulative_labels = torch.zeros(batch_size, num_thresholds, device=continuous_scores.device)
    for i, target in enumerate(targets):
        cumulative_labels[i, :target] = 1

    cumulative_probs = torch.sigmoid(continuous_scores.unsqueeze(1) - sorted_thresholds.unsqueeze(0))
    bce = F.binary_cross_entropy(cumulative_probs, cumulative_labels, reduction='none')
    weighted_bce = bce * weights.unsqueeze(1)

    return weighted_bce.mean()

def train_ordinal_model(train_dataset, val_dataset, epochs=25):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = OrdinalWithThresholds().to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2)

    train_scores = train_dataset.scores.numpy()
    score_counts = np.bincount(train_scores, minlength=11)
    score_weights = 1.0 / (score_counts + 1)
    sample_weights = score_weights[train_scores]
    sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)

    train_loader = DataLoader(train_dataset, batch_size=128, sampler=sampler)
    val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)

    best_rmse = float('inf')
    best_model = None
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for batch in train_loader:
            metric_emb, prompt_emb, system_emb, response_emb, targets, weights = [b.to(device) for b in batch]

            optimizer.zero_grad()
            continuous_scores = model(metric_emb, prompt_emb, system_emb, response_emb)
            loss = ordinal_loss_with_weights(continuous_scores, targets, model.thresholds, weights)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            train_loss += loss.item()

        model.eval()
        val_preds = []
        val_targets = []

        with torch.no_grad():
            for batch in val_loader:
                metric_emb, prompt_emb, system_emb, response_emb, targets, _ = [b.to(device) for b in batch]
                continuous_scores = model(metric_emb, prompt_emb, system_emb, response_emb)
                class_probs = model.get_class_probabilities(continuous_scores)
                pred_scores = (class_probs * torch.arange(11, device=device).float()).sum(dim=1)

                val_preds.extend(pred_scores.cpu().numpy())
                val_targets.extend(targets.cpu().numpy())

        val_preds = np.clip(np.round(val_preds), 0, 10)
        val_rmse = np.sqrt(mean_squared_error(val_targets, val_preds))

        if val_rmse < best_rmse:
            best_rmse = val_rmse
            best_model = model.state_dict()
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= 10:
                break

        scheduler.step()

    model.load_state_dict(best_model)
    return model

# Prepare data
train_aug_metric_embs = np.array([
    metric_embeddings_norm[metric_to_idx[row['metric_name']]]
    if row['metric_name'] in metric_to_idx
    else np.mean(metric_embeddings_norm, axis=0)
    for _, row in train_aug_df.iterrows()
])

test_metric_embs = np.array([
    metric_embeddings_norm[metric_to_idx[row['metric_name']]]
    if row['metric_name'] in metric_to_idx
    else np.mean(metric_embeddings_norm, axis=0)
    for _, row in test_df.iterrows()
])

score_counts = train_aug_df['score'].value_counts()
max_count = score_counts.max()
sample_weights = train_aug_df['score'].map(lambda x: max_count / score_counts[x]).values

# CV setup
gkf = GroupKFold(n_splits=5)
original_groups = np.array([
    train_df.loc[row['original_idx'], 'metric_name'] if not row['is_synthetic']
    else f"synthetic_{row['original_idx']}"
    for _, row in train_aug_df.iterrows()
])

ordinal_models = []
ordinal_oof = np.zeros(len(train_df))
ordinal_test = np.zeros(len(test_df))

# Train ordinal models
for fold, (train_idx, val_idx) in enumerate(gkf.split(train_aug_metric_embs,
                                                       train_aug_df['score'],
                                                       original_groups), 1):
    print(f"\nOrdinal Fold {fold}/5")

    val_original_mask = ~train_aug_df.iloc[val_idx]['is_synthetic'].values
    val_original_indices = train_aug_df.iloc[val_idx][val_original_mask]['original_idx'].values

    train_dataset = OrdinalRegressionDataset(
        train_aug_metric_embs[train_idx], train_aug_prompt_emb[train_idx],
        train_aug_system_emb[train_idx], train_aug_response_emb[train_idx],
        train_aug_df['score'].values[train_idx], sample_weights[train_idx]
    )

    val_dataset = OrdinalRegressionDataset(
        train_aug_metric_embs[val_idx][val_original_mask],
        train_aug_prompt_emb[val_idx][val_original_mask],
        train_aug_system_emb[val_idx][val_original_mask],
        train_aug_response_emb[val_idx][val_original_mask],
        train_aug_df['score'].values[val_idx][val_original_mask],
        sample_weights[val_idx][val_original_mask]
    )

    model = train_ordinal_model(train_dataset, val_dataset, epochs=25)
    ordinal_models.append(model)

    # OOF
    model.eval()
    device = next(model.parameters()).device
    val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)

    val_preds = []
    with torch.no_grad():
        for batch in val_loader:
            metric_emb, prompt_emb, system_emb, response_emb, _, _ = [b.to(device) for b in batch]
            continuous_scores = model(metric_emb, prompt_emb, system_emb, response_emb)
            class_probs = model.get_class_probabilities(continuous_scores)
            pred_scores = (class_probs * torch.arange(11, device=device).float()).sum(dim=1)
            val_preds.extend(pred_scores.cpu().numpy())

    ordinal_oof[val_original_indices] = np.array(val_preds)

    # Test
    test_dataset = OrdinalRegressionDataset(
        test_metric_embs, test_prompt_emb, test_system_emb, test_response_emb,
        np.zeros(len(test_df)), np.ones(len(test_df))
    )
    test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

    test_preds = []
    with torch.no_grad():
        for batch in test_loader:
            metric_emb, prompt_emb, system_emb, response_emb, _, _ = [b.to(device) for b in batch]
            continuous_scores = model(metric_emb, prompt_emb, system_emb, response_emb)
            class_probs = model.get_class_probabilities(continuous_scores)
            pred_scores = (class_probs * torch.arange(11, device=device).float()).sum(dim=1)
            test_preds.extend(pred_scores.cpu().numpy())

    ordinal_test += np.array(test_preds) / 5

ordinal_oof_rounded = np.clip(np.round(ordinal_oof), 0, 10)
ordinal_rmse = np.sqrt(mean_squared_error(train_df['score'], ordinal_oof_rounded))
print(f"\n✓ Ordinal OOF RMSE: {ordinal_rmse:.4f}")

# ADD XGBOOST ENSEMBLE

print("\nADDING XGBOOST ENSEMBLE")
print("="*80)

# Create simple features for XGBoost
X_train_aug = np.concatenate([
    train_aug_metric_embs,
    train_aug_prompt_emb,
    train_aug_response_emb,
    (train_aug_metric_embs * train_aug_prompt_emb),
    (train_aug_metric_embs * train_aug_response_emb),
    (train_aug_prompt_emb * train_aug_response_emb)
], axis=1)

X_test = np.concatenate([
    test_metric_embs,
    test_prompt_emb,
    test_response_emb,
    (test_metric_embs * test_prompt_emb),
    (test_metric_embs * test_response_emb),
    (test_prompt_emb * test_response_emb)
], axis=1)

xgb_oof = np.zeros(len(train_df))
xgb_test = np.zeros(len(test_df))

for fold, (train_idx, val_idx) in enumerate(gkf.split(train_aug_metric_embs,
                                                       train_aug_df['score'],
                                                       original_groups), 1):
    print(f"XGBoost Fold {fold}/5", end=' ')

    val_original_mask = ~train_aug_df.iloc[val_idx]['is_synthetic'].values
    val_original_indices = train_aug_df.iloc[val_idx][val_original_mask]['original_idx'].values

    dtrain = xgb.DMatrix(X_train_aug[train_idx],
                        label=train_aug_df['score'].values[train_idx],
                        weight=sample_weights[train_idx])
    dval = xgb.DMatrix(X_train_aug[val_idx][val_original_mask],
                      label=train_aug_df['score'].values[val_idx][val_original_mask])
    dtest = xgb.DMatrix(X_test)

    params = {
        'objective': 'reg:squarederror',
        'max_depth': 6,
        'learning_rate': 0.02,
        'subsample': 0.85,
        'colsample_bytree': 0.85,
        'reg_alpha': 0.1,
        'reg_lambda': 1.5,
        'random_state': 42
    }

    model = xgb.train(params, dtrain, num_boost_round=600,
                     evals=[(dval, 'val')], early_stopping_rounds=50, verbose_eval=False)

    xgb_oof[val_original_indices] = model.predict(dval)
    xgb_test += model.predict(dtest) / 5

    rmse = np.sqrt(mean_squared_error(
        train_df.loc[val_original_indices, 'score'],
        np.clip(np.round(model.predict(dval)), 0, 10)
    ))
    print(f"RMSE: {rmse:.4f}")

xgb_oof_rounded = np.clip(np.round(xgb_oof), 0, 10)
xgb_rmse = np.sqrt(mean_squared_error(train_df['score'], xgb_oof_rounded))
print(f"✓ XGBoost OOF RMSE: {xgb_rmse:.4f}")

# OPTIMAL ENSEMBLE WEIGHTING

print("\nFINDING OPTIMAL ENSEMBLE WEIGHT")
print("="*80)

best_rmse = float('inf')
best_weight = 0.5

for ordinal_weight in np.arange(0.5, 1.0, 0.05):
    xgb_weight = 1 - ordinal_weight
    ensemble_oof = ordinal_weight * ordinal_oof + xgb_weight * xgb_oof
    ensemble_oof_rounded = np.clip(np.round(ensemble_oof), 0, 10)
    rmse = np.sqrt(mean_squared_error(train_df['score'], ensemble_oof_rounded))

    if rmse < best_rmse:
        best_rmse = rmse
        best_weight = ordinal_weight

print(f"Optimal: {best_weight:.2f} Ordinal + {1-best_weight:.2f} XGBoost")
print(f"Ensemble OOF RMSE: {best_rmse:.4f}")
print(f"Improvement: {ordinal_rmse - best_rmse:.4f}")

# Final predictions
final_test = best_weight * ordinal_test + (1 - best_weight) * xgb_test
final_test = np.clip(np.round(final_test), 0, 10)


ORDINAL REGRESSION MODEL

Ordinal Fold 1/5

Ordinal Fold 2/5

Ordinal Fold 3/5

Ordinal Fold 4/5

Ordinal Fold 5/5

✓ Ordinal OOF RMSE: 4.0077

ADDING XGBOOST ENSEMBLE
XGBoost Fold 1/5 RMSE: 5.5887
XGBoost Fold 2/5 RMSE: 4.9165
XGBoost Fold 3/5 RMSE: 4.9857
XGBoost Fold 4/5 RMSE: 4.7968
XGBoost Fold 5/5 RMSE: 5.3610
✓ XGBoost OOF RMSE: 5.4601

FINDING OPTIMAL ENSEMBLE WEIGHT
Optimal: 0.95 Ordinal + 0.05 XGBoost
Ensemble OOF RMSE: 4.0262
Improvement: -0.0185


In [11]:
submission = sample_submission.copy()
submission['score'] = final_test
submission.to_csv('/kaggle/working/output/submission_ensemble.csv', index=False)

print("\n✓ Submission created")
print(f"\nTest mean: {submission['score'].mean():.2f}")
print(f"\nDistribution:")
for score in sorted(submission['score'].unique()):
    count = (submission['score'] == score).sum()
    pct = count / len(submission) * 100
    bar = '█' * int(pct / 2)
    print(f"Score {int(score):2d}: {count:4d} ({pct:5.1f}%) {bar}")

print("\nCOMPLETE")
print("="*80)



✓ Submission created

Test mean: 6.20

Distribution:
Score  0:   79 (  2.2%) █
Score  1:  337 (  9.3%) ████
Score  2:  209 (  5.7%) ██
Score  3:  164 (  4.5%) ██
Score  4:  150 (  4.1%) ██
Score  5:  152 (  4.2%) ██
Score  6:  261 (  7.2%) ███
Score  7:  633 ( 17.4%) ████████
Score  8:  975 ( 26.8%) █████████████
Score  9:  613 ( 16.8%) ████████
Score 10:   65 (  1.8%) 

COMPLETE
