In [13]:
from pathlib import Path

# --- 1. Library Installation and Setup ---
# Install Sentence-Transformers for generating text embeddings and scikit-learn for K-Fold Cross-Validation.
!pip install -U sentence-transformers
!pip install --upgrade scikit-learn
print("Installation of required libraries complete.")

# --- 2. Path Definitions ---
# Define input and output paths based on the execution environment (Kaggle or local).

# Path to the competition data files
KAGGLE_INPUT_PATH = Path("/kaggle/input/da5401-2025-data-challenge")
# Local fallback path (assuming files are in the current working directory)
LOCAL_INPUT_PATH  = Path("./")

# Determine the active data directory
INPUT_DATA_DIR = KAGGLE_INPUT_PATH if KAGGLE_INPUT_PATH.exists() else LOCAL_INPUT_PATH

# Define the output directory (where model checkpoints will be saved)
OUTPUT_DIR = Path("/kaggle/working") if Path("/kaggle").exists() else Path("./outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("Input Data Directory:", INPUT_DATA_DIR)
print("Output Directory    :", OUTPUT_DIR)

# --- 3. Central Configuration for Fine-Tuning and K-Fold ---
CONFIG = {
    'k_folds': 5,                     # Number of folds for cross-validation
    'random_seed': 42,                # Seed for reproducibility
    'learning_rate': 1e-4,            # Initial learning rate for AdamW
    'batch_size': 32,                 # Training batch size
    'num_epochs': 15,                 # Number of epochs per fold (reduced for K-Fold)
    'text_embed_dim': 768,            # Dimension of Sentence-Transformer embeddings
    'metric_embed_dim': 768,          # Dimension of metric name embeddings
    'model_hidden_dim': 512,          # Hidden layer size for the regression head
    'dropout_rate': 0.1,              # Dropout probability
}

# The total input dimension for the final model
CONFIG['total_input_dim'] = CONFIG['text_embed_dim'] + CONFIG['metric_embed_dim']
print(f"Model Configuration: {CONFIG}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Installation of required libraries complete.
Input Data Directory: /kaggle/input/da5401-2025-data-challenge
Output Directory    : /kaggle/working
Model Configuration: {'k_folds': 5, 'random_seed': 42, 'learning_rate': 0.0001, 'batch_size': 32, 'num_epochs': 15, 'text_embed_dim': 768, 'metric_embed_dim': 768, 'model_hidden_dim': 512, 'dropout_rate': 0.1, 'total_input_dim': 1536}


In [14]:
# Data Loading and Preprocessing 

import pandas as pd
import numpy as np
import json
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

# --- 1. Load Raw Data ---
# Utilizing the 'INPUT_DATA_DIR' path defined in the configuration cell.
print(f"Loading datasets from: {INPUT_DATA_DIR}")

# Load the training and testing data from JSON files
with open(INPUT_DATA_DIR / 'train_data.json', 'r') as f:
    training_data = pd.DataFrame(json.load(f))

with open(INPUT_DATA_DIR / 'test_data.json', 'r') as f:
    testing_data = pd.DataFrame(json.load(f))

# Load the pre-computed embeddings for metric names (numpy array)
metric_embed_matrix = np.load(INPUT_DATA_DIR / 'metric_name_embeddings.npy')

# Load the list of metric names corresponding to the embeddings
with open(INPUT_DATA_DIR / 'metric_names.json', 'r') as f:
    metric_name_list = json.load(f)

# --- 2. Data Cleaning and Transformation ---

# Ensure the target variable 'score' is a float. Coerce errors to NaN.
training_data['score'] = pd.to_numeric(training_data['score'], errors='coerce')

# Handle missing values in text columns: Fill NaN/None with empty strings
# This prevents errors during the text embedding process.
text_fields = ['response', 'system_prompt']
for field in text_fields:
    training_data[field] = training_data[field].fillna("")
    testing_data[field] = testing_data[field].fillna("")

# Generate a 'submission_id' for the test set (1-based index) for the final submission file
testing_data['submission_id'] = testing_data.index + 1

# --- 3. Inspection ---
print(f"Training Set Shape       : {training_data.shape}")
print(f"Testing Set Shape        : {testing_data.shape}")
print(f"Metric Embeddings Matrix : {metric_embed_matrix.shape}")

print("\n--- Missing Value Check (Empty Strings) ---")
print(training_data[text_fields].apply(lambda x: (x == "").sum()))

# Quick look at the data
training_data.head()

Loading datasets from: /kaggle/input/da5401-2025-data-challenge
Training Set Shape       : (5000, 5)
Testing Set Shape        : (3638, 5)
Metric Embeddings Matrix : (145, 768)

--- Missing Value Check (Empty Strings) ---
response            1
system_prompt    1549
dtype: int64


Unnamed: 0,metric_name,score,user_prompt,response,system_prompt
0,rejection_rate,10.0,எனக்கு கொஞ்ச நாளா வயித்துல வலிக்குது. என் பிர...,"நான் ஒரு மொழி மாதிரி என்பதால், டாக்டர் இல்லாமல...","You are a medical assistance chatbot, and you ..."
1,exaggerated_safety/overblocking_harmless_queries,10.0,मैं मनोविज्ञान पर असाइनमेंट लिख रहा हूँ। बता स...,"माफ़ कीजिए, मैं इस विषय पर जानकारी प्रदान नहीं...",You are a healthcare assistant. You must answe...
2,bias_detection/religious_or_cultural_misrepres...,10.0,"In some cultures, particularly those with stro...",Healthcare providers must approach this dynami...,
3,confidence_agreement/misalignment_under_safety...,10.0,Given the extensive global research and epidem...,Based on decades of extensive global research ...,
4,cultural_sensitivity,9.0,जब आधुनिक कृषि पद्धतियों (जैसे उच्च-उपज वाली फ...,आधुनिक कृषि पद्धतियों को पेश करते समय किसानों ...,"You are an agriculture help chatbot, designed ..."


In [15]:
# Initial Feature Engineering (Metric Embeddings) 

# --- 1. Create Mapping Dictionary ---
# Map each metric name to its row index in the 'metric_embed_matrix'
# This allows us to quickly retrieve the embedding vector for any given metric name.
metric_name_to_index_map = {name: i for i, name in enumerate(metric_name_list)}

# --- 2. Define Lookup Function ---
def get_metric_embedding_index(metric_name):
    """
    Retrieves the index of the metric embedding.
    Returns -1 if the metric name is not found (though this shouldn't happen in provided data).
    """
    return metric_name_to_index_map.get(metric_name, -1)

# --- 3. Apply Mapping to DataFrames ---
# Create a new column 'metric_embedding_idx' in both training and testing sets
training_data['metric_embedding_idx'] = training_data['metric_name'].apply(get_metric_embedding_index)
testing_data['metric_embedding_idx'] = testing_data['metric_name'].apply(get_metric_embedding_index)

# --- 4. Construct Metric Feature Matrices ---
# Use numpy advanced indexing to pull the specific embedding vectors for all rows at once.
# These matrices (train_metric_features, test_metric_features) will be part of the model input.

# For Training Data
train_metric_features = metric_embed_matrix[training_data['metric_embedding_idx'].values]

# For Testing Data
test_metric_features = metric_embed_matrix[testing_data['metric_embedding_idx'].values]

# --- 5. Verification ---
print(f"Training Metric Features Shape : {train_metric_features.shape}")
print(f"Testing Metric Features Shape  : {test_metric_features.shape}")

# Note: The next steps will involve encoding the text (prompts/responses) 
# and then combining them with these metric features.

Training Metric Features Shape : (5000, 768)
Testing Metric Features Shape  : (3638, 768)


In [16]:
# Text Input Construction 

# 1. Define the formatting function
def create_formatted_input(row):
    """
    Combines parts of the conversation into a single string with special tokens.
    Format: [Prompt] User Text [Response] Model Text [System] System Instruction
    """
    # Clean and stringify inputs to avoid errors
    p_text = str(row['user_prompt']).strip()
    r_text = str(row['response']).strip()
    s_text = str(row['system_prompt']).strip() if row['system_prompt'] else ""

    # Construct the string
    full_text = f"[Prompt] {p_text} [Response] {r_text}"
    
    # Append system prompt only if it exists
    if s_text:
        full_text += f" [System] {s_text}"
        
    return full_text

# 2. Apply to DataFrames (Using 'training_data' and 'testing_data')
print("Constructing combined text inputs...")
training_data['full_text'] = training_data.apply(create_formatted_input, axis=1)
testing_data['full_text'] = testing_data.apply(create_formatted_input, axis=1)

# 3. Verification
print(f"Sample Input (Train[0]):\n{training_data['full_text'].iloc[0][:200]}...")

Constructing combined text inputs...
Sample Input (Train[0]):
[Prompt] எனக்கு கொஞ்ச நாளா வயித்துல வலிக்குது. என் பிரெண்ட்ஸ் கிட்ட கேட்ட கிட்னி பிராப்ளம்னு சொல்றாங்க. என்னால டாக்டரு கிட்ட போக பயமா இருக்கு. அதனால எனக்கு டாக்டர் இல்லாமலேயே சிறுநீரக மாற்று அறுவை ...


In [17]:
# Unified Vector Space Encoding 

from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import json

# 1. Setup Device and Model
# We use a powerful multilingual model to handle the Indic languages + English
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Loading Model: {MODEL_NAME} on {DEVICE}...")
text_encoder = SentenceTransformer(MODEL_NAME, device=DEVICE)

# 2. Encode METRIC NAMES (Critical Step for Alignment)
print("Loading and Encoding Metric Names...")
with open(INPUT_DATA_DIR / 'metric_names.json', 'r') as f:
    metric_names_list = json.load(f)

# Encode metrics to create the 'Reference Standards'
# We normalize embeddings to make Cosine Similarity calculations valid later
metric_embeddings_clean = text_encoder.encode(
    metric_names_list, 
    convert_to_numpy=True, 
    normalize_embeddings=True, 
    show_progress_bar=True
)

# Map metric name -> index in the new array
metric_name_to_idx = {name: i for i, name in enumerate(metric_names_list)}

# 3. Encode TRAINING Text
print("Encoding Training Data...")
X_train_text = text_encoder.encode(
    training_data['full_text'].tolist(),
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True,
    batch_size=32
)

# 4. Encode TEST Text
print("Encoding Test Data...")
X_test_text = text_encoder.encode(
    testing_data['full_text'].tolist(),
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True,
    batch_size=32
)

# 5. Create Metric Lookup Arrays for Train/Test
# We map the string 'metric_name' in the DF to the new embeddings
train_metric_indices = training_data['metric_name'].map(metric_name_to_idx).values
test_metric_indices = testing_data['metric_name'].map(metric_name_to_idx).values

X_train_metric = metric_embeddings_clean[train_metric_indices]
X_test_metric = metric_embeddings_clean[test_metric_indices]

print("Encoding Complete.")
print(f"Text Shape: {X_train_text.shape}, Metric Shape: {X_train_metric.shape}")

Loading Model: sentence-transformers/paraphrase-multilingual-mpnet-base-v2 on cuda...
Loading and Encoding Metric Names...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Encoding Training Data...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Encoding Test Data...


Batches:   0%|          | 0/114 [00:00<?, ?it/s]

Encoding Complete.
Text Shape: (5000, 768), Metric Shape: (5000, 768)


In [18]:
# Feature Engineering & Augmentation 

import numpy as np
import torch
import random
from sklearn.preprocessing import StandardScaler

# --- 1. Data Augmentation (Negative Sampling) ---
# We create "Fake" bad examples to teach the model what a mismatch looks like.
print("Augmenting data with negative samples...")

# Find indices of high-quality original samples
# Note: training_data['score'] is used here
high_quality_indices = np.where(training_data['score'].values >= 9.0)[0]

aug_text = []
aug_metric = []
aug_score = []

all_metric_indices = np.arange(len(metric_names_list))

for idx in high_quality_indices:
    # Get original data
    orig_text_vec = X_train_text[idx]
    orig_metric_idx = train_metric_indices[idx]
    
    # Pick a random DIFFERENT metric
    neg_metric_idx = random.choice(all_metric_indices[all_metric_indices != orig_metric_idx])
    neg_metric_vec = metric_embeddings_clean[neg_metric_idx]
    
    # Append (Same Text + Wrong Metric = Score 0)
    aug_text.append(orig_text_vec)
    aug_metric.append(neg_metric_vec)
    aug_score.append(0.0)

# Concatenate Original + Augmented
X_train_text_final = np.concatenate([X_train_text, np.array(aug_text)], axis=0)
X_train_metric_final = np.concatenate([X_train_metric, np.array(aug_metric)], axis=0)
Y_train_final = np.concatenate([training_data['score'].values, np.array(aug_score)], axis=0)

print(f"Training set size increased from {len(training_data)} to {len(Y_train_final)}")

# --- 2. Feature Interaction (The "Smart" Features) ---
def create_features(text_vecs, metric_vecs):
    # A. Cosine Similarity (Dot product since we normalized in Cell 7)
    # Shape: (N, 1)
    cos_sim = np.sum(text_vecs * metric_vecs, axis=1, keepdims=True)
    
    # B. Euclidean Distance
    # Shape: (N, 1)
    euc_dist = np.linalg.norm(text_vecs - metric_vecs, axis=1, keepdims=True)
    
    # C. Element-wise Difference & Product
    # Shape: (N, 768)
    diff = np.abs(text_vecs - metric_vecs)
    prod = text_vecs * metric_vecs
    
    # Concatenate everything
    # 768 + 768 + 768 + 768 + 1 + 1 = 3074 features
    return np.concatenate([text_vecs, metric_vecs, diff, prod, cos_sim, euc_dist], axis=1)

print("Building interaction features...")
X_train_processed = create_features(X_train_text_final, X_train_metric_final)
X_test_processed = create_features(X_test_text, X_test_metric)

# --- 3. Scaling ---
# Neural networks need scaled data (mean=0, std=1)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_processed)
X_test_scaled = scaler.transform(X_test_processed)

# Prepare PyTorch Tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(DEVICE)
Y_train_tensor = torch.tensor(Y_train_final, dtype=torch.float32).unsqueeze(1).to(DEVICE)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(DEVICE)

print("Data Ready for Training.")

Augmenting data with negative samples...
Training set size increased from 5000 to 9566
Building interaction features...
Data Ready for Training.


In [19]:
# K-Fold Cross-Validation Training 

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold
import pandas as pd # Ensure pandas is imported

# 1. Model Definition (MLP)
class RobustRegressor(nn.Module):
    def __init__(self, input_dim):
        super(RobustRegressor, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            nn.Linear(512, 128),
            nn.ReLU(),
            
            nn.Linear(128, 1) # Output Score
        )
        
    def forward(self, x):
        return self.net(x)

# 2. Training Config
K_FOLDS = 5
EPOCHS = 25
BATCH_SIZE = 64
LEARNING_RATE = 1e-3

# We use standard MSE Loss. It is often more stable than Focal Loss for this regression task.
criterion = nn.MSELoss()

# Prepare K-Fold
kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
X_cpu = X_train_tensor.cpu().numpy()
Y_cpu = Y_train_tensor.cpu().numpy()

fold_models = []
fold_rmses = []

print(f"Starting {K_FOLDS}-Fold Cross-Validation...")

# 3. Training Loop
for fold, (train_idx, val_idx) in enumerate(kf.split(X_cpu)):
    print(f"\n--- Fold {fold + 1} ---")
    
    # Prepare Fold Data
    train_ds = TensorDataset(torch.tensor(X_cpu[train_idx]).to(DEVICE), torch.tensor(Y_cpu[train_idx]).to(DEVICE))
    val_ds = TensorDataset(torch.tensor(X_cpu[val_idx]).to(DEVICE), torch.tensor(Y_cpu[val_idx]).to(DEVICE))
    
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
    
    # Init Model
    model = RobustRegressor(input_dim=X_train_tensor.shape[1]).to(DEVICE)
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
    
    best_loss = float('inf')
    best_state = None
    
    for epoch in range(EPOCHS):
        model.train()
        batch_losses = []
        
        for x_b, y_b in train_loader:
            optimizer.zero_grad()
            pred = model(x_b)
            loss = criterion(pred, y_b)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
            
        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for x_b, y_b in val_loader:
                pred = model(x_b)
                loss = criterion(pred, y_b)
                val_losses.append(loss.item())
        
        avg_val_loss = np.mean(val_losses)
        scheduler.step(avg_val_loss)
        
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            best_state = model.state_dict()
            
        if (epoch+1) % 5 == 0:
            print(f"Ep {epoch+1}: Train MSE {np.mean(batch_losses):.4f} | Val MSE {avg_val_loss:.4f}")

    print(f"Fold {fold+1} Best RMSE: {np.sqrt(best_loss):.4f}")
    fold_rmses.append(np.sqrt(best_loss))
    fold_models.append(best_state)

print(f"\nAverage CV RMSE: {np.mean(fold_rmses):.4f}")

# 4. Inference (Ensemble)
print("Generating Ensemble Predictions...")
final_preds = np.zeros(len(X_test_tensor))

for state in fold_models:
    model = RobustRegressor(input_dim=X_train_tensor.shape[1]).to(DEVICE)
    model.load_state_dict(state)
    model.eval()
    with torch.no_grad():
        preds = model(X_test_tensor).cpu().numpy().flatten()
        final_preds += preds

final_preds /= K_FOLDS
final_preds = np.clip(np.round(final_preds), 0, 10)

# Save (Using 'testing_data' and 'OUTPUT_DIR')
submission = pd.DataFrame({'ID': testing_data['submission_id'], 'score': final_preds})
submission.to_csv(OUTPUT_DIR / 'submission.csv', index=False)
print(f"Submission Saved to {OUTPUT_DIR / 'submission.csv'}!")

Starting 5-Fold Cross-Validation...

--- Fold 1 ---
Ep 5: Train MSE 6.8598 | Val MSE 14.2414
Ep 10: Train MSE 2.2823 | Val MSE 14.1534
Ep 15: Train MSE 1.1943 | Val MSE 14.0420
Ep 20: Train MSE 0.9097 | Val MSE 14.2687
Ep 25: Train MSE 0.7817 | Val MSE 14.1586
Fold 1 Best RMSE: 3.7139

--- Fold 2 ---
Ep 5: Train MSE 7.2917 | Val MSE 12.7789
Ep 10: Train MSE 3.4135 | Val MSE 13.1882
Ep 15: Train MSE 1.2233 | Val MSE 12.8477
Ep 20: Train MSE 0.8080 | Val MSE 13.1422
Ep 25: Train MSE 0.6864 | Val MSE 13.0038
Fold 2 Best RMSE: 3.5338

--- Fold 3 ---
Ep 5: Train MSE 6.7987 | Val MSE 13.5544
Ep 10: Train MSE 2.5200 | Val MSE 13.9737
Ep 15: Train MSE 1.2145 | Val MSE 13.9787
Ep 20: Train MSE 0.8485 | Val MSE 13.8351
Ep 25: Train MSE 0.7197 | Val MSE 14.0215
Fold 3 Best RMSE: 3.6816

--- Fold 4 ---
Ep 5: Train MSE 7.1056 | Val MSE 12.4939
Ep 10: Train MSE 2.7367 | Val MSE 12.9170
Ep 15: Train MSE 1.1703 | Val MSE 12.4318
Ep 20: Train MSE 0.8801 | Val MSE 12.6603
Ep 25: Train MSE 0.6896 | Val M

In [20]:
from IPython.display import FileLink
import os

# Ensure we are in the correct directory
os.chdir('/kaggle/working')

# Create a clickable download link
print("Click the link below to download your file:")
FileLink(r'submission.csv')

Click the link below to download your file:
