In [1]:
!pip install sentence-transformers xgboost pandas numpy scikit-learn



In [2]:
import pandas as pd
import numpy as np
import json
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import os

# 1. Load the Metric Names and Embeddings
# This maps the string name of a metric to its pre-calculated vector.
with open('/content/metric_names.json', 'r') as f:
    metric_names_list = json.load(f)

metric_embeddings = np.load('/content/metric_name_embeddings.npy')

# Create a dictionary for fast lookup: {'bias_detection': [vector...]}
metric_map = {name: metric_embeddings[i] for i, name in enumerate(metric_names_list)}

print(f"Loaded {len(metric_map)} metric definitions.")

# 2. Load Training Data
with open('/content/train_data.json', 'r') as f:
    train_data = json.load(f)

# 3. Load Test Data
with open('/content/test_data.json', 'r') as f:
    test_data = json.load(f)

print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

Loaded 145 metric definitions.
Training samples: 5000
Test samples: 3638


In [3]:
# Initialize a fast and powerful sentence encoder
# We use this to encode the "Prompt + Response" text
text_model = SentenceTransformer('all-MiniLM-L6-v2')

def process_dataset(data_list, is_train=True):
    X_text_list = []
    X_metric_list = []
    y_list = []
    ids = []

    print(f"Processing {'Training' if is_train else 'Testing'} data...")

    texts_to_encode = []

    for idx, row in enumerate(data_list):
        # 1. Prepare the input text (Prompt + Response + System Prompt)
        # We join them together so the model understands the full context.
        sys_prompt = row.get('system_prompt')
        if sys_prompt is None:
            sys_prompt = ""

        full_text = f"System: {sys_prompt} | User: {row['user_prompt']} | Response: {row['response']}"
        texts_to_encode.append(full_text)

        # 2. Get the corresponding Metric Embedding (lookup from the .npy file)
        m_name = row['metric_name']
        if m_name in metric_map:
            X_metric_list.append(metric_map[m_name])
        else:
            # Fallback if metric name is missing (unlikely), use zero vector
            print(f"Warning: Metric {m_name} not found in map. Using zeros.")
            X_metric_list.append(np.zeros(768))

        # 3. Get Target Score (only for training data)
        if is_train:
            y_list.append(float(row['score']))

        # Keep track of IDs for submission (1-based index)
        ids.append(idx + 1)

    # Bulk encode texts (much faster than one by one)
    print("Encoding text data (this may take a minute)...")
    X_text_embeddings = text_model.encode(texts_to_encode, show_progress_bar=True)

    # Convert lists to numpy arrays
    X_metric_arr = np.array(X_metric_list)
    X_text_arr = np.array(X_text_embeddings)

    # Concatenate: [Metric_Vector (768 dims)] + [Text_Vector (384 dims)]
    X_combined = np.hstack((X_metric_arr, X_text_arr))

    return X_combined, np.array(y_list), ids

# --- EXECUTE PROCESSING ---
X_train_full, y_train_full, _ = process_dataset(train_data, is_train=True)
X_test, _, test_ids = process_dataset(test_data, is_train=False)

print(f"Feature Matrix Shape: {X_train_full.shape}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Processing Training data...
Encoding text data (this may take a minute)...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Processing Testing data...
Encoding text data (this may take a minute)...


Batches:   0%|          | 0/114 [00:00<?, ?it/s]

Feature Matrix Shape: (5000, 1152)


In [4]:
# 1. Split into Train and Validation sets (80/20 split)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# 2. Initialize XGBoost Regressor
# Objective: squared error (standard for RMSE)
model = xgb.XGBRegressor(
    n_estimators=1000,      # Number of trees
    learning_rate=0.05,     # Step size
    max_depth=6,            # Depth of trees
    subsample=0.8,          # Prevent overfitting
    colsample_bytree=0.8,   # Prevent overfitting
    early_stopping_rounds=50,
    n_jobs=-1,              # Use all CPU cores
    random_state=42
)

print("Training Model...")
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)

# 3. Evaluate on Validation Set
val_preds = model.predict(X_val)

# IMPORTANT: The problem mentions valid scores are Integers (0-10).
# Rounding often improves RMSE in these specific "discrete ground truth" challenges.
val_preds_rounded = np.round(val_preds.clip(0, 10))

rmse = np.sqrt(mean_squared_error(y_val, val_preds))
rmse_rounded = np.sqrt(mean_squared_error(y_val, val_preds_rounded))

print("--- Validation Results ---")
print(f"Raw RMSE: {rmse:.4f}")
print(f"Rounded RMSE (Optimized): {rmse_rounded:.4f}")

Training Model...
[0]	validation_0-rmse:0.94445
[100]	validation_0-rmse:0.90279
[113]	validation_0-rmse:0.90336
--- Validation Results ---
Raw RMSE: 0.9010
Rounded RMSE (Optimized): 0.9418


In [5]:
print("Generating predictions for test set...")
test_preds = model.predict(X_test)

# Apply the "Hack": Clip to 0-10 and Round to nearest integer
# The problem states ground truth is discretized.
final_preds = np.round(test_preds.clip(0, 10))

# Create Dataframe
submission_df = pd.DataFrame({
    'ID': test_ids,
    'score': final_preds
})

# Ensure scores are floats (e.g., 9.0 instead of 9) to match format
submission_df['score'] = submission_df['score'].astype(float)

# Save
submission_df.to_csv('submission.csv', index=False)

print("submission.csv created successfully!")
print(submission_df.head())

Generating predictions for test set...
submission.csv created successfully!
   ID  score
0   1    9.0
1   2    9.0
2   3    9.0
3   4    9.0
4   5    9.0


In [6]:
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>