# 🧠 Chatbot Arena Preference Prediction with XGBoost

## 📚 Step 1: Import Libraries

In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from scipy.sparse import hstack
import xgboost as xgb

## 📂 Step 2: Load Training Data

In [16]:
# Load the labelled training set. Later cells read its 'prompt', 'response_a',
# 'response_b', 'winner_model_a' and 'winner_model_b' columns.
train_df = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')

## 🧹 Step 3: Preprocess and Clean Text

In [17]:
# Build one document per side: the shared prompt followed by that side's
# response (single space between), lower-cased and stripped of outer whitespace.
train_df['text_a'], train_df['text_b'] = (
    (train_df['prompt'] + " " + train_df[response_col]).str.lower().str.strip()
    for response_col in ('response_a', 'response_b')
)

## 🎯 Step 4: Create Target Label

In [18]:
def get_label(row):
    """Collapse one row's winner flags into a single class name.

    Returns 'a' when ``row['winner_model_a']`` equals 1, otherwise 'b' when
    ``row['winner_model_b']`` equals 1, and 'tie' in every other case.
    The flags are checked in that order, so 'a' wins if both are set.
    """
    for flag_column, class_name in (('winner_model_a', 'a'), ('winner_model_b', 'b')):
        if row[flag_column] == 1:
            return class_name
    return 'tie'

# Derive the 3-way target in one vectorised pass instead of a Python call per
# row. Conditions are tested in order, so the result matches get_label():
# 'a' if winner_model_a == 1, else 'b' if winner_model_b == 1, else 'tie'.
train_df['label'] = np.select(
    [train_df['winner_model_a'] == 1, train_df['winner_model_b'] == 1],
    ['a', 'b'],
    default='tie',
)

## 🔠 Step 5: TF-IDF Vectorization

In [19]:
# Learn ONE vocabulary (capped at 5000 features) from both sides' documents so
# the same vectoriser can be applied to text_a and text_b alike.
# NOTE(review): the vectoriser is fitted on all of train_df, i.e. before the
# train/validation split made later, so validation rows also influence it.
tfidf = TfidfVectorizer(max_features=5000).fit(
    pd.concat([train_df['text_a'], train_df['text_b']])
)

# Vectorise each side separately, then put the two sparse matrices side by
# side: A's columns first, B's columns second.
X_a, X_b = (tfidf.transform(train_df[text_col]) for text_col in ('text_a', 'text_b'))
X_concat = hstack((X_a, X_b))

## 🎯 Step 6: Encode Labels

In [20]:
# Map the string labels ('a' / 'b' / 'tie') to integer class ids. The fitted
# encoder `le` is kept because later cells use it to translate ids back to
# names and to find each class's probability column.
le = LabelEncoder().fit(train_df['label'])
y_encoded = le.transform(train_df['label'])

## 🧪 Step 7: Train/Validation Split

In [21]:
# Hold out 20% of the rows for validation. Stratifying on the target keeps the
# a / b / tie proportions the same in both parts; the fixed seed makes the
# split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_concat,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)

## 🌲 Step 8: Train XGBoost Model with Early Stopping

In [None]:
# Wrap both splits in XGBoost's DMatrix container (features + integer labels).
dtrain, dval = (
    xgb.DMatrix(features, label=targets)
    for features, targets in ((X_train, y_train), (X_val, y_val))
)

# 3-class softmax that outputs per-class probabilities, scored by multi-class
# log loss; the remaining entries are tree-shape / sampling settings.
params = dict(
    objective='multi:softprob',
    num_class=3,
    eval_metric='mlogloss',
    max_depth=6,
    eta=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',
    verbosity=1,
)

# Train for at most 500 rounds, logging every 10th. The validation set is
# listed last in `evals` because early stopping watches the last entry.
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=500,
    evals=[(dtrain, 'train'), (dval, 'val')],
    early_stopping_rounds=20,
    verbose_eval=10,
)

[0]	train-mlogloss:1.09352	val-mlogloss:1.09508


## 📊 Step 9: Evaluate on Validation Set

In [None]:
# ✅ Evaluate model on validation set
# xgb.train() hands back the booster from the LAST boosting round, which after
# early stopping lies past the best-scoring round. Restrict prediction to the
# trees built up to and including the best iteration so the reported metrics
# describe the model that early stopping actually selected.
y_val_probs = bst.predict(
    dval, iteration_range=(0, bst.best_iteration + 1)
)  # one row per validation sample, one probability column per class
y_pred = np.argmax(y_val_probs, axis=1)  # most probable class id per row

# Convert numeric class ids back to the original string labels for reporting
y_val_labels = le.inverse_transform(y_val)
y_pred_labels = le.inverse_transform(y_pred)

print("✅ Classification Report:")
print(classification_report(y_val_labels, y_pred_labels))
print("🧱 Confusion Matrix:")
print(confusion_matrix(y_val_labels, y_pred_labels))
print(f"🎯 Accuracy: {accuracy_score(y_val_labels, y_pred_labels):.4f}")

## 🧪 Step 10: Load and Preprocess Test Data

In [None]:
# Read the test set and prepare its text exactly as was done for training:
# prompt + " " + response, lower-cased and stripped.
test_df = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')
test_df['text_a'], test_df['text_b'] = (
    (test_df['prompt'] + " " + test_df[response_col]).str.lower().str.strip()
    for response_col in ('response_a', 'response_b')
)

# Reuse the vectoriser fitted on the training text (transform only, no refit)
# and stack the two sides in the same A-then-B column order as in training.
X_test_a, X_test_b = (tfidf.transform(test_df[text_col]) for text_col in ('text_a', 'text_b'))
X_test_concat = hstack((X_test_a, X_test_b))
dtest = xgb.DMatrix(X_test_concat)

## 📝 Step 11: Predict and Generate Submission

In [None]:
# Get class probabilities, shape (num_samples, num_classes).
# xgb.train() returns the booster from the last boosting round, so limit
# prediction to the trees up to and including the best early-stopping round.
y_test_probs = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))

# Convert to predicted class indices
y_test_pred = np.argmax(y_test_probs, axis=1)

# Convert to class labels ('a', 'b', 'tie')
y_test_labels = le.inverse_transform(y_test_pred)

# Look up each class's probability column once via the fitted encoder, rather
# than assuming a particular column order.
col_a, col_b, col_tie = le.transform(['a', 'b', 'tie'])

submission_df = pd.DataFrame({
    'id': test_df['id'],
    'winner_model_a': y_test_probs[:, col_a],
    'winner_model_b': y_test_probs[:, col_b],
    'winner_tie': y_test_probs[:, col_tie],
})

submission_df.to_csv('submission.csv', index=False)
print("✅ submission.csv file is ready for Kaggle upload.")

In [None]:

# 🔄 Load and preprocess test data
# This cell repeats the test-set loading, vectorisation, prediction and
# submission steps end to end, so it can be run on its own once `tfidf`, `le`
# and `bst` exist.
test_df = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')
test_df['text_a'] = (test_df['prompt'] + " " + test_df['response_a']).str.lower().str.strip()
test_df['text_b'] = (test_df['prompt'] + " " + test_df['response_b']).str.lower().str.strip()

# Transform only (no refit) and keep the A-then-B column order used in training
X_a_test = tfidf.transform(test_df['text_a'])
X_b_test = tfidf.transform(test_df['text_b'])
X_test_concat = hstack([X_a_test, X_b_test])

# 🔍 Predict class probabilities, shape (num_samples, num_classes).
# xgb.train() returns the booster from the last boosting round, so limit
# prediction to the trees up to and including the best early-stopping round.
dtest = xgb.DMatrix(X_test_concat)
y_test_probs = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))

# 🧾 Generate submission file
# Look up each class's probability column once via the fitted encoder.
col_a, col_b, col_tie = le.transform(['a', 'b', 'tie'])
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'winner_model_a': y_test_probs[:, col_a],
    'winner_model_b': y_test_probs[:, col_b],
    'winner_tie': y_test_probs[:, col_tie],
})

submission_df.to_csv('submission.csv', index=False)
print("✅ Submission file saved as submission.csv")
