# 03 - Model Baseline
This notebook fine-tunes a pretrained transformer as a baseline three-class text classifier (model A wins, model B wins, or tie).

## Import Required Libraries

In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Report the installed PyTorch build and whether a CUDA device is usable.
print("PyTorch version: {}".format(torch.__version__))
print("Device: {}".format('cuda' if torch.cuda.is_available() else 'cpu'))

  from .autonotebook import tqdm as notebook_tqdm


PyTorch version: 2.9.1+cpu
Device: cpu


## Load Processed Data

In [6]:
# Load processed data
train_df = pd.read_csv('../data/train_processed.csv')
val_df = pd.read_csv('../data/val_processed.csv')
test_df = pd.read_csv('../data/test_processed.csv')

print(f"Train shape: {train_df.shape}")
print(f"Val shape: {val_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"\nTrain columns: {train_df.columns.tolist()}")


def combine_text_columns(df):
    """Join the prompt and both responses of ``df`` into a single text Series.

    The cleaned columns ('prompt_clean', 'response_a_clean',
    'response_b_clean') are used when all three exist; otherwise the raw
    columns ('prompt', 'response_a', 'response_b') are used. Missing values
    are replaced by empty strings and the parts are joined with ' [SEP] '.

    Args:
        df: DataFrame holding one of the two column sets above.

    Returns:
        A Series aligned with ``df`` containing the combined text.

    Raises:
        KeyError: if neither column set is fully present in ``df``.
    """
    candidate_sets = (
        ('prompt_clean', 'response_a_clean', 'response_b_clean'),
        ('prompt', 'response_a', 'response_b'),
    )
    for prompt_col, resp_a_col, resp_b_col in candidate_sets:
        if {prompt_col, resp_a_col, resp_b_col}.issubset(df.columns):
            return (df[prompt_col].fillna('') + ' [SEP] ' +
                    df[resp_a_col].fillna('') + ' [SEP] ' +
                    df[resp_b_col].fillna(''))
    raise KeyError(
        "Expected prompt/response_a/response_b columns (raw or *_clean); "
        f"found {df.columns.tolist()}"
    )


# Create the combined text column for every split. Each frame is inspected
# on its own because the splits do not necessarily share the same columns.
train_df['text'] = combine_text_columns(train_df)
val_df['text'] = combine_text_columns(val_df)
test_df['text'] = combine_text_columns(test_df)
print("Combined text columns created.")

Train shape: (45981, 10)
Val shape: (11496, 10)
Test shape: (3, 4)

Train columns: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie', 'target']


## Initialize Pretrained Model

In [7]:
# Model configuration
# Use 'target' column for labels (0: Model A wins, 1: Model B wins, 2: Tie)
num_labels = 3  # We have 3 classes: Model A wins, Model B wins, Tie
model_name = "bert-base-uncased"

print(f"Loading model: {model_name}")
print(f"Number of labels: {num_labels}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

print(f"Model loaded successfully!")

Loading model: bert-base-uncased
Number of labels: 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully!


## Create Custom Dataset Class

In [8]:
from torch.utils.data import Dataset

class ClassificationDataset(Dataset):
    """Dataset that tokenizes one text per item at access time.

    Args:
        texts: Iterable of input strings; materialized into a list.
        labels: Optional iterable of integer class ids, one per text. When
            omitted, items carry no 'labels' entry.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            expected to return a mapping with 'input_ids' and
            'attention_mask' tensors that have a leading batch dimension.
        max_length: Length every sequence is padded / truncated to.

    Raises:
        ValueError: if ``labels`` is given but its length differs from
            ``texts``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=128):
        # Materialize as lists so __getitem__ always indexes by position,
        # even when a pandas Series with a non-default index is passed in.
        self.texts = list(texts)
        self.labels = None if labels is None else list(labels)
        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels differ in length: "
                f"{len(self.texts)} vs {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return model inputs.

        Returns:
            Dict with 'input_ids' and 'attention_mask' (batch dimension
            removed) plus 'labels' as a long tensor when labels were given.
        """
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch dimension; a bare squeeze() would also
        # collapse the sequence dimension when it happens to have size 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

print("Custom Dataset class defined.")

Custom Dataset class defined.


## Create Datasets

In [1]:
# Ensure data is loaded
import pandas as pd
import torch
from torch.utils.data import Dataset

# Load the splits only when no earlier cell has already done so.
if 'train_df' not in locals():
    train_df = pd.read_csv('../data/train_processed.csv')
    val_df = pd.read_csv('../data/val_processed.csv')
    test_df = pd.read_csv('../data/test_processed.csv')
    print("Data loaded.")

# Build the combined 'text' column on every frame that lacks it. This runs
# whether or not the frames were loaded above, because frames loaded by an
# earlier cell may still be missing 'text'. Columns are checked per frame
# since the splits do not necessarily share the same schema.
for _frame in (train_df, val_df, test_df):
    if 'text' in _frame.columns:
        continue
    _clean_cols = ['prompt_clean', 'response_a_clean', 'response_b_clean']
    # Use cleaned columns if available, otherwise the original columns
    if all(_col in _frame.columns for _col in _clean_cols):
        _text_cols = _clean_cols
    else:
        _text_cols = ['prompt', 'response_a', 'response_b']
    _frame['text'] = (_frame[_text_cols[0]].fillna('') + ' [SEP] ' +
                      _frame[_text_cols[1]].fillna('') + ' [SEP] ' +
                      _frame[_text_cols[2]].fillna(''))

# Ensure tokenizer is available
if 'tokenizer' not in locals():
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    print("Tokenizer loaded.")

# Define ClassificationDataset class
class ClassificationDataset(Dataset):
    """Dataset that tokenizes one text per item at access time.

    Args:
        texts: Iterable of input strings; materialized into a list.
        labels: Optional iterable of integer class ids, one per text. When
            omitted, items carry no 'labels' entry.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            expected to return a mapping with 'input_ids' and
            'attention_mask' tensors that have a leading batch dimension.
        max_length: Length every sequence is padded / truncated to.

    Raises:
        ValueError: if ``labels`` is given but its length differs from
            ``texts``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=128):
        # Materialize as lists so __getitem__ always indexes by position,
        # even when a pandas Series with a non-default index is passed in.
        self.texts = list(texts)
        self.labels = None if labels is None else list(labels)
        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels differ in length: "
                f"{len(self.texts)} vs {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return model inputs.

        Returns:
            Dict with 'input_ids' and 'attention_mask' (batch dimension
            removed) plus 'labels' as a long tensor when labels were given.
        """
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch dimension; a bare squeeze() would also
        # collapse the sequence dimension when it happens to have size 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

# Wrap the train and validation frames in datasets. Labels come from the
# 'target' column when a frame has one; otherwise the dataset is unlabeled.
train_dataset = ClassificationDataset(
    texts=train_df['text'].to_list(),
    labels=None if 'target' not in train_df.columns else train_df['target'].to_list(),
    tokenizer=tokenizer,
)

val_dataset = ClassificationDataset(
    texts=val_df['text'].to_list(),
    labels=None if 'target' not in val_df.columns else val_df['target'].to_list(),
    tokenizer=tokenizer,
)

print("Train dataset size: {}".format(len(train_dataset)))
print("Val dataset size: {}".format(len(val_dataset)))

Data loaded.


  from .autonotebook import tqdm as notebook_tqdm


Tokenizer loaded.
Train dataset size: 45981
Val dataset size: 11496


## Configure Training Parameters

In [1]:
# Ensure necessary imports
from transformers import TrainingArguments

# Baseline training configuration.
# NOTE(review): the "Train the Model" cell rebuilds `training_args` with its
# own values before creating the Trainer, so these settings only matter if
# that reassignment is removed.
training_args = TrainingArguments(
    # Where checkpoints are written
    output_dir="../models/baseline",
    overwrite_output_dir=True,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Logging / evaluation / checkpoint cadence
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_steps=500,  # presumably unused while save_strategy is "epoch" - TODO confirm
    # Best-checkpoint selection, driven by the 'f1' metric
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

print("Training arguments configured.")

  from .autonotebook import tqdm as notebook_tqdm


Training arguments configured.


## Define Metrics Function

In [2]:
def compute_metrics(eval_preds):
    """Score predictions against labels.

    Args:
        eval_preds: Pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores; the predicted class is the argmax over axis 1.

    Returns:
        Dict with 'accuracy' plus weighted-average 'f1', 'precision' and
        'recall' (zero_division=0 for all three).
    """
    scores, labels = eval_preds
    predicted = np.argmax(scores, axis=1)

    # Shared settings for the three weighted-average metrics.
    weighted = {'average': 'weighted', 'zero_division': 0}

    return {
        'accuracy': accuracy_score(labels, predicted),
        'f1': f1_score(labels, predicted, **weighted),
        'precision': precision_score(labels, predicted, **weighted),
        'recall': recall_score(labels, predicted, **weighted),
    }

print("Metrics function defined.")

Metrics function defined.


## Train the Model

In [None]:
# Ensure necessary imports and variables
import torch
import numpy as np
import pandas as pd
from transformers import Trainer, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import Dataset

# Ensure data is loaded
if 'train_df' not in locals():
    train_df = pd.read_csv('../data/train_processed.csv')
    val_df = pd.read_csv('../data/val_processed.csv')
    test_df = pd.read_csv('../data/test_processed.csv')
    print("Data loaded.")

# Build the combined 'text' column on every frame that lacks it, including
# the test frame, which the prediction cell reads later. This runs whether
# or not the frames were loaded above, because frames loaded by an earlier
# cell may still be missing 'text'. Columns are checked per frame since the
# splits do not necessarily share the same schema.
for _frame in (train_df, val_df, test_df):
    if 'text' in _frame.columns:
        continue
    _clean_cols = ['prompt_clean', 'response_a_clean', 'response_b_clean']
    if all(_col in _frame.columns for _col in _clean_cols):
        _text_cols = _clean_cols
    else:
        _text_cols = ['prompt', 'response_a', 'response_b']
    _frame['text'] = (_frame[_text_cols[0]].fillna('') + ' [SEP] ' +
                      _frame[_text_cols[1]].fillna('') + ' [SEP] ' +
                      _frame[_text_cols[2]].fillna(''))

# Ensure tokenizer is available
if 'tokenizer' not in locals():
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    print("Tokenizer loaded.")

# Define ClassificationDataset class
class ClassificationDataset(Dataset):
    """Dataset that tokenizes one text per item at access time.

    Args:
        texts: Iterable of input strings; materialized into a list.
        labels: Optional iterable of integer class ids, one per text. When
            omitted, items carry no 'labels' entry.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            expected to return a mapping with 'input_ids' and
            'attention_mask' tensors that have a leading batch dimension.
        max_length: Length every sequence is padded / truncated to.

    Raises:
        ValueError: if ``labels`` is given but its length differs from
            ``texts``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=128):
        # Materialize as lists so __getitem__ always indexes by position,
        # even when a pandas Series with a non-default index is passed in.
        self.texts = list(texts)
        self.labels = None if labels is None else list(labels)
        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels differ in length: "
                f"{len(self.texts)} vs {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return model inputs.

        Returns:
            Dict with 'input_ids' and 'attention_mask' (batch dimension
            removed) plus 'labels' as a long tensor when labels were given.
        """
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch dimension; a bare squeeze() would also
        # collapse the sequence dimension when it happens to have size 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

# Build the datasets only when no earlier cell has already created them.
# Labels come from 'target' when a frame has that column.
if 'train_dataset' not in locals():
    train_dataset = ClassificationDataset(
        texts=train_df['text'].to_list(),
        labels=None if 'target' not in train_df.columns else train_df['target'].to_list(),
        tokenizer=tokenizer,
    )
    val_dataset = ClassificationDataset(
        texts=val_df['text'].to_list(),
        labels=None if 'target' not in val_df.columns else val_df['target'].to_list(),
        tokenizer=tokenizer,
    )
    print("Datasets created. Train: {}, Val: {}".format(len(train_dataset), len(val_dataset)))

# Load a 3-label classification model only when none is in scope yet.
# NOTE(review): this fallback loads "distilbert-base-uncased", while the
# tokenizer fallback and the model-initialisation cell both use
# "bert-base-uncased" - confirm the mismatch is intentional.
if 'model' not in locals():
    model_name = "distilbert-base-uncased"
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
    )
    print("Model loaded.")

# Training configuration actually used for the run below. This assignment is
# unconditional, so it replaces any `training_args` defined by earlier cells.
training_args = TrainingArguments(
    # Where checkpoints are written
    output_dir='../models/baseline',
    overwrite_output_dir=True,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Logging / evaluation / checkpoint cadence
    logging_steps=200,
    eval_strategy='epoch',
    save_strategy='epoch',
    # Best-checkpoint selection, driven by the 'f1' metric
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    # Reporting
    report_to='none',
    disable_tqdm=False,
)


# Define compute_metrics only when no earlier cell has already provided it.
if 'compute_metrics' not in locals():
    def compute_metrics(eval_preds):
        """Score predictions against labels.

        Args:
            eval_preds: Pair ``(predictions, labels)``. ``predictions`` holds
                per-class scores; the predicted class is the argmax over
                axis 1.

        Returns:
            Dict with 'accuracy' plus weighted-average 'f1', 'precision' and
            'recall' (zero_division=0 for all three).
        """
        scores, labels = eval_preds
        predicted = np.argmax(scores, axis=1)

        # Shared settings for the three weighted-average metrics.
        weighted = {'average': 'weighted', 'zero_division': 0}

        return {
            'accuracy': accuracy_score(labels, predicted),
            'f1': f1_score(labels, predicted, **weighted),
            'precision': precision_score(labels, predicted, **weighted),
            'recall': recall_score(labels, predicted, **weighted),
        }

# Wire the model, data, metrics and training configuration into a Trainer,
# then run training.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

print("Starting training...")
trainer.train()

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

## Evaluate Model Performance

In [None]:
# Evaluate on validation set
print("Evaluating on validation set...")
eval_results = trainer.evaluate()

print("\nValidation Results:")
for key, value in eval_results.items():
    # Trainer prefixes its metric names (e.g. "eval_runtime"), so a
    # startswith('runtime') test never matches; look for the substring.
    if 'runtime' in key:
        continue
    try:
        formatted = f"{value:.4f}"
    except (TypeError, ValueError):
        # Not a number: show the value unformatted rather than fail.
        formatted = str(value)
    print(f"  {key}: {formatted}")

## Generate Predictions on Test Set

In [None]:
# The test frame may not have a combined 'text' column yet (the training
# cell's fallback only builds it for train/val), so create it on demand from
# the cleaned columns when present, otherwise from the raw columns.
if 'text' not in test_df.columns:
    _clean_cols = ['prompt_clean', 'response_a_clean', 'response_b_clean']
    if all(_col in test_df.columns for _col in _clean_cols):
        _text_cols = _clean_cols
    else:
        _text_cols = ['prompt', 'response_a', 'response_b']
    test_df['text'] = (test_df[_text_cols[0]].fillna('') + ' [SEP] ' +
                       test_df[_text_cols[1]].fillna('') + ' [SEP] ' +
                       test_df[_text_cols[2]].fillna(''))

# Create test dataset without labels
test_dataset = ClassificationDataset(
    test_df['text'].tolist(),
    labels=None,
    tokenizer=tokenizer
)

# Make predictions; the predicted class is the argmax over the class axis
print("Making predictions on test set...")
predictions = trainer.predict(test_dataset)
test_preds = np.argmax(predictions.predictions, axis=1)

print(f"Predictions shape: {test_preds.shape}")
print(f"Unique predictions: {np.unique(test_preds)}")
print("Prediction mapping: 0=Model A wins, 1=Model B wins, 2=Tie")

## Create Submission File

In [None]:
import os

# Create submission dataframe: keep only 'id' when the test frame has it,
# otherwise carry every test column along.
submission_df = test_df[['id']].copy() if 'id' in test_df.columns else test_df.copy()
submission_df['prediction'] = test_preds

# Map predictions to labels
label_map = {0: 'Model A wins', 1: 'Model B wins', 2: 'Tie'}
submission_df['prediction_label'] = submission_df['prediction'].map(label_map)

# Save submission. to_csv does not create missing parent directories, so
# make sure the target folder exists first.
os.makedirs('../submissions', exist_ok=True)
submission_df.to_csv('../submissions/submission.csv', index=False)

print("Submission saved to ../submissions/submission.csv")
print(f"Submission shape: {submission_df.shape}")
print("\nFirst few rows:")
print(submission_df.head())

## Save Fine-tuned Model

In [None]:
# Persist the trained model and its tokenizer side by side in one directory.
print("Saving fine-tuned model...")
trainer.save_model(output_dir='../models/baseline')
tokenizer.save_pretrained(save_directory='../models/baseline')

print("Model and tokenizer saved successfully!\nModel saved to: ../models/baseline")