In [1]:
import warnings
warnings.filterwarnings('ignore', message='.*overflowing tokens.*')

import os
import random
from pathlib import Path
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader    
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

print("Libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"PyTorch Lightning version: {pl.__version__}")

  from .autonotebook import tqdm as notebook_tqdm


Libraries imported successfully!
PyTorch version: 2.6.0+cu124
PyTorch Lightning version: 2.5.5


In [2]:
class Config:
    # Paths - Kaggle format
    BASE_DIR = Path.cwd()
    DATA_PATH = BASE_DIR
    
    # Model - Download directly from HuggingFace
    MODEL_NAME = 'bert-base-uncased'
    
    # Training parameters
    BATCH_SIZE = 16
    MAX_LENGTH = 512
    LEARNING_RATE = 5e-5
    MAX_EPOCHS = 2
    NUM_WORKERS = 4
    HIDDEN_DIM = 256
    NUM_CLASSES = 3
    DROPOUT_RATE = 0.1
    
    # Other settings
    SEED = 42
    TEST_SIZE = 0.2

def set_seed(seed=42):
    """Set random seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Initialize configuration and set seed
config = Config()
set_seed(config.SEED)

print(f"Configuration initialized")
print(f"Data path: {config.DATA_PATH}")
print(f"Model: {config.MODEL_NAME}")
print(f"Random seed: {config.SEED}")

Configuration initialized
Data path: /home/obliviontrek/Documents/llm-finetuning-kaggle
Model: bert-base-uncased
Random seed: 42


In [7]:
def load_data(config):
    
    train_file_path = config.DATA_PATH / 'train.csv'
    if not train_file_path.exists():
        raise FileNotFoundError(f"Training data not found at {train_file_path}")
    
    train_data = pd.read_csv(train_file_path)
    print(f"Training data shape: {train_data.shape}")
    print(f"Training data columns: {train_data.columns.tolist()}")
    
    train_data_split, validation_data = train_test_split(
        train_data, 
        test_size=config.TEST_SIZE, 
        random_state=config.SEED, 
        stratify=train_data[['winner_model_a', 'winner_model_b', 'winner_tie']].values
    )
    print(f"Training samples: {len(train_data_split)}")
    

    print(f"Model A wins: {train_data_split['winner_model_a'].sum()}")
    print(f"Model B wins: {train_data_split['winner_model_b'].sum()}")
    print(f"Ties: {train_data_split['winner_tie'].sum()}")
    
    print(f"Validation samples: {len(validation_data)}")
    print(f"Model A wins: {validation_data['winner_model_a'].sum()}")
    print(f"Model B wins: {validation_data['winner_model_b'].sum()}")
    print(f"Ties: {validation_data['winner_tie'].sum()}")
    
    return train_data_split, validation_data

# Load and explore the data
train_df, val_df = load_data(config)
train_df.head(5)

Training data shape: (57477, 9)
Training data columns: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie']
Training samples: 45981
Model A wins: 16051
Model B wins: 15721
Ties: 14209
Validation samples: 11496
Model A wins: 4013
Model B wins: 3931
Ties: 3552


Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
53973,4032881709,gpt-3.5-turbo-0125,mixtral-8x7b-instruct-v0.1,"[""In a room I have only 3 sisters. Eve is read...","[""Based on the information provided, Susan wou...","[""The information about Susan's activities is ...",0,0,1
38792,2890889868,zephyr-7b-beta,mistral-medium,"[""\/\/ \u67e5\u627e\u88ab\u6295\u7968\u73a9\u5...","[""No, this code only checks if the targeted pl...","[""\u662f\u7684\uff0c\u8fd9\u6bb5\u4ee3\u7801\u...",0,1,0
46462,3466836316,gpt-4-1106-preview,mixtral-8x7b-instruct-v0.1,"[""Hello there. Can you tell me what is the cur...","[""Hello! The current capital of Brazil is Bras...","[""The current capital of Brazil is Bras\u00edl...",0,0,1
16654,1243922820,zephyr-7b-beta,gpt-3.5-turbo-0613,"[""can you predict the next year jee advanced p...","[""I'm sorry but I'm not capable of predicting ...","[""I'm sorry, but as an AI assistant, I do not ...",1,0,0
45609,3399655443,claude-2.1,gpt-4-0613,"[""how do turn the touchpad off in linux x11?""]","[""Here are a few ways to disable the touchpad ...","[""Turning off the touchpad in Linux X11 enviro...",1,0,0


In [8]:
print(f"Loading tokenizer: {config.MODEL_NAME}")
tokenizer = BertTokenizer.from_pretrained(config.MODEL_NAME)

print(f"Tokenizer loaded successfully!")
print(f"Vocabulary size: {len(tokenizer)}")
print(f"Max length: {config.MAX_LENGTH}")

Loading tokenizer: bert-base-uncased




Tokenizer loaded successfully!
Vocabulary size: 30522
Max length: 512


In [5]:
class LLMClassificationDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        # Prepare the three text inputs
        prompt = row['prompt']
        model_a_response = row['response_a']
        model_b_response = row['response_b']
        
        # Convert labels to single class
        if row['winner_model_a'] == 1:
            label = 0
        elif row['winner_model_b'] == 1:
            label = 1
        else:  # winner_tie == 1
            label = 2
            
        return {
            'prompt': prompt,
            'model_a_response': model_a_response,
            'model_b_response': model_b_response,
            'label': label
        }

class TestDataset(Dataset):
    """Test dataset for inference"""
    
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        # Prepare the three text inputs
        prompt = row['prompt']
        model_a_response = row['response_a']
        model_b_response = row['response_b']
            
        return {
            'prompt': prompt,
            'model_a_response': model_a_response,
            'model_b_response': model_b_response
        }

In [6]:
def collate_fn(batch, tokenizer, max_length=512):
    """Custom collate function for batch tokenization"""
    prompts = [item['prompt'] for item in batch]
    model_a_responses = [item['model_a_response'] for item in batch]
    model_b_responses = [item['model_b_response'] for item in batch]
    labels = torch.tensor([item['label'] for item in batch], dtype=torch.long)
    
    # Tokenize each input type
    prompt_encoding = tokenizer(
        prompts, padding=True, truncation=True, 
        max_length=max_length, return_tensors='pt'
    )
    
    model_a_encoding = tokenizer(
        model_a_responses, padding=True, truncation=True,
        max_length=max_length, return_tensors='pt'
    )
    
    model_b_encoding = tokenizer(
        model_b_responses, padding=True, truncation=True,
        max_length=max_length, return_tensors='pt'
    )
    
    return {
        'prompt_input_ids': prompt_encoding['input_ids'],
        'prompt_attention_mask': prompt_encoding['attention_mask'],
        'model_a_input_ids': model_a_encoding['input_ids'],
        'model_a_attention_mask': model_a_encoding['attention_mask'],
        'model_b_input_ids': model_b_encoding['input_ids'],
        'model_b_attention_mask': model_b_encoding['attention_mask'],
        'labels': labels
    }

def test_collate_fn(batch, tokenizer, max_length=512):
    """Custom collate function for test data"""
    prompts = [item['prompt'] for item in batch]
    model_a_responses = [item['model_a_response'] for item in batch]
    model_b_responses = [item['model_b_response'] for item in batch]
    
    # Tokenize each input type
    prompt_encoding = tokenizer(
        prompts, padding=True, truncation=True, 
        max_length=max_length, return_tensors='pt'
    )
    
    model_a_encoding = tokenizer(
        model_a_responses, padding=True, truncation=True,
        max_length=max_length, return_tensors='pt'
    )
    
    model_b_encoding = tokenizer(
        model_b_responses, padding=True, truncation=True,
        max_length=max_length, return_tensors='pt'
    )
    
    return {
        'prompt_input_ids': prompt_encoding['input_ids'],
        'prompt_attention_mask': prompt_encoding['attention_mask'],
        'model_a_input_ids': model_a_encoding['input_ids'],
        'model_a_attention_mask': model_a_encoding['attention_mask'],
        'model_b_input_ids': model_b_encoding['input_ids'],
        'model_b_attention_mask': model_b_encoding['attention_mask']
    }

In [None]:
class LLMClassificationModel(pl.LightningModule):
    """PyTorch Lightning model for LLM Classification"""
    
    def __init__(self, model_name='bert-base-uncased', learning_rate=5e-5, 
                 hidden_dim=256, num_classes=3, dropout_rate=0.1):
        super().__init__()
        self.save_hyperparameters()
        
        # Load BERT model - Download directly from HuggingFace
        print(f"🤖 Loading BERT model: {model_name}")
        self.bert = BertModel.from_pretrained(model_name)
        
        # Freeze most BERT parameters
        for param in self.bert.parameters():
            param.requires_grad = False
            
        # Classification head
        bert_dim = self.bert.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(bert_dim * 3, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim // 2, num_classes)
        )
        
        # Loss function
        self.loss_fn = nn.CrossEntropyLoss()
        
    def forward(self, prompt_input_ids, prompt_attention_mask,
                model_a_input_ids, model_a_attention_mask,
                model_b_input_ids, model_b_attention_mask):
        
        # Get embeddings for each input
        prompt_outputs = self.bert(input_ids=prompt_input_ids, attention_mask=prompt_attention_mask)
        prompt_embedding = prompt_outputs.last_hidden_state[:, 0, :]  # CLS token
        
        model_a_outputs = self.bert(input_ids=model_a_input_ids, attention_mask=model_a_attention_mask)
        model_a_embedding = model_a_outputs.last_hidden_state[:, 0, :]  # CLS token
        
        model_b_outputs = self.bert(input_ids=model_b_input_ids, attention_mask=model_b_attention_mask)
        model_b_embedding = model_b_outputs.last_hidden_state[:, 0, :]  # CLS token
        
        # Concatenate embeddings
        concatenated = torch.cat([prompt_embedding, model_a_embedding, model_b_embedding], dim=1)
        
        # Classification
        logits = self.classifier(concatenated)
        return logits
    
    def training_step(self, batch, batch_idx):
        logits = self.forward(
            batch['prompt_input_ids'], batch['prompt_attention_mask'],
            batch['model_a_input_ids'], batch['model_a_attention_mask'],
            batch['model_b_input_ids'], batch['model_b_attention_mask']
        )
        
        loss = self.loss_fn(logits, batch['labels'])
        
        # Calculate accuracy
        preds = torch.argmax(logits, dim=1)
        acc = (preds == batch['labels']).float().mean()
        
        # Logging
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, prog_bar=True)
        
        return loss
    
    def validation_step(self, batch, batch_idx):
        logits = self.forward(
            batch['prompt_input_ids'], batch['prompt_attention_mask'],
            batch['model_a_input_ids'], batch['model_a_attention_mask'],
            batch['model_b_input_ids'], batch['model_b_attention_mask']
        )
        
        loss = self.loss_fn(logits, batch['labels'])
        
        # Calculate accuracy
        preds = torch.argmax(logits, dim=1)
        acc = (preds == batch['labels']).float().mean()
        
        # Logging
        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log('val_acc', acc, on_epoch=True, prog_bar=True)
        
        return {'val_loss': loss, 'preds': preds, 'labels': batch['labels']}
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.learning_rate)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
        return [optimizer], [scheduler]

In [None]:
class LLMDataModule(pl.LightningDataModule):
    """PyTorch Lightning DataModule for LLM Classification"""
    
    def __init__(self, train_df, val_df, tokenizer, batch_size=16, max_length=512, num_workers=4):
        super().__init__()
        self.train_df = train_df
        self.val_df = val_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_length = max_length
        self.num_workers = num_workers
        
    def setup(self, stage=None):
        if stage == 'fit' or stage is None:
            self.train_dataset = LLMClassificationDataset(self.train_df, self.tokenizer, self.max_length)
            self.val_dataset = LLMClassificationDataset(self.val_df, self.tokenizer, self.max_length)
    
    def _collate_fn(self, batch):
        """Wrapper for collate_fn that can be pickled"""
        return collate_fn(batch, self.tokenizer, self.max_length)
    
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4,  
            collate_fn=self._collate_fn,
            pin_memory=False
        )
    
    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=4,  
            collate_fn=self._collate_fn,
            pin_memory=False
        )

print("Data module defined successfully!")

📊 Data module defined successfully!


In [10]:
data_module = LLMDataModule(
    train_df=train_df,
    val_df=val_df,
    tokenizer=tokenizer,
    batch_size=config.BATCH_SIZE,
    max_length=config.MAX_LENGTH,
    num_workers=config.NUM_WORKERS
)

# Initialize model
model = LLMClassificationModel(
    model_name=config.MODEL_NAME,
    learning_rate=config.LEARNING_RATE,
    hidden_dim=config.HIDDEN_DIM,
    num_classes=config.NUM_CLASSES,
    dropout_rate=config.DROPOUT_RATE
)

# Set up callbacks
checkpoint_callback = ModelCheckpoint(
    monitor='val_acc',
    dirpath='./checkpoints',
    filename='llm-classification-{epoch:02d}-{val_acc:.2f}',
    save_top_k=3,
    mode='max'
)

# Set up logger
logger = TensorBoardLogger('tb_logs', name='llm_classification')

🤖 Loading BERT model: bert-base-uncased


In [11]:
trainer = pl.Trainer(
    max_epochs=config.MAX_EPOCHS,
    precision='16-mixed',  # Enable mixed precision training
    accelerator='auto',    # Automatically detect GPU/CPU
    devices='auto',        # Use all available devices
    callbacks=[checkpoint_callback],
    logger=logger,
    log_every_n_steps=50,
    val_check_interval=1.0,  # Validate once per epoch
    gradient_clip_val=1.0,   # Gradient clipping
    accumulate_grad_batches=1,
    deterministic=True
)

print("Starting training with PyTorch Lightning...")
print(f"Using device: {trainer.strategy.root_device}")
print(f"Mixed precision: {'Enabled' if trainer.precision == '16-mixed' else 'Disabled'}")
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Validation frequency: Once per epoch")

# Train the model
trainer.fit(model, data_module)

print("Training completed!")
print(f"Best model saved at: {checkpoint_callback.best_model_path}")
print(f"Best validation accuracy: {checkpoint_callback.best_model_score:.4f}")

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..
You are using a CUDA device ('NVIDIA GeForce RTX 4060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
2025-10-14 20:33:28.740553: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered


Starting training with PyTorch Lightning...
Using device: cuda:0
Mixed precision: Enabled
Training samples: 45981
Validation samples: 11496
Validation frequency: Once per epoch


E0000 00:00:1760454208.803371    3572 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760454208.823206    3572 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1760454208.970753    3572 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1760454208.970777    3572 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1760454208.970778    3572 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1760454208.970779    3572 computation_placer.cc:177] computation placer already registered. Please check linka

                                                                           



Epoch 1: 100%|██████████| 2874/2874 [31:34<00:00,  1.52it/s, v_num=0, train_loss_step=0.915, train_acc_step=0.615, val_loss=1.040, val_acc=0.455, train_loss_epoch=1.040, train_acc_epoch=0.461] 

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 2874/2874 [31:35<00:00,  1.52it/s, v_num=0, train_loss_step=0.915, train_acc_step=0.615, val_loss=1.040, val_acc=0.455, train_loss_epoch=1.040, train_acc_epoch=0.461]
Training completed!
Best model saved at: /home/obliviontrek/Documents/llm-finetuning-kaggle/checkpoints/llm-classification-epoch=01-val_acc=0.46.ckpt
Best validation accuracy: 0.4554


In [13]:
def load_test_data(config):
    """Load test data for inference"""
    test_file_path = config.DATA_PATH / 'test.csv'
    if not test_file_path.exists():
        raise FileNotFoundError(f"Test data not found at {test_file_path}")
    
    test_data = pd.read_csv(test_file_path)
    print(f"Test data shape: {test_data.shape}")
    print(f"Test data columns: {test_data.columns.tolist()}")
    
    return test_data

print("Loading test data...")
test_data = load_test_data(config)

print("\nSample test data:")
test_data.head(2)

Loading test data...
Test data shape: (3, 4)
Test data columns: ['id', 'prompt', 'response_a', 'response_b']

Sample test data:


Unnamed: 0,id,prompt,response_a,response_b
0,136060,"[""I have three oranges today, I ate an orange ...","[""You have two oranges today.""]","[""You still have three oranges. Eating an oran..."
1,211333,"[""You are a mediator in a heated political deb...","[""Thank you for sharing the details of the sit...","[""Mr Reddy and Ms Blue both have valid points ..."


In [14]:
def run_inference(test_data, checkpoint_path, tokenizer, config):
    """Run inference on test data"""
    print("Starting inference...")
    
    # Load the best model checkpoint
    best_model = LLMClassificationModel.load_from_checkpoint(checkpoint_path)
    best_model.eval()
    best_model.freeze()
    
    # Create test dataset and dataloader
    test_dataset = TestDataset(test_data, tokenizer, max_length=config.MAX_LENGTH)
    
    def _test_collate_fn(batch):
        """Wrapper for test_collate_fn that can be pickled"""
        return test_collate_fn(batch, tokenizer, max_length=config.MAX_LENGTH)
    
    test_loader = DataLoader(
        test_dataset,
        batch_size=32,
        shuffle=False,
        num_workers=4, 
        collate_fn=_test_collate_fn,
        pin_memory=False
    )
    
    # Perform inference
    all_predictions = []
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            # Move batch to device
            batch = {k: v.to(best_model.device) for k, v in batch.items()}
            
            # Get logits
            logits = best_model(
                batch['prompt_input_ids'], batch['prompt_attention_mask'],
                batch['model_a_input_ids'], batch['model_a_attention_mask'],
                batch['model_b_input_ids'], batch['model_b_attention_mask']
            )
            
            # Apply softmax to get probabilities
            probabilities = F.softmax(logits, dim=-1)
            all_predictions.append(probabilities.cpu())
            
            if (batch_idx + 1) % 10 == 0:
                print(f"Processed {batch_idx + 1}/{len(test_loader)} batches")
    
    # Concatenate all predictions
    all_predictions = torch.cat(all_predictions, dim=0)
    
    # Extract probabilities for each class
    winner_model_a_probs = all_predictions[:, 0].numpy()  # Class 0: model_a wins
    winner_model_b_probs = all_predictions[:, 1].numpy()  # Class 1: model_b wins
    winner_tie_probs = all_predictions[:, 2].numpy()      # Class 2: tie
    
    # Create submission dataframe
    submission = pd.DataFrame({
        'id': test_data['id'],
        'winner_model_a': winner_model_a_probs,
        'winner_model_b': winner_model_b_probs,
        'winner_tie': winner_tie_probs
    })
    
    # Verify probabilities sum to 1
    prob_sums = submission[['winner_model_a', 'winner_model_b', 'winner_tie']].sum(axis=1)
    print(f"Probability sums - Min: {prob_sums.min():.4f}, Max: {prob_sums.max():.4f}, Mean: {prob_sums.mean():.4f}")
    
    print(submission.head())
    
    # Save submission as requested filename
    submission.to_csv('submission.csv', index=False)
    print("Submission file saved as 'submission.csv'")
    
    return submission

# Run inference and create submission
submission = run_inference(test_data, checkpoint_callback.best_model_path, tokenizer, config)

Starting inference...
🤖 Loading BERT model: bert-base-uncased




Probability sums - Min: 1.0000, Max: 1.0000, Mean: 1.0000
        id  winner_model_a  winner_model_b  winner_tie
0   136060        0.376628        0.200569    0.422803
1   211333        0.555219        0.177617    0.267163
2  1233961        0.293212        0.365676    0.341112
Submission file saved as 'submission.csv'
