In [18]:
import torch
from transformers import AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import os
import logging

# --- Configure Logging ---
# Create a logger object.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) # Set the logging level (e.g., INFO, DEBUG, WARNING, ERROR, CRITICAL)

# Create a console handler and set its level to INFO
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)

# Create a file handler, set its level, and specify the log file name
# You can change 'finetuning.log' to any desired log file name.
fh = logging.FileHandler('finetuning.log')
fh.setLevel(logging.INFO)

# Create a formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
fh.setFormatter(formatter)

# Add the handlers to the logger
logger.addHandler(ch)
logger.addHandler(fh)


## 2. Global Configuration Constants

This cell defines all the global constants used throughout the script, such as model name, number of labels, batch size, epochs, and learning rate. **Remember to adjust `DATA_DIR` to your data's location.**

In [19]:
DATA_DIR = "./"
MODEL_NAME = "ProsusAI/finbert"
NUM_LABELS = 3  # Number of output classes for classification (e.g., positive, negative, neutral)
BATCH_SIZE = 16  # Number of samples per batch in DataLoader
EPOCHS = 3  # Number of full passes through the training dataset
LEARNING_RATE = 2e-5  # Learning rate for the AdamW optimizer
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Automatically use GPU if available, else CPU
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Ensures CUDA errors are reported on the exact line for easier debugging

logger.info(f"Using device: {DEVICE}")
logger.info(f"Looking for data files in: {os.path.abspath(DATA_DIR)}")

2025-07-09 21:09:25,484 - __main__ - INFO - Using device: cuda
2025-07-09 21:09:25,484 - __main__ - INFO - Using device: cuda
2025-07-09 21:09:25,484 - __main__ - INFO - Using device: cuda
INFO:__main__:Using device: cuda
2025-07-09 21:09:25,489 - __main__ - INFO - Looking for data files in: /content
2025-07-09 21:09:25,489 - __main__ - INFO - Looking for data files in: /content
2025-07-09 21:09:25,489 - __main__ - INFO - Looking for data files in: /content
INFO:__main__:Looking for data files in: /content


## 3. Data Loading Function

This function handles loading the preprocessed `input_ids`, `attention_mask`, and `labels` tensors. It includes robust error handling for missing files and an assertion to ensure labels are within the expected range, which helps prevent common CUDA errors during training.

In [20]:
# --- Data Loading Function ---
def load_preprocessed_tensors(data_directory, num_labels):
    """
    Loads preprocessed input IDs, attention masks, and labels from specified files.

    Args:
        data_directory (str): The directory path where the tensor files are stored.
        num_labels (int): The expected number of unique labels for validation.

    Returns:
        tuple: A tuple containing (input_ids, attention_mask, labels) tensors.

    Raises:
        FileNotFoundError: If any of the required tensor files are not found.
        AssertionError: If loaded labels are outside the expected range [0, num_labels-1].
    """
    logger.info(f"Loading preprocessed tensors from: {os.path.abspath(data_directory)}...")
    try:
        input_ids = torch.load(os.path.join(data_directory, "tokenized_input_ids.pt"))
        attention_mask = torch.load(os.path.join(data_directory, "tokenized_attention_mask.pt"))
        labels = torch.load(os.path.join(data_directory, "labels.pt"))
    except FileNotFoundError as e:
        logger.error(
            f"Error loading tensor file: {e}. "
            f"Please ensure 'tokenized_input_ids.pt', 'tokenized_attention_mask.pt', "
            f"and 'labels.pt' are located in the specified DATA_DIR: '{os.path.abspath(data_directory)}'."
        )
        raise # Re-raise the exception after logging

    # Validate label range to prevent CUDA device-side assert errors during training.
    logger.info(f"Unique labels loaded: {torch.unique(labels)}")
    if not (labels.min() >= 0 and labels.max() < num_labels):
        logger.error(
            f"Labels loaded for DataLoader are out of range: {torch.unique(labels)}. "
            f"Expected range [0, {num_labels-1}]."
        )
        raise AssertionError("Labels are out of the expected range.")
    logger.info("Preprocessed tensors loaded and labels validated successfully.")
    return input_ids, attention_mask, labels


## 4. DataLoader Creation Function

This function takes the loaded tensors and creates `DataLoader` objects for both the training and validation datasets, handling the data splitting and batching.

In [21]:
# --- DataLoader Creation Function ---
def create_dataloaders(input_ids, attention_mask, labels, batch_size, train_ratio=0.8):
    """
    Creates PyTorch DataLoaders for training and validation datasets.

    Args:
        input_ids (torch.Tensor): Tensor containing input token IDs.
        attention_mask (torch.Tensor): Tensor containing attention masks.
        labels (torch.Tensor): Tensor containing corresponding labels.
        batch_size (int): The batch size for the DataLoaders.
        train_ratio (float): The proportion of data to be used for training.

    Returns:
        tuple: A tuple containing (train_loader, val_loader).
    """
    logger.info("Creating DataLoaders...")
    dataset = TensorDataset(input_ids, attention_mask, labels)

    train_size = int(train_ratio * len(dataset))
    val_size = len(dataset) - train_size

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    logger.info(f"Train dataset size: {len(train_dataset)}, Validation dataset size: {len(val_dataset)}")
    logger.info("DataLoaders created successfully.")
    return train_loader, val_loader


## 5. Model, Optimizer, and Scheduler Setup Function

This function initializes the pre-trained Hugging Face model for sequence classification, sets up the `AdamW` optimizer, and configures a linear learning rate scheduler.

In [22]:
# --- Model, Optimizer, Scheduler Initialization Function ---
def setup_model_optimizer_scheduler(model_name, num_labels, learning_rate, train_loader, epochs, device):
    """
    Initializes the model, optimizer, and learning rate scheduler.

    Args:
        model_name (str): The name of the pre-trained model from Hugging Face.
        num_labels (int): The number of output classes for the model.
        learning_rate (float): The learning rate for the optimizer.
        train_loader (DataLoader): The DataLoader for the training set, used to calculate total training steps.
        epochs (int): The number of training epochs.
        device (torch.device): The device (CPU or GPU) to load the model onto.

    Returns:
        tuple: A tuple containing (model, optimizer, lr_scheduler).
    """
    logger.info("Loading model, optimizer, and scheduler...")
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    num_training_steps = epochs * len(train_loader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )
    logger.info("Model, optimizer, and scheduler initialized successfully.")
    return model, optimizer, lr_scheduler


## 6. Metrics Computation Function

This function defines how evaluation metrics (accuracy and F1-score) are calculated from the model's output logits and the true labels.

In [23]:
# --- Metrics Function ---
def compute_metrics(logits, labels):
    """
    Computes accuracy and weighted F1-score given model logits and true labels.

    Args:
        logits (torch.Tensor): The raw output logits from the model.
        labels (torch.Tensor): The true labels.

    Returns:
        tuple: A tuple containing (accuracy, f1_score).
    """
    preds = logits.argmax(dim=1)
    acc = accuracy_score(labels.cpu(), preds.cpu())
    f1 = f1_score(labels.cpu(), preds.cpu(), average='weighted')
    return acc, f1


## 7. Training and Validation Loop Function

This is the core training function that iterates through epochs, performs forward and backward passes, updates model weights, and evaluates the model's performance on the validation set after each epoch. Progress bars are included for better visualization.

In [24]:
# --- Training and Validation Loop Function ---
def train_and_validate(model, train_loader, val_loader, optimizer, lr_scheduler, epochs, device):
    """
    Executes the training and validation loop for the model.

    Args:
        model (torch.nn.Module): The PyTorch model to train.
        train_loader (DataLoader): DataLoader for the training dataset.
        val_loader (DataLoader): DataLoader for the validation dataset.
        optimizer (torch.optim.Optimizer): The optimizer for model parameters.
        lr_scheduler (torch.optim.lr_scheduler._LRScheduler): The learning rate scheduler.
        epochs (int): The number of training epochs.
        device (torch.device): The device (CPU or GPU) to perform training on.
    """
    logger.info("Starting training and validation loop...")
    for epoch in range(epochs):
        # --- Training Phase ---
        model.train()  # Set model to training mode
        total_loss = 0
        for batch_idx, (input_ids_batch, attention_mask_batch, labels_batch) in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch + 1}")):
            # Move batch data to the specified device
            input_ids_batch = input_ids_batch.to(device)
            attention_mask_batch = attention_mask_batch.to(device)
            labels_batch = labels_batch.to(device)

            # Forward pass: compute model output and loss
            outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch, labels=labels_batch)
            loss = outputs.loss
            total_loss += loss.item()

            # Backward pass: compute gradients and update weights
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()  # Reset gradients for the next iteration

            # Optional: Log batch loss for detailed tracking
            if (batch_idx + 1) % 100 == 0: # Log every 100 batches
                logger.debug(f"Epoch {epoch + 1}, Batch {batch_idx + 1} - Training Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(train_loader)
        logger.info(f"Epoch {epoch + 1} - Average Training Loss: {avg_loss:.4f}")

        # --- Validation Phase ---
        model.eval()  # Set model to evaluation mode
        all_logits = []
        all_labels = []
        with torch.no_grad():  # Disable gradient calculations during validation
            for input_ids_batch, attention_mask_batch, labels_batch in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}"):
                # Move batch data to the specified device
                input_ids_batch = input_ids_batch.to(device)
                attention_mask_batch = attention_mask_batch.to(device)
                labels_batch = labels_batch.to(device)

                # Forward pass: compute model output (no labels for loss in validation pass)
                outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)
                all_logits.append(outputs.logits)
                all_labels.append(labels_batch)

        # Concatenate all collected logits and labels from validation batches
        logits = torch.cat(all_logits)
        labels_all = torch.cat(all_labels)
        # Compute metrics for the validation set
        val_acc, val_f1 = compute_metrics(logits, labels_all)

        logger.info(f"Epoch {epoch + 1} | Validation Accuracy: {val_acc:.4f} | Validation F1-Score: {val_f1:.4f}")

    logger.info("Training and validation complete.")


## 8. Main Execution Block

This block orchestrates the entire fine-tuning process by calling the functions defined above. It also includes error handling for the main execution flow and an optional section to save the fine-tuned model.

In [25]:
# --- Main Execution Block ---
if __name__ == "__main__":
    logger.info(f"Starting fine-tuning script. Current device: {DEVICE}")

    try:
        # 1. Load Data
        input_ids, attention_mask, labels = load_preprocessed_tensors(DATA_DIR, NUM_LABELS)

        # 2. Create DataLoaders
        train_loader, val_loader = create_dataloaders(input_ids, attention_mask, labels, BATCH_SIZE)

        # 3. Setup Model, Optimizer, and Scheduler
        model, optimizer, lr_scheduler = setup_model_optimizer_scheduler(
            MODEL_NAME, NUM_LABELS, LEARNING_RATE, train_loader, EPOCHS, DEVICE
        )

        # 4. Train and Validate Model
        train_and_validate(model, train_loader, val_loader, optimizer, lr_scheduler, EPOCHS, DEVICE)

        # Save the fine-tuned model
        model_save_path = "./finetuned_finbert"
        model.save_pretrained(model_save_path)
        logger.info(f"Fine-tuned model saved to {model_save_path}")

    except Exception as e:
        logger.critical(f"An unhandled error occurred during the fine-tuning process: {e}", exc_info=True)


2025-07-09 21:09:25,535 - __main__ - INFO - Starting fine-tuning script. Current device: cuda
2025-07-09 21:09:25,535 - __main__ - INFO - Starting fine-tuning script. Current device: cuda
2025-07-09 21:09:25,535 - __main__ - INFO - Starting fine-tuning script. Current device: cuda
INFO:__main__:Starting fine-tuning script. Current device: cuda
2025-07-09 21:09:25,539 - __main__ - INFO - Loading preprocessed tensors from: /content...
2025-07-09 21:09:25,539 - __main__ - INFO - Loading preprocessed tensors from: /content...
2025-07-09 21:09:25,539 - __main__ - INFO - Loading preprocessed tensors from: /content...
INFO:__main__:Loading preprocessed tensors from: /content...
2025-07-09 21:09:25,570 - __main__ - INFO - Unique labels loaded: tensor([0, 1, 2])
2025-07-09 21:09:25,570 - __main__ - INFO - Unique labels loaded: tensor([0, 1, 2])
2025-07-09 21:09:25,570 - __main__ - INFO - Unique labels loaded: tensor([0, 1, 2])
INFO:__main__:Unique labels loaded: tensor([0, 1, 2])
2025-07-09 21:

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

2025-07-09 21:09:39,361 - __main__ - INFO - Model, optimizer, and scheduler initialized successfully.
2025-07-09 21:09:39,361 - __main__ - INFO - Model, optimizer, and scheduler initialized successfully.
2025-07-09 21:09:39,361 - __main__ - INFO - Model, optimizer, and scheduler initialized successfully.
INFO:__main__:Model, optimizer, and scheduler initialized successfully.
2025-07-09 21:09:39,364 - __main__ - INFO - Starting training and validation loop...
2025-07-09 21:09:39,364 - __main__ - INFO - Starting training and validation loop...
2025-07-09 21:09:39,364 - __main__ - INFO - Starting training and validation loop...
INFO:__main__:Starting training and validation loop...

Training Epoch 1:   0%|          | 0/114 [00:00<?, ?it/s][A
Training Epoch 1:   1%|          | 1/114 [00:01<02:57,  1.57s/it][A
Training Epoch 1:   2%|▏         | 2/114 [00:02<01:44,  1.07it/s][A
Training Epoch 1:   3%|▎         | 3/114 [00:02<01:15,  1.47it/s][A
Training Epoch 1:   4%|▎         | 4/114 [0