# Enhanced CodeBERT for Swift Code Understanding (Debug Version)

This notebook provides a debug training implementation for the CodeBERT model on Swift code classification. It uses a smaller dataset and more frequent logging to help troubleshoot training issues.

In [None]:
# Install required libraries
!pip install transformers datasets evaluate torch scikit-learn tqdm dropbox requests psutil

In [None]:
# Important: These imports must be properly separated
import os
import json
import torch
import random
import numpy as np
import time
import gc
import re
import collections
import psutil
from tqdm.auto import tqdm
from datasets import load_dataset, ClassLabel
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    RobertaForSequenceClassification,
    Trainer, 
    TrainingArguments,
    set_seed,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    get_scheduler
)

# Import AdamW from torch.optim instead of transformers.optimization
from torch.optim import AdamW
from transformers.trainer_utils import get_last_checkpoint

# Import our custom trainer module
import sys
sys.path.append('..')
from notebooks.codebert_trainer import get_trainer, monitor_resources, cleanup_memory

# Seed through the transformers `set_seed` helper before any data or model work
# so repeated runs of the notebook start from the same random state.
set_seed(42)

In [None]:
# Choose the compute device once and report which one is in use.
cuda_ready = torch.cuda.is_available()
device = torch.device('cuda' if cuda_ready else 'cpu')
if cuda_ready:
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU - Note: Training will be much slower on CPU")

# Seed every random number generator this notebook touches.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if torch.cuda.is_available():
    # Also seed all CUDA devices when a GPU is present.
    torch.cuda.manual_seed_all(SEED)

## Dataset and Model Configuration

Let's define the model and dataset we'll be using with debug settings:

In [None]:
# Dataset configuration
DATASET_ID = "mvasiliniuc/iva-swift-codeint"

# Model configuration for debug training
MODEL_NAME = "microsoft/codebert-base"
MAX_LENGTH = 512  # Maximum number of tokens kept per example
BATCH_SIZE = 4  # Reduced batch size for debugging
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
NUM_EPOCHS = 2  # Reduced number of epochs for debugging
WARMUP_STEPS = 100  # Reduced warmup steps for debugging
GRADIENT_ACCUMULATION_STEPS = 2  # Reduced gradient accumulation for debugging

# Training mode configuration
DEBUG_MODE = True  # Enable debug mode
DEBUG_SAMPLE_SIZE = 10000  # Small sample size for debugging

print(f"Training mode: {'DEBUG' if DEBUG_MODE else 'FULL'}")
if DEBUG_MODE:
    # The sample size only applies when debug sampling is active, so it is
    # not reported for a full run.
    print(f"Debug sample size: {DEBUG_SAMPLE_SIZE} examples")

In [None]:
# Function to load dataset with retry logic
def load_dataset_with_retry(dataset_id, max_retries=3, retry_delay=5):
    """Load a dataset, retrying when the load call itself fails.

    Args:
        dataset_id: Identifier passed straight to ``load_dataset``.
        max_retries: Total number of load attempts; must be at least 1.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        Whatever ``load_dataset`` returns on the first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        Exception: The error from the final attempt is re-raised unchanged
            once every attempt has failed.
    """
    if max_retries < 1:
        # Without this guard the loop never runs and None is returned silently.
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(1, max_retries + 1):
        try:
            print(f"Loading dataset (attempt {attempt}/{max_retries})...")
            # NOTE(review): trust_remote_code=True allows a dataset loading
            # script from the hub to execute locally - confirm it is needed.
            data = load_dataset(dataset_id, trust_remote_code=True)
        except Exception as e:
            print(f"Error loading dataset (attempt {attempt}/{max_retries}): {e}")
            if attempt == max_retries:
                print("Maximum retries reached. Could not load dataset.")
                raise
            print(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
        else:
            # Report success outside the try block so that a missing 'train'
            # split is not mistaken for a transient failure and retried.
            if 'train' in data:
                print(f"Dataset loaded successfully with {len(data['train'])} examples")
            else:
                print("Dataset loaded successfully (no 'train' split found)")
            return data

# Fetch the dataset (with retries) and, in debug mode, shrink the train split.
try:
    print(f"Loading dataset: {DATASET_ID}")
    data = load_dataset_with_retry(DATASET_ID)
    print("Dataset structure:")
    print(data)

    if DEBUG_MODE and 'train' in data:
        print(f"DEBUG MODE: Sampling {DEBUG_SAMPLE_SIZE} examples from dataset")
        # Plain seeded shuffle followed by a prefix selection (the sample is
        # random, not stratified); never ask for more rows than exist.
        debug_row_count = min(DEBUG_SAMPLE_SIZE, len(data['train']))
        data['train'] = data['train'].shuffle(seed=42).select(range(debug_row_count))
        print(f"Reduced dataset size: {len(data['train'])} examples")

except Exception as load_error:
    # Surface the failure in the cell output, then let it propagate.
    print(f"Fatal error loading dataset: {load_error}")
    raise

In [None]:
# Verify dataset structure and column names
def verify_dataset_structure(dataset):
    """Check that ``dataset`` has a 'train' split with the columns used later.

    Args:
        dataset: Mapping of split name to a split object exposing
            ``column_names``.

    Returns:
        bool: True when the 'train' split exists and contains 'repo_name',
        'path' and 'content'; False otherwise (a warning is printed).
    """
    if 'train' not in dataset:
        print("WARNING: Dataset does not have a 'train' split.")
        return False

    available = dataset['train'].column_names
    missing_columns = []
    for expected in ('repo_name', 'path', 'content'):
        if expected not in available:
            missing_columns.append(expected)

    if missing_columns:
        print(f"WARNING: Dataset is missing required columns: {missing_columns}")
        return False

    print("Dataset structure verification passed.")
    return True

# Run the structure check on the loaded dataset. A failed check only prints a
# notice here; execution continues with the following cells.
dataset_valid = verify_dataset_structure(data)
if not dataset_valid:
    print("Dataset structure is not as expected. Proceeding with caution.")

In [None]:
# Load the tokenizer that matches MODEL_NAME and report basic facts about it.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    vocabulary_size = len(tokenizer)
    print(f"Tokenizer vocabulary size: {vocabulary_size}")
    tokenizer_kind = tokenizer.__class__.__name__
    print(f"Tokenizer type: {tokenizer_kind}")
except Exception as tokenizer_error:
    # Report the problem in the cell output, then re-raise it.
    print(f"Error loading tokenizer: {tokenizer_error}")
    raise

In [None]:
def extract_file_type(path):
    """
    Derive a coarse category label from a Swift file path.

    The lower-cased path is checked for keyword substrings, one category at a
    time in the order listed below; the first category with a match wins.

        0 Models         'model', 'struct', 'entity', or both 'data' and 'class'
        1 Views          'view', 'ui', 'screen', 'page'
        2 Controllers    'controller', 'manager', 'coordinator', 'service'
        3 Utilities      'util', 'helper', 'extension', 'common'
        4 Tests          'test', 'spec', 'mock'
        5 Configuration  'package.swift', 'config', 'settings', 'info.plist'

    Paths that match nothing fall back to category 3 (Utilities).

    Args:
        path (str): The file path

    Returns:
        int: The category label (0-5)
    """
    path_lower = path.lower()

    def mentions(*keywords):
        """Return True when any keyword occurs as a substring of the path."""
        return any(keyword in path_lower for keyword in keywords)

    # NOTE(review): matching is by raw substring, so short keywords are loose
    # ('ui' also matches 'build'), and category order decides ties: a path
    # such as 'Tests/ModelTests.swift' is labelled Models, not Tests.

    # Category 0: Models - Data structures and model definitions
    if mentions('model', 'struct', 'entity') or ('data' in path_lower and 'class' in path_lower):
        return 0

    # Category 1: Views - UI related files. Anything containing 'view'
    # (including view controllers) lands here.
    if mentions('view', 'ui', 'screen', 'page'):
        return 1

    # Category 2: Controllers - Application logic
    if mentions('controller', 'manager', 'coordinator', 'service'):
        return 2

    # Category 3: Utilities - Helper functions and extensions
    if mentions('util', 'helper', 'extension', 'common'):
        return 3

    # Category 4: Tests - Test files
    if mentions('test', 'spec', 'mock'):
        return 4

    # Category 5: Configuration - Package and configuration files
    if mentions('package.swift', 'config', 'settings', 'info.plist'):
        return 5

    # Default to category 3 (Utilities) if no clear category is found
    return 3

# Apply the function to create labels
try:
    # Add a 'label' column derived from each file path. map() merges the
    # returned dict into the row, so only the new column has to be returned.
    labeled_data = data['train'].map(
        lambda example: {'label': extract_file_type(example['path'])}
    )

    # Count the distribution of labels
    label_counts = collections.Counter(labeled_data['label'])

    # Define category names for better readability
    category_names = {
        0: "Models",
        1: "Views",
        2: "Controllers",
        3: "Utilities",
        4: "Tests",
        5: "Configuration"
    }

    print("Label distribution:")
    for label, count in sorted(label_counts.items()):
        category_name = category_names.get(label, f"Unknown-{label}")
        print(f"Label {label} ({category_name}): {count} examples ({count/len(labeled_data)*100:.2f}%)")

    # Labels that actually occur in this sample
    unique_labels = sorted(label_counts.keys())
    print(f"\nTotal unique labels: {len(unique_labels)}")

    # Size the classifier by the fixed category set, not by the labels that
    # happen to occur. Label ids are used directly as class indices, so an
    # absent category (e.g. ids {0, 1, 2, 3, 5}) would otherwise produce a
    # head with too few outputs for the largest id.
    num_labels = len(category_names)
    print(f"Model output classes: {num_labels}")
except Exception as e:
    print(f"Error in data preparation: {e}")
    raise

In [None]:
# Carve the labelled data into train / validation / test partitions.
try:
    # Seeded shuffle so the partition boundaries are repeatable.
    shuffled_data = labeled_data.shuffle(seed=42)
    n_rows = len(shuffled_data)

    # 80% train, 10% validation; the remainder becomes the test set.
    train_size = int(0.8 * n_rows)
    val_size = int(0.1 * n_rows)
    val_end = train_size + val_size

    train_data = shuffled_data.select(range(train_size))
    val_data = shuffled_data.select(range(train_size, val_end))
    test_data = shuffled_data.select(range(val_end, n_rows))

    # Report the size and label mix of each partition.
    train_label_counts = collections.Counter(train_data['label'])
    print(f"Training set size: {len(train_data)}")
    print(f"Training set label distribution: {train_label_counts}")
    val_label_counts = collections.Counter(val_data['label'])
    print(f"Validation set size: {len(val_data)}")
    print(f"Validation set label distribution: {val_label_counts}")
    test_label_counts = collections.Counter(test_data['label'])
    print(f"Test set size: {len(test_data)}")
    print(f"Test set label distribution: {test_label_counts}")
except Exception as split_error:
    print(f"Error splitting data: {split_error}")
    raise

In [None]:
# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of source files, truncating each to MAX_LENGTH tokens.

    Args:
        examples: Batch mapping with a 'content' entry holding the code
            strings (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the batch, as plain Python lists.
    """
    # No padding and no tensor conversion here: sequences keep their own
    # length so padding can happen per batch at collation time, and
    # Dataset.map stores plain lists regardless of the return type.
    return tokenizer(
        examples['content'],
        truncation=True,
        max_length=MAX_LENGTH
    )

try:
    def _tokenize_split(split):
        """Tokenize one split, keeping only the tokenizer output and 'label'."""
        # Drop every raw column except the label, not just a fixed list, so
        # leftover metadata columns never reach the training batches.
        raw_columns = [name for name in split.column_names if name != 'label']
        return split.map(
            tokenize_function,
            batched=True,
            remove_columns=raw_columns
        )

    # Apply tokenization to each split
    tokenized_train_data = _tokenize_split(train_data)
    tokenized_val_data = _tokenize_split(val_data)
    tokenized_test_data = _tokenize_split(test_data)

    print(f"Tokenized {len(tokenized_train_data)} training examples")
    print(f"Tokenized {len(tokenized_val_data)} validation examples")
    print(f"Tokenized {len(tokenized_test_data)} test examples")

    # Set the format for PyTorch
    for tokenized_split in (tokenized_train_data, tokenized_val_data, tokenized_test_data):
        tokenized_split.set_format("torch")

    print("Data tokenization complete")
except Exception as e:
    print(f"Error tokenizing data: {e}")
    raise

In [None]:
# Instantiate the sequence-classification model and report its footprint.
try:
    # One output per label id, trained as single-label classification.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
        problem_type="single_label_classification"
    )

    # Place the weights on the device selected earlier.
    model.to(device)

    model_kind = model.__class__.__name__
    print(f"Model loaded with {num_labels} output classes")
    print(f"Model type: {model_kind}")

    # Total element count across all parameter tensors.
    model_size = 0
    for parameter in model.parameters():
        model_size += parameter.numel()
    print(f"Model has {model_size:,} parameters")

    # Resident memory of this Python process, reported in MB.
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()
    print(f"Current memory usage: {memory_info.rss / 1024 / 1024:.2f} MB")
except Exception as model_error:
    print(f"Error loading model: {model_error}")
    raise

In [None]:
# Compute class weights to handle imbalanced data
try:
    # Extract labels for computing class weights
    labels = np.asarray(train_data['label'])
    if labels.size == 0:
        raise ValueError("Cannot compute class weights: the training split is empty")

    present_classes = np.unique(labels)
    largest_label = int(present_classes.max())
    if largest_label >= num_labels:
        raise ValueError(
            f"Label id {largest_label} is out of range for {num_labels} output classes"
        )

    # Balanced weights can only be computed for classes that occur in the
    # training split.
    present_weights = compute_class_weight('balanced', classes=present_classes, y=labels)

    # Build a vector with one entry per output class, indexed by class id, so
    # its length always matches the classifier. Classes absent from the
    # training split keep a neutral weight of 1.0.
    # NOTE(review): assumes get_trainer uses this as a per-class loss weight - confirm.
    weight_by_class = np.ones(num_labels, dtype=np.float64)
    weight_by_class[present_classes] = present_weights
    class_weights = torch.tensor(weight_by_class, dtype=torch.float).to(device)

    print("Class weights:")
    for i, weight in enumerate(class_weights):
        # Index i is the class id because the vector covers every class.
        category_name = category_names.get(i, f"Unknown-{i}")
        print(f"Class {i} ({category_name}): {weight:.4f}")
except Exception as e:
    print(f"Error computing class weights: {e}")
    raise

In [None]:
# Debug-oriented training configuration: evaluate, checkpoint and log often.
debug_training_kwargs = {
    "output_dir": "./debug_results",
    "num_train_epochs": NUM_EPOCHS,
    "per_device_train_batch_size": BATCH_SIZE,
    "per_device_eval_batch_size": BATCH_SIZE*2,
    "warmup_steps": WARMUP_STEPS,
    "weight_decay": WEIGHT_DECAY,
    "learning_rate": LEARNING_RATE,
    "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
    # Evaluate and checkpoint on the same 50-step cadence.
    "eval_strategy": "steps",
    "eval_steps": 50,
    "save_strategy": "steps",
    "save_steps": 50,
    # Best checkpoint is selected by the 'f1' value from compute_metrics.
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
    "greater_is_better": True,
    "logging_dir": "./debug_logs",
    "logging_steps": 10,
    "save_total_limit": 2,
    # Mixed precision only when a CUDA device is available.
    "fp16": torch.cuda.is_available(),
    "report_to": "none",
    # Debug options: visible progress bars, single-process data loading,
    # no pinned memory.
    "disable_tqdm": False,
    "dataloader_num_workers": 0,
    "dataloader_pin_memory": False,
}
training_args = TrainingArguments(**debug_training_kwargs)
print("Using debug training arguments with more frequent evaluation and logging")

# Early stopping configured with a patience of 3 and a threshold of 0.01.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_threshold=0.01,
    early_stopping_patience=3
)

# Define compute_metrics function for evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted precision / recall / F1 for an evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (class scores, or a tuple whose
            first element holds them) and ``label_ids`` (true class ids).

    Returns:
        dict: Keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    scores = pred.predictions
    if isinstance(scores, tuple):
        # Some models return extra outputs next to the class scores; the
        # scores are assumed to come first - TODO confirm for the model used.
        scores = scores[0]

    labels = pred.label_ids
    preds = scores.argmax(-1)

    # zero_division=0 yields the same values as the default but without a
    # warning on every evaluation when a class is never predicted, which is
    # common on a small debug evaluation set.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Create data collator for padding: each batch is padded to the length of its
# longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')

# Create the debug trainer through the project helper imported from
# notebooks.codebert_trainer.
# NOTE(review): how get_trainer uses debug_mode and class_weights is not
# visible in this notebook - presumably it selects a more verbose trainer and
# applies the weights in the loss; confirm against that module.
trainer = get_trainer(
    debug_mode=DEBUG_MODE,  # This should be True for debug mode
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)

print("Debug training setup complete")

In [None]:
# Run training with resource monitoring: resources are reported before the
# run and again afterwards (or after a failure).
try:
    print("Starting DEBUG training...")
    
    # Monitor resources before training
    print("Resources before training:")
    monitor_resources()
    
    # Record the wall-clock start time. No timeout is enforced here, and this
    # value is not read again within this cell.
    start_time = time.time()
    
    # Run training
    train_result = trainer.train()
    
    # Monitor resources after training
    print("Resources after training:")
    monitor_resources()
    
    # Print training results (runtime and loss as reported in the train metrics)
    print(f"Training completed in {train_result.metrics['train_runtime']:.2f} seconds")
    print(f"Training loss: {train_result.metrics['train_loss']:.4f}")
    
    # Save the model
    trainer.save_model("./debug_model")
    print("Model saved to ./debug_model")
    
    # Clean up memory (only reached on success; the error path below skips it)
    cleanup_memory()
    
except Exception as e:
    print(f"Error during training: {e}")
    
    # Print stack trace for debugging
    import traceback
    traceback.print_exc()
    
    # Monitor resources after error
    print("Resources after error:")
    monitor_resources()
    
    # Re-raise so the cell still fails visibly after the diagnostics
    raise

In [None]:
# Score the trained model on the held-out test split and list every metric.
try:
    print("Evaluating model on test set...")
    test_results = trainer.evaluate(tokenized_test_data)

    print("Test results:")
    for metric_name in test_results:
        value = test_results[metric_name]
        print(f"{metric_name}: {value:.4f}")

except Exception as evaluation_error:
    print(f"Error during evaluation: {evaluation_error}")
    raise