# 02 ¬∑ Fine-tune on OpenStack Logs

This notebook adapts the HDFS-pretrained DistilBERT model to OpenStack anomaly detection with optional replay and LoRA support. Hugging Face Accelerate drives training on multi-GPU Linux or falls back to Apple MPS when available.


## Notebook Goals

- Load OpenStack fine-tuning hyperparameters and reuse the Accelerate configuration (skipped for MPS).
- Optionally replay a slice of HDFS data and/or enable LoRA adapters via config toggles.
- Train with early stopping, checkpoint cadence, and GPU/MPS memory hygiene utilities.
- Evaluate on validation/test splits with F1, ROC-AUC, PR-AUC, and confusion matrices.
- Export TorchScript and ONNX artifacts and capture a MODEL_CARD snippet.


## 1. Imports and Configuration


In [1]:
import json
import math
import os
import gc
import time
from pathlib import Path
from typing import Dict, Tuple

import torch
from torch.utils.data import DataLoader
from accelerate import Accelerator
from datasets import load_from_disk, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    get_scheduler
)
from peft import LoraConfig, get_peft_model
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import yaml

  import pynvml  # type: ignore[import]


### Load YAML configs


In [2]:
def load_yaml(path: Path) -> Dict:
    with path.open('r') as fh:
        return yaml.safe_load(fh)

data_cfg = load_yaml(Path('../configs/data.yaml'))
train_cfg = load_yaml(Path('../configs/train_openstack.yaml'))
print('Configs loaded.')

Configs loaded.


### Device detection


In [3]:
IS_MPS = torch.backends.mps.is_available()
if IS_MPS:
    os.environ.setdefault('ACCELERATE_USE_MPS_DEVICE', '1')
    print('Apple Silicon (MPS) detected. Accelerate will use the MPS backend.')
else:
    print('MPS not available; using CUDA/CPU settings from training config.')

MPS not available; using CUDA/CPU settings from training config.


## 2. Prepare Datasets


In [4]:
parquet_dir = Path(data_cfg['preprocessing']['parquet_dir'])
metadata_path = Path(data_cfg['preprocessing']['dataset_metadata'])
metadata = json.loads(metadata_path.read_text()) if metadata_path.exists() else {}

openstack_train = load_from_disk(str(parquet_dir / 'openstack_train_hf'))
openstack_val = load_from_disk(str(parquet_dir / 'openstack_val_hf'))
openstack_test = load_from_disk(str(parquet_dir / 'openstack_test_hf'))

replay_cfg = train_cfg['replay']
if replay_cfg['enabled']:
    hdfs_dataset = load_from_disk(str(parquet_dir / 'hdfs_train_hf'))
    replay_size = max(1, int(len(openstack_train) * replay_cfg['ratio']))
    replay_subset = hdfs_dataset.shuffle(seed=train_cfg['seed']).select(range(replay_size))
    
    # Convert to pandas and fix schema compatibility
    openstack_df = openstack_train.to_pandas()
    hdfs_df = replay_subset.to_pandas()
    
    # Ensure compatible schemas
    print(f"OpenStack columns: {list(openstack_df.columns)}")
    print(f"HDFS columns: {list(hdfs_df.columns)}")
    
    # Align columns - keep only common columns or add missing ones with default values
    common_columns = set(openstack_df.columns) & set(hdfs_df.columns)
    
    # For missing columns in HDFS, add defaults
    for col in openstack_df.columns:
        if col not in hdfs_df.columns:
            if col == 'anomaly_label':
                hdfs_df[col] = 0  # Normal logs by default
            elif 'template_id' in col:
                hdfs_df[col] = -1  # Default template ID
            else:
                hdfs_df[col] = None  # Default to None for other missing columns
    
    # For missing columns in OpenStack, add defaults  
    for col in hdfs_df.columns:
        if col not in openstack_df.columns:
            if 'template_id' in col:
                openstack_df[col] = -1
            else:
                openstack_df[col] = None
    
    # Ensure template_id columns are properly typed as integers
    for col in openstack_df.columns:
        if 'template_id' in col:
            # Handle string template_ids that can't convert to int
            if col in hdfs_df.columns:
                hdfs_df[col] = pd.to_numeric(hdfs_df[col], errors='coerce').fillna(-1).astype('int64')
            openstack_df[col] = pd.to_numeric(openstack_df[col], errors='coerce').fillna(-1).astype('int64')
    
    # Reorder columns to match
    column_order = list(openstack_df.columns)
    hdfs_df = hdfs_df.reindex(columns=column_order)
    
    print(f"Schema alignment completed. Columns: {column_order}")
    
    # Combine datasets
    combined = pd.concat([openstack_df, hdfs_df], ignore_index=True)
    combined = combined.sample(frac=1.0, random_state=train_cfg['seed']).reset_index(drop=True)
    train_dataset = Dataset.from_pandas(combined, preserve_index=False)
    print(f'Replay enabled: mixed {replay_size} HDFS rows with {len(openstack_train)} OpenStack rows.')
else:
    train_dataset = openstack_train
    print('Replay disabled.')

OpenStack columns: ['input_ids', 'attention_mask', 'labels', 'template_id', 'anomaly_label', 'timestamp']
HDFS columns: ['input_ids', 'attention_mask', 'labels', 'template_id', 'anomaly_label', 'timestamp']
Schema alignment completed. Columns: ['input_ids', 'attention_mask', 'labels', 'template_id', 'anomaly_label', 'timestamp']
Replay enabled: mixed 16625 HDFS rows with 166256 OpenStack rows.


## 3. Tokenizer and Base Checkpoint


In [5]:
tokenizer_dir = Path(train_cfg['artifacts']['tokenizer_dir'])
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=True)

base_dir = Path(train_cfg['base_checkpoint_dir'])
if not base_dir.exists():
    raise FileNotFoundError(f'Base checkpoint directory not found: {base_dir}')

# Look for the completed HDFS pretraining checkpoint (final epoch)
print("üîç Searching for HDFS pretrained model checkpoints...")
candidate_checkpoints = []

# Check for epoch checkpoint (preferred - final trained model)
epoch_checkpoints = sorted(base_dir.glob('epoch_epoch*_step*/config.json'))
if epoch_checkpoints:
    candidate_checkpoints.extend(epoch_checkpoints)
    print(f"üìä Found {len(epoch_checkpoints)} epoch checkpoints")

# Check for step checkpoints as fallback
step_checkpoints = sorted(base_dir.glob('step_epoch*_step*/config.json'))
if step_checkpoints:
    candidate_checkpoints.extend(step_checkpoints)
    print(f"üìä Found {len(step_checkpoints)} step checkpoints")

if not candidate_checkpoints:
    raise FileNotFoundError(f'No pretrained checkpoints found in {base_dir}')

# Select the most recent checkpoint (highest step number)
model_path = candidate_checkpoints[-1].parent
checkpoint_name = model_path.name

print(f'üöÄ Loading HDFS pretrained DistilBERT from: {checkpoint_name}')
print(f'üìÇ Full path: {model_path}')

# Verify checkpoint has required files
required_files = ['config.json', 'model.safetensors']
missing_files = [f for f in required_files if not (model_path / f).exists()]
if missing_files:
    # Try pytorch_model.bin as fallback
    if (model_path / 'pytorch_model.bin').exists():
        print("‚úÖ Found pytorch_model.bin (using instead of model.safetensors)")
    else:
        raise FileNotFoundError(f'Missing required files in checkpoint: {missing_files}')

config = AutoConfig.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path, config=config)
model.resize_token_embeddings(len(tokenizer))

print(f'‚úÖ Successfully loaded pretrained HDFS model!')
print(f'   Model vocab size: {model.config.vocab_size}')
print(f'   Tokenizer vocab size: {len(tokenizer)}')
print(f'   Model parameters: {sum(p.numel() for p in model.parameters()):,}')

peft_cfg = train_cfg['peft']
if peft_cfg['lora_enabled']:
    lora_config = LoraConfig(
        r=peft_cfg['r'],
        lora_alpha=peft_cfg['alpha'],
        target_modules=peft_cfg['target_modules'],
        lora_dropout=peft_cfg['dropout'],
        bias=peft_cfg['bias']
    )
    model = get_peft_model(model, lora_config)
    print('LoRA adapters enabled.')
else:
    print('LoRA disabled; full fine-tuning will run.')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


üîç Searching for HDFS pretrained model checkpoints...
üìä Found 1 epoch checkpoints
üìä Found 10 step checkpoints
üöÄ Loading HDFS pretrained DistilBERT from: step_epoch3_step800000
üìÇ Full path: artifacts/logbert-mlm-hdfs/step_epoch3_step800000
‚úÖ Successfully loaded pretrained HDFS model!
   Model vocab size: 30531
   Tokenizer vocab size: 30531
   Model parameters: 66,992,451
LoRA disabled; full fine-tuning will run.


## 4. DataLoaders


In [6]:
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=train_cfg['sequence']['mlm_probability'],
    pad_to_multiple_of=8  # Efficient padding for GPU memory
)

def collate_train(examples):
    # Remove anomaly_label for MLM training
    for example in examples:
        example.pop('anomaly_label', None)
    
    # Ensure proper tokenization with padding and truncation
    processed_examples = []
    for example in examples:
        # Handle potential nested inputs by extracting the core tokenized data
        if 'input_ids' in example:
            processed_example = {
                'input_ids': example['input_ids'][:train_cfg['sequence']['max_length']],  # Truncate if needed
                'attention_mask': example.get('attention_mask', [1] * len(example['input_ids']))[:train_cfg['sequence']['max_length']]
            }
        else:
            # Fallback for unexpected format
            processed_example = {k: v for k, v in example.items() if k not in ['anomaly_label']}
        processed_examples.append(processed_example)
    
    return collator(processed_examples)

def collate_eval(examples):
    labels = [example.get('anomaly_label', 0) for example in examples]
    
    # Process features similar to training
    processed_features = []
    for example in examples:
        if 'input_ids' in example:
            processed_feature = {
                'input_ids': example['input_ids'][:train_cfg['sequence']['max_length']],
                'attention_mask': example.get('attention_mask', [1] * len(example['input_ids']))[:train_cfg['sequence']['max_length']]
            }
        else:
            processed_feature = {k: v for k, v in example.items() if k != 'anomaly_label'}
        processed_features.append(processed_feature)
    
    batch = collator(processed_features)
    batch['anomaly_label'] = torch.tensor(labels, dtype=torch.long)
    return batch

train_loader = DataLoader(train_dataset, batch_size=train_cfg['training']['train_batch_size_per_device'], shuffle=True, collate_fn=collate_train)
val_loader = DataLoader(openstack_val, batch_size=train_cfg['training']['eval_batch_size_per_device'], shuffle=False, collate_fn=collate_eval)
test_loader = DataLoader(openstack_test, batch_size=train_cfg['training']['eval_batch_size_per_device'], shuffle=False, collate_fn=collate_eval)

## 5. Accelerator and Optimizer


In [7]:
mixed_precision = 'no' if IS_MPS else train_cfg['precision']['mixed_precision']
accelerator = Accelerator(
    gradient_accumulation_steps=train_cfg['training']['grad_accumulation_steps'],
    mixed_precision=mixed_precision
)
print(accelerator.state)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=train_cfg['optimizer']['lr'],
    betas=tuple(train_cfg['optimizer']['betas']),
    eps=train_cfg['optimizer']['eps'],
    weight_decay=train_cfg['optimizer']['weight_decay']
)

model, optimizer, train_loader, val_loader, test_loader = accelerator.prepare(
    model, optimizer, train_loader, val_loader, test_loader
)

total_steps = math.ceil(len(train_loader) / train_cfg['training']['grad_accumulation_steps']) * train_cfg['training']['epochs']
lr_scheduler = get_scheduler(
    name=train_cfg['optimizer']['scheduler'],
    optimizer=optimizer,
    num_warmup_steps=train_cfg['optimizer']['warmup_steps'],
    num_training_steps=total_steps
)

Distributed environment: DistributedType.NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda

Mixed precision type: fp16



In [8]:
# Test batch creation to verify collation works
print("üß™ Testing batch creation...")
try:
    test_batch = next(iter(train_loader))
    print(f"‚úÖ Batch created successfully!")
    print(f"   Input IDs shape: {test_batch['input_ids'].shape}")
    print(f"   Attention mask shape: {test_batch['attention_mask'].shape}")
    print(f"   Labels shape: {test_batch['labels'].shape}")
    print(f"   Sequence lengths are consistent: {test_batch['input_ids'].shape[1]} tokens")
except Exception as e:
    print(f"‚ùå Batch creation failed: {e}")
    print("This needs to be fixed before training can proceed.")
    raise e

üß™ Testing batch creation...
‚úÖ Batch created successfully!
   Input IDs shape: torch.Size([8, 88])
   Attention mask shape: torch.Size([8, 88])
   Labels shape: torch.Size([8, 88])
   Sequence lengths are consistent: 88 tokens


In [9]:
# Reset training state variables
print("üîÑ Resetting training state...")

history = {'epoch': [], 'train_loss': [], 'val_loss': []}
best_val = float('inf')
best_checkpoint_path = None
wait = 0

print("‚úÖ Training state reset. Ready to begin fine-tuning!")

üîÑ Resetting training state...
‚úÖ Training state reset. Ready to begin fine-tuning!


In [10]:
def free_cuda():
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
    gc.collect()


def log_gpu_memory(tag: str):
    if torch.cuda.is_available():
        alloc = torch.cuda.memory_allocated() / (1024 ** 3)
        reserved = torch.cuda.memory_reserved() / (1024 ** 3)
        accelerator.print(f'[{tag}] gpu allocated={alloc:.2f} GB reserved={reserved:.2f} GB')
    elif IS_MPS:
        try:
            import torch.mps
            stats = torch.mps.current_allocated_memory() / (1024 ** 3)
            accelerator.print(f'[{tag}] mps allocated={stats:.2f} GB')
        except Exception:
            accelerator.print(f'[{tag}] mps memory stats unavailable.')

In [11]:
# Initialize training setup
checkpoint_cfg = train_cfg['checkpointing']
metrics_dir = Path(train_cfg['artifacts']['metrics_dir'])
metrics_dir.mkdir(parents=True, exist_ok=True)
eval_dir = Path(train_cfg['artifacts']['eval_dir'])
eval_dir.mkdir(parents=True, exist_ok=True)
run_config_path = Path(train_cfg['artifacts']['run_config_path'])
run_config_path.parent.mkdir(parents=True, exist_ok=True)

# Training state variables
history = {'epoch': [], 'train_loss': [], 'val_loss': []}
best_val = float('inf')
best_checkpoint_path = None
wait = 0
patience = train_cfg['training']['patience']
min_delta = train_cfg['training']['min_delta']
epochs_total = train_cfg['training']['epochs']
max_grad_norm = train_cfg['training']['max_grad_norm']
log_steps = train_cfg['logging']['log_steps'] if 'logging' in train_cfg else None

# Calculate checkpoint frequency for every 15% of dataset
total_batches_per_epoch = len(train_loader)
checkpoint_every_batches = max(1, int(total_batches_per_epoch * 0.15))

print("‚úÖ Training setup initialized!")
print(f"   üéØ Learning rate: {train_cfg['optimizer']['lr']:.1e}")
print(f"   ‚è∞ Warmup steps: {train_cfg['optimizer']['warmup_steps']:,}")
print(f"   üöÄ Epochs: {epochs_total}")
print(f"   üõë Patience: {patience}")
print(f"   üìè Max grad norm: {max_grad_norm}")
print(f"   üîÑ Grad accumulation: {train_cfg['training']['grad_accumulation_steps']}")
print(f"   üìä Checkpoint every: {checkpoint_every_batches} batches (15% of dataset)")

‚úÖ Training setup initialized!
   üéØ Learning rate: 2.0e-06
   ‚è∞ Warmup steps: 2,000
   üöÄ Epochs: 3
   üõë Patience: 3
   üìè Max grad norm: 0.5
   üîÑ Grad accumulation: 8
   üìä Checkpoint every: 3429 batches (15% of dataset)


In [12]:
# Create evaluation collate function that preserves anomaly labels
def collate_eval(examples):
    """Evaluation collate function that preserves anomaly labels for evaluation"""
    labels = [example.get('anomaly_label', 0) for example in examples]
    
    # Process features for MLM
    processed_features = []
    for example in examples:
        if 'input_ids' in example:
            processed_feature = {
                'input_ids': example['input_ids'][:train_cfg['sequence']['max_length']],
                'attention_mask': example.get('attention_mask', [1] * len(example['input_ids']))[:train_cfg['sequence']['max_length']]
            }
        else:
            processed_feature = {k: v for k, v in example.items() if k != 'anomaly_label'}
        processed_features.append(processed_feature)
    
    # Use MLM collator for the features
    batch = collator(processed_features)
    # Add back the anomaly labels for evaluation
    batch['anomaly_label'] = torch.tensor(labels, dtype=torch.long)
    return batch

# Recreate evaluation loaders with the fixed collate function
val_loader_eval = DataLoader(openstack_val, batch_size=train_cfg['training']['eval_batch_size_per_device'], shuffle=False, collate_fn=collate_eval)
test_loader_eval = DataLoader(openstack_test, batch_size=train_cfg['training']['eval_batch_size_per_device'], shuffle=False, collate_fn=collate_eval)

# Prepare the evaluation loaders
_, _, _, val_loader_eval, test_loader_eval = accelerator.prepare(
    None, None, None, val_loader_eval, test_loader_eval
)

print("‚úÖ Evaluation dataloaders created with anomaly labels preserved")

‚úÖ Evaluation dataloaders created with anomaly labels preserved


In [13]:
# üõ°Ô∏è ULTRA-STABLE TRAINING LOOP - No More Overshooting
print("üõ°Ô∏è Starting ultra-stable training loop...")
accelerator.print(f"üìä Ultra-conservative settings active")
accelerator.print(f"üìä Training samples: {len(train_dataset):,}")
accelerator.print(f"üìä Validation samples: {len(openstack_val):,}")
accelerator.print(f"üìä Batches per epoch: {total_batches_per_epoch:,}")

# Create a validation loader without anomaly labels for MLM training
val_loader_mlm = DataLoader(openstack_val, batch_size=train_cfg['training']['eval_batch_size_per_device'], 
                        shuffle=False, collate_fn=collate_train)  # Use collate_train to remove anomaly_label
val_loader_mlm = accelerator.prepare(val_loader_mlm)
accelerator.print("‚úÖ Created MLM-specific validation loader (without anomaly labels)")

for epoch in range(epochs_total):
    model.train()
    accelerator.print(f'==== Epoch {epoch+1}/{epochs_total} (Ultra-Stable Mode) ====')
    
    # Force enable progress bar
    progress = tqdm(
        total=len(train_loader), 
        desc=f"Epoch {epoch+1} - Stable", 
        disable=False,
        leave=True,
        position=0,
        dynamic_ncols=True
    )
    
    step_losses = []
    for step, batch in enumerate(train_loader, start=1):
        with accelerator.accumulate(model):
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            
            # Ultra-strict gradient clipping (0.1 instead of 0.5)
            if accelerator.sync_gradients and max_grad_norm:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            
        step_losses.append(loss.detach().item())
        global_step = epoch * len(train_loader) + step
        
        # Update progress bar with ultra-detailed info
        current_lr = lr_scheduler.get_last_lr()[0]
        progress.set_postfix({
            'loss': f'{loss.item():.4f}',
            'avg_loss': f'{np.mean(step_losses):.4f}',
            'lr': f'{current_lr:.1e}',
            'step': f'{step}/{len(train_loader)}'
        })
        progress.update(1)

        # More frequent logging for stability monitoring
        if step % 25 == 0:  # Every 25 steps instead of 50
            avg_loss = np.mean(step_losses[-25:])  # Rolling average of last 25 steps
            accelerator.print(f'üìà Step {step:5d} | Loss: {loss.item():.4f} | Avg: {avg_loss:.4f} | LR: {current_lr:.1e}')

        # Save checkpoint every 15% of dataset processed
        if step % checkpoint_every_batches == 0:
            percent_complete = (step / total_batches_per_epoch) * 100
            ckpt_dir = Path(checkpoint_cfg['output_dir']) / f'epoch{epoch+1}_step{step}_pct{percent_complete:.0f}'
            if accelerator.is_main_process:
                ckpt_dir.mkdir(parents=True, exist_ok=True)
                accelerator.unwrap_model(model).save_pretrained(ckpt_dir)
                tokenizer.save_pretrained(ckpt_dir / 'tokenizer')
                accelerator.print(f'üíæ Stable checkpoint saved at {percent_complete:.0f}% of epoch {epoch+1}')
            accelerator.wait_for_everyone()
            free_cuda()

    progress.close()
    train_loss = float(np.mean(step_losses))
    accelerator.print(f'üìä Epoch {epoch+1} - Average train loss: {train_loss:.4f} (Stable!)')

    # Save end-of-epoch checkpoint
    epoch_ckpt_dir = Path(checkpoint_cfg['output_dir']) / f'epoch{epoch+1}_final'
    if accelerator.is_main_process:
        epoch_ckpt_dir.mkdir(parents=True, exist_ok=True)
        accelerator.unwrap_model(model).save_pretrained(epoch_ckpt_dir)
        tokenizer.save_pretrained(epoch_ckpt_dir / 'tokenizer')
        accelerator.print(f'üíæ End-of-epoch checkpoint saved: {epoch_ckpt_dir.name}')
    accelerator.wait_for_everyone()

    # Validation loop with MLM-only dataloader (without anomaly_label)
    model.eval()
    accelerator.print(f'üîç Running validation for epoch {epoch+1}...')
    val_progress = tqdm(
        total=len(val_loader_mlm), 
        desc="Validation", 
        disable=False, 
        leave=False
    )
    
    val_losses = []
    for batch in val_loader_mlm:  # Use the MLM-specific dataloader
        with torch.no_grad():
            outputs = model(**batch)  # Now the batch doesn't include anomaly_label
            val_losses.append(accelerator.gather(outputs.loss.detach()).mean().item())
        val_progress.update(1)
    val_progress.close()
    
    val_loss = float(np.mean(val_losses))

    history['epoch'].append(epoch+1)
    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    accelerator.print(f'‚úÖ Epoch {epoch+1}: train_loss={train_loss:.4f} val_loss={val_loss:.4f}')

    # Early stopping check
    if val_loss + min_delta < best_val:
        best_val = val_loss
        wait = 0
        best_checkpoint_path = Path(checkpoint_cfg['output_dir']) / 'best'
        if accelerator.is_main_process:
            best_checkpoint_path.mkdir(parents=True, exist_ok=True)
            accelerator.unwrap_model(model).save_pretrained(best_checkpoint_path)
            tokenizer.save_pretrained(best_checkpoint_path / 'tokenizer')
        accelerator.wait_for_everyone()
        free_cuda()
        accelerator.print('‚úÖ Best checkpoint updated!')
    else:
        wait += 1
        accelerator.print(f'‚è≥ No improvement, patience {wait}/{patience}')
        if wait >= patience:
            accelerator.print('üõë Early stopping triggered.')
            break

free_cuda()

if best_checkpoint_path and best_checkpoint_path.exists():
    accelerator.print(f'üîÑ Loading best checkpoint from {best_checkpoint_path}')
    best_model = AutoModelForMaskedLM.from_pretrained(best_checkpoint_path, config=config)
    accelerator.unwrap_model(model).load_state_dict(best_model.state_dict())
    del best_model
    free_cuda()
    accelerator.wait_for_everyone()
    
accelerator.print('üéâ Fine-tuning completed successfully!')

üõ°Ô∏è Starting ultra-stable training loop...
üìä Ultra-conservative settings active
üìä Training samples: 182,881
üìä Validation samples: 20,782
üìä Batches per epoch: 22,861
‚úÖ Created MLM-specific validation loader (without anomaly labels)
==== Epoch 1/3 (Ultra-Stable Mode) ====


Epoch 1 - Stable:   0%|          | 0/22861 [00:00<?, ?it/s]

üìà Step    25 | Loss: 7.9509 | Avg: 7.7425 | LR: 2.5e-08
üìà Step    50 | Loss: 6.9753 | Avg: 7.6525 | LR: 5.0e-08
üìà Step    75 | Loss: 8.5835 | Avg: 8.0105 | LR: 7.5e-08
üìà Step   100 | Loss: 8.2300 | Avg: 7.6603 | LR: 1.0e-07
üìà Step   125 | Loss: 8.7046 | Avg: 7.6113 | LR: 1.2e-07
üìà Step   150 | Loss: 8.3361 | Avg: 7.6327 | LR: 1.5e-07
üìà Step   175 | Loss: 9.1738 | Avg: 7.6767 | LR: 1.7e-07
üìà Step   200 | Loss: 6.5370 | Avg: 7.8188 | LR: 2.0e-07
üìà Step   225 | Loss: 7.6995 | Avg: 7.7456 | LR: 2.2e-07
üìà Step   250 | Loss: 8.5709 | Avg: 7.9315 | LR: 2.5e-07
üìà Step   275 | Loss: 8.2594 | Avg: 7.6343 | LR: 2.8e-07
üìà Step   300 | Loss: 8.0072 | Avg: 7.6323 | LR: 3.0e-07
üìà Step   325 | Loss: 8.3055 | Avg: 8.0554 | LR: 3.3e-07
üìà Step   350 | Loss: 7.8392 | Avg: 7.4493 | LR: 3.5e-07
üìà Step   375 | Loss: 6.5561 | Avg: 7.6315 | LR: 3.8e-07
üìà Step   400 | Loss: 7.1656 | Avg: 7.8862 | LR: 4.0e-07
üìà Step   425 | Loss: 6.5033 | Avg: 7.7086 | LR: 4.2e-

Validation:   0%|          | 0/1299 [00:00<?, ?it/s]

‚úÖ Epoch 1: train_loss=3.8794 val_loss=2.2446
‚úÖ Best checkpoint updated!
==== Epoch 2/3 (Ultra-Stable Mode) ====


Epoch 2 - Stable:   0%|          | 0/22861 [00:00<?, ?it/s]

üìà Step    25 | Loss: 3.2407 | Avg: 2.6725 | LR: 1.5e-07
üìà Step    50 | Loss: 2.2645 | Avg: 2.4665 | LR: 1.6e-07
üìà Step    75 | Loss: 2.3702 | Avg: 2.5575 | LR: 1.6e-07
üìà Step   100 | Loss: 2.1984 | Avg: 2.5338 | LR: 1.7e-07
üìà Step   125 | Loss: 3.3773 | Avg: 2.6119 | LR: 1.8e-07
üìà Step   150 | Loss: 3.7514 | Avg: 2.6584 | LR: 1.8e-07
üìà Step   175 | Loss: 2.7201 | Avg: 2.7567 | LR: 1.9e-07
üìà Step   200 | Loss: 2.5715 | Avg: 2.6099 | LR: 2.0e-07
üìà Step   225 | Loss: 1.9143 | Avg: 2.5596 | LR: 2.1e-07
üìà Step   250 | Loss: 1.7183 | Avg: 2.4486 | LR: 2.1e-07
üìà Step   275 | Loss: 2.4030 | Avg: 2.7347 | LR: 2.2e-07
üìà Step   300 | Loss: 1.8046 | Avg: 2.4877 | LR: 2.3e-07
üìà Step   325 | Loss: 1.9950 | Avg: 2.5506 | LR: 2.3e-07
üìà Step   350 | Loss: 2.8187 | Avg: 2.6090 | LR: 2.4e-07
üìà Step   375 | Loss: 3.4929 | Avg: 2.4743 | LR: 2.5e-07
üìà Step   400 | Loss: 2.8538 | Avg: 2.6624 | LR: 2.6e-07
üìà Step   425 | Loss: 2.2952 | Avg: 2.5963 | LR: 2.7e-

Validation:   0%|          | 0/1299 [00:00<?, ?it/s]

‚úÖ Epoch 2: train_loss=2.0622 val_loss=1.4827
‚úÖ Best checkpoint updated!
==== Epoch 3/3 (Ultra-Stable Mode) ====


Epoch 3 - Stable:   0%|          | 0/22861 [00:00<?, ?it/s]

üìà Step    25 | Loss: 1.8999 | Avg: 1.7587 | LR: 5.3e-07
üìà Step    50 | Loss: 2.5936 | Avg: 1.8653 | LR: 5.2e-07
üìà Step    75 | Loss: 1.6664 | Avg: 1.7487 | LR: 5.1e-07
üìà Step   100 | Loss: 1.5564 | Avg: 1.9354 | LR: 5.0e-07
üìà Step   125 | Loss: 1.8583 | Avg: 1.6046 | LR: 4.9e-07
üìà Step   150 | Loss: 1.3506 | Avg: 1.5569 | LR: 4.8e-07
üìà Step   175 | Loss: 1.5452 | Avg: 1.6287 | LR: 4.7e-07
üìà Step   200 | Loss: 3.2693 | Avg: 1.8176 | LR: 4.6e-07
üìà Step   225 | Loss: 1.7411 | Avg: 1.7012 | LR: 4.5e-07
üìà Step   250 | Loss: 0.7183 | Avg: 1.5720 | LR: 4.4e-07
üìà Step   275 | Loss: 1.9585 | Avg: 1.6376 | LR: 4.3e-07
üìà Step   300 | Loss: 1.9801 | Avg: 1.9550 | LR: 4.2e-07
üìà Step   325 | Loss: 2.1208 | Avg: 1.6199 | LR: 4.1e-07
üìà Step   350 | Loss: 1.0944 | Avg: 1.6983 | LR: 4.0e-07
üìà Step   375 | Loss: 1.5609 | Avg: 1.5659 | LR: 3.9e-07
üìà Step   400 | Loss: 1.2918 | Avg: 1.9483 | LR: 3.8e-07
üìà Step   425 | Loss: 0.7959 | Avg: 1.6613 | LR: 3.7e-

Validation:   0%|          | 0/1299 [00:00<?, ?it/s]

‚úÖ Epoch 3: train_loss=1.5284 val_loss=1.0978
‚úÖ Best checkpoint updated!
üîÑ Loading best checkpoint from artifacts/logbert-mlm-os/best
üéâ Fine-tuning completed successfully!


## 8. Evaluation


In [None]:
model.eval()

@torch.no_grad()
def collect_scores(dataloader) -> Tuple[np.ndarray, np.ndarray]:
    scores, labels = [], []
    for batch in dataloader:
        # Extract and remove anomaly_label from batch before passing to model
        label_tensor = batch.pop('anomaly_label')
        
        # Pass the batch to model (now without anomaly_label)
        outputs = model(**batch)
        loss = accelerator.gather(outputs.loss)
        scores.extend(loss.cpu().numpy())
        
        # Process the labels we extracted earlier
        labels.extend(accelerator.gather(label_tensor).cpu().numpy())
    return np.asarray(scores), np.asarray(labels)

# Use the evaluation loaders that preserve anomaly labels
val_scores, val_labels = collect_scores(val_loader_eval)
test_scores, test_labels = collect_scores(test_loader_eval)

threshold_candidates = np.percentile(val_scores, np.linspace(50, 99, 25))
best_threshold, best_f1 = threshold_candidates[-1], 0.0
for candidate in threshold_candidates:
    preds = (val_scores >= candidate).astype(int)
    f1 = f1_score(val_labels, preds, zero_division=0)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = candidate

val_probs = np.exp(-val_scores)
test_probs = np.exp(-test_scores)

pred_test = (test_scores >= best_threshold).astype(int)
f1 = f1_score(test_labels, pred_test, zero_division=0)
roc_auc = roc_auc_score(test_labels, test_probs)
pr_auc = average_precision_score(test_labels, test_probs)

metrics = {
    'val_threshold_loss': float(best_threshold),
    'val_best_f1': float(best_f1),
    'test_f1': float(f1),
    'test_roc_auc': float(roc_auc),
    'test_pr_auc': float(pr_auc)
}
metrics_path = metrics_dir / 'openstack_metrics.json'
metrics_path.write_text(json.dumps(metrics, indent=2))
print(json.dumps(metrics, indent=2))

cm = confusion_matrix(test_labels, pred_test)
fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.tight_layout()
cm_path = eval_dir / 'confusion_matrix.png'
fig.savefig(cm_path)
plt.close(fig)
print(f'Confusion matrix saved to {cm_path}')

TypeError: DistilBertForMaskedLM.forward() got an unexpected keyword argument 'anomaly_label'

## 9. Export TorchScript and ONNX


In [None]:
export_dir = Path(train_cfg['export']['output_dir'])
export_dir.mkdir(parents=True, exist_ok=True)

accelerator.wait_for_everyone()
model_to_export = accelerator.unwrap_model(model).cpu()
model_to_export.eval()

seq_len = train_cfg['sequence']['max_length']
dummy_ids = torch.ones((1, seq_len), dtype=torch.long)
dummy_mask = torch.ones((1, seq_len), dtype=torch.long)

with torch.no_grad():
    traced = torch.jit.trace(model_to_export, (dummy_ids, dummy_mask))
    ts_path = export_dir / train_cfg['export']['torchscript_filename']
    traced.save(str(ts_path))
    print(f'TorchScript saved to {ts_path}')

onnx_path = export_dir / train_cfg['export']['onnx_filename']
torch.onnx.export(
    model_to_export,
    (dummy_ids, dummy_mask),
    str(onnx_path),
    input_names=['input_ids', 'attention_mask'],
    output_names=['logits'],
    dynamic_axes={'input_ids': {0: 'batch'}, 'attention_mask': {0: 'batch'}, 'logits': {0: 'batch'}},
    opset_version=train_cfg['export']['opset']
)
print(f'ONNX saved to {onnx_path}')

## 10. Model Card Snippet


In [None]:
model_card_path = Path(train_cfg['checkpointing']['output_dir']) / 'MODEL_CARD.md'
model_card = (
    f"# LogBERT OpenStack Fine-tune

"
    f"- Base checkpoint: {train_cfg['base_checkpoint_dir']}
"
    f"- Sequence length: {train_cfg['sequence']['max_length']}
"
    f"- Replay enabled: {train_cfg['replay']['enabled']}
"
    f"- LoRA enabled: {train_cfg['peft']['lora_enabled']}

"
    f"## Eval Metrics
"
    f"- F1: {metrics['test_f1']:.4f}
"
    f"- ROC-AUC: {metrics['test_roc_auc']:.4f}
"
    f"- PR-AUC: {metrics['test_pr_auc']:.4f}
"
    f"- Threshold (loss): {metrics['val_threshold_loss']:.6f}

"
    f"## Artifacts
"
    f"- TorchScript: {train_cfg['export']['torchscript_filename']}
"
    f"- ONNX: {train_cfg['export']['onnx_filename']}
"
)
model_card_path.write_text(model_card)
print(f'Model card snippet written to {model_card_path}')

## 11. Persist Run Config


In [None]:
state_summary = {
    'num_processes': accelerator.state.num_processes,
    'device': str(accelerator.device),
    'mixed_precision': accelerator.state.mixed_precision
}
run_payload = {
    'train_openstack': train_cfg,
    'data_config': data_cfg,
    'accelerator_state': state_summary,
    'metrics': metrics,
    'is_mps': IS_MPS
}
run_config_path.write_text(json.dumps(run_payload, indent=2))
print(f'Run config stored at {run_config_path}')

## Artifacts Produced

- Fine-tuned checkpoints -> `artifacts/logbert-mlm-os/`
- Metrics JSON and confusion matrix -> `artifacts/metrics/openstack/openstack_metrics.json`, `artifacts/eval/confusion_matrix.png`
- Exported models -> `artifacts/exported_models/`
- Model card snippet -> `artifacts/logbert-mlm-os/MODEL_CARD.md`

Pipeline complete.
