# 02 · Fine-tune on OpenStack Logs

This notebook adapts the HDFS-pretrained DistilBERT model to OpenStack anomaly detection with optional replay and LoRA support. Hugging Face Accelerate drives training on multi-GPU Linux or falls back to Apple MPS when available.

## Notebook Goals
- Load OpenStack fine-tuning hyperparameters and reuse the Accelerate configuration (mixed precision is disabled on MPS).
- Optionally replay a slice of HDFS data and/or enable LoRA adapters via config toggles.
- Train with early stopping, checkpoint cadence, and GPU/MPS memory hygiene utilities.
- Evaluate on validation/test splits with F1, ROC-AUC, PR-AUC, and confusion matrices.
- Export TorchScript and ONNX artifacts and capture a MODEL_CARD snippet.

## 1. Imports and Configuration

In [None]:
import json
import math
import os
import gc
import time
from pathlib import Path
from typing import Dict, Tuple

import torch
from torch.utils.data import DataLoader
from accelerate import Accelerator
from datasets import load_from_disk, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    get_scheduler
)
from peft import LoraConfig, get_peft_model
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import yaml

### Load YAML configs

In [None]:
def load_yaml(path: Path) -> Dict:
    """Parse a YAML file and return its contents as a dictionary.

    Args:
        path: Location of the YAML document.

    Returns:
        The parsed document. An empty document yields an empty dict rather
        than the ``None`` that ``yaml.safe_load`` produces for it.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the document is not valid YAML.
    """
    # Explicit encoding keeps parsing independent of the platform locale.
    with path.open('r', encoding='utf-8') as fh:
        loaded = yaml.safe_load(fh)
    return {} if loaded is None else loaded


# Both configs are read relative to the current working directory.
data_cfg = load_yaml(Path('configs/data.yaml'))
train_cfg = load_yaml(Path('configs/train_openstack.yaml'))
print('Configs loaded.')

### Device detection

In [None]:
# Probe the Apple Metal backend once; later cells branch on IS_MPS.
IS_MPS = torch.backends.mps.is_available()
if not IS_MPS:
    print('MPS not available; using CUDA/CPU settings from training config.')
else:
    # setdefault leaves a value already exported in the environment untouched.
    os.environ.setdefault('ACCELERATE_USE_MPS_DEVICE', '1')
    print('Apple Silicon (MPS) detected. Accelerate will use the MPS backend.')

## 2. Prepare Datasets

In [None]:
# Locations written by the preprocessing stage, as declared in the data config.
parquet_dir = Path(data_cfg['preprocessing']['parquet_dir'])
metadata_path = Path(data_cfg['preprocessing']['dataset_metadata'])
# Metadata is optional: fall back to an empty dict when the file is absent.
metadata = json.loads(metadata_path.read_text()) if metadata_path.exists() else {}

openstack_train = load_from_disk(str(parquet_dir / 'openstack_train_hf'))
openstack_val = load_from_disk(str(parquet_dir / 'openstack_val_hf'))
openstack_test = load_from_disk(str(parquet_dir / 'openstack_test_hf'))

replay_cfg = train_cfg['replay']
if replay_cfg['enabled']:
    hdfs_dataset = load_from_disk(str(parquet_dir / 'hdfs_train_hf'))
    # Replay size is a fraction of the OpenStack training set, at least one row.
    requested_replay = max(1, int(len(openstack_train) * replay_cfg['ratio']))
    # Clamp to what HDFS can supply; select() raises on out-of-range indices.
    replay_size = min(requested_replay, len(hdfs_dataset))
    replay_subset = hdfs_dataset.shuffle(seed=train_cfg['seed']).select(range(replay_size))
    # NOTE(review): pandas concat fills columns missing from one side with NaN;
    # confirm both datasets share the same schema.
    combined = pd.concat([
        openstack_train.to_pandas(),
        replay_subset.to_pandas()
    ], ignore_index=True)
    # Shuffle so replayed rows are interleaved with OpenStack rows.
    combined = combined.sample(frac=1.0, random_state=train_cfg['seed']).reset_index(drop=True)
    train_dataset = Dataset.from_pandas(combined, preserve_index=False)
    print(f'Replay enabled: mixed {replay_size} HDFS rows with {len(openstack_train)} OpenStack rows.')
else:
    train_dataset = openstack_train
    print('Replay disabled.')

## 3. Tokenizer and Base Checkpoint

In [None]:
# Tokenizer location comes from the artifacts section of the training config.
tokenizer_dir = Path(train_cfg['artifacts']['tokenizer_dir'])
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=True)

base_dir = Path(train_cfg['base_checkpoint_dir'])
if not base_dir.exists():
    raise FileNotFoundError(f'Base checkpoint directory not found: {base_dir}')

# Any directory below base_dir holding a config.json is a candidate checkpoint.
# NOTE(review): the pick is the lexicographically last path, which is not
# necessarily the newest or the best checkpoint; confirm this is intended.
candidate_configs = sorted(base_dir.glob('**/config.json'))
if candidate_configs:
    model_path = candidate_configs[-1].parent
else:
    model_path = base_dir
print(f'Loading base checkpoint from {model_path}')

config = AutoConfig.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path, config=config)
# Keep the embedding matrix in step with the tokenizer vocabulary size.
model.resize_token_embeddings(len(tokenizer))

peft_cfg = train_cfg['peft']
if not peft_cfg['lora_enabled']:
    print('LoRA disabled; full fine-tuning will run.')
else:
    # Adapter hyperparameters are taken verbatim from the peft config section.
    lora_config = LoraConfig(
        r=peft_cfg['r'],
        lora_alpha=peft_cfg['alpha'],
        lora_dropout=peft_cfg['dropout'],
        bias=peft_cfg['bias'],
        target_modules=peft_cfg['target_modules'],
    )
    model = get_peft_model(model, lora_config)
    print('LoRA adapters enabled.')

## 4. DataLoaders

In [None]:
# MLM collator: applies token masking with the configured probability.
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=train_cfg['sequence']['mlm_probability']
)


def _strip_anomaly_label(example):
    """Return a shallow copy of ``example`` without the ``anomaly_label`` key."""
    return {key: value for key, value in example.items() if key != 'anomaly_label'}


def collate_train(examples):
    """Collate training examples into an MLM batch.

    The ``anomaly_label`` field is dropped before collation. The input
    examples are left unmodified (copies are collated instead).

    Args:
        examples: Sequence of per-row feature dicts.

    Returns:
        The batch produced by the MLM collator.
    """
    return collator([_strip_anomaly_label(example) for example in examples])


def collate_eval(examples):
    """Collate evaluation examples and attach their anomaly labels.

    Args:
        examples: Sequence of per-row feature dicts; a missing
            ``anomaly_label`` is treated as 0.

    Returns:
        The MLM batch with an extra ``anomaly_label`` long tensor. Masking is
        still applied by the collator, so the batch is not deterministic.
    """
    labels = [example.get('anomaly_label', 0) for example in examples]
    batch = collator([_strip_anomaly_label(example) for example in examples])
    batch['anomaly_label'] = torch.tensor(labels, dtype=torch.long)
    return batch


train_loader = DataLoader(train_dataset, batch_size=train_cfg['training']['train_batch_size_per_device'], shuffle=True, collate_fn=collate_train)
val_loader = DataLoader(openstack_val, batch_size=train_cfg['training']['eval_batch_size_per_device'], shuffle=False, collate_fn=collate_eval)
test_loader = DataLoader(openstack_test, batch_size=train_cfg['training']['eval_batch_size_per_device'], shuffle=False, collate_fn=collate_eval)

## 5. Accelerator and Optimizer

In [None]:
# Mixed precision is forced off on MPS; otherwise the configured mode applies.
if IS_MPS:
    mixed_precision = 'no'
else:
    mixed_precision = train_cfg['precision']['mixed_precision']

accelerator = Accelerator(
    mixed_precision=mixed_precision,
    gradient_accumulation_steps=train_cfg['training']['grad_accumulation_steps'],
)
print(accelerator.state)

# AdamW hyperparameters are read from the optimizer section of the config.
_optimizer_cfg = train_cfg['optimizer']
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=_optimizer_cfg['lr'],
    betas=tuple(_optimizer_cfg['betas']),
    eps=_optimizer_cfg['eps'],
    weight_decay=_optimizer_cfg['weight_decay'],
)

# Let Accelerate wrap the model, optimizer, and all three loaders.
(
    model,
    optimizer,
    train_loader,
    val_loader,
    test_loader,
) = accelerator.prepare(model, optimizer, train_loader, val_loader, test_loader)

# Schedule length: optimizer updates per epoch (rounded up) times epoch count.
_updates_per_epoch = math.ceil(len(train_loader) / train_cfg['training']['grad_accumulation_steps'])
total_steps = _updates_per_epoch * train_cfg['training']['epochs']
lr_scheduler = get_scheduler(
    name=train_cfg['optimizer']['scheduler'],
    optimizer=optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=train_cfg['optimizer']['warmup_steps'],
)

## 6. Memory Utilities

In [None]:
def free_cuda():
    """Release unused accelerator memory cached by PyTorch.

    Runs the Python garbage collector first, then empties the CUDA cache
    (after synchronizing) or, when CUDA is absent and MPS is available, the
    MPS cache. Safe to call on CPU-only machines.
    """
    # Collect before flushing so memory freed by the collection is released too.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        return
    # getattr guards keep this working on torch builds without MPS support.
    mps_backend = getattr(torch.backends, 'mps', None)
    mps_module = getattr(torch, 'mps', None)
    if (
        mps_backend is not None
        and mps_backend.is_available()
        and hasattr(mps_module, 'empty_cache')
    ):
        mps_module.empty_cache()


def log_gpu_memory(tag: str) -> None:
    """Print current accelerator memory usage, prefixed with ``tag``.

    On CUDA, reports allocated and reserved memory in GB; on MPS, reports
    allocated memory or a fallback message when the statistic cannot be read.
    Prints nothing when neither backend is available.

    Args:
        tag: Label placed in square brackets at the start of the message.
    """
    bytes_per_gib = 1024 ** 3
    if torch.cuda.is_available():
        alloc = torch.cuda.memory_allocated() / bytes_per_gib
        reserved = torch.cuda.memory_reserved() / bytes_per_gib
        accelerator.print(f'[{tag}] gpu allocated={alloc:.2f} GB reserved={reserved:.2f} GB')
        return

    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is None or not mps_backend.is_available():
        return

    try:
        # Import under a separate local name: a function-level
        # `import torch.mps` would make `torch` local to this function and
        # break the `torch.cuda` lookup above with UnboundLocalError.
        from torch import mps as torch_mps
        stats = torch_mps.current_allocated_memory() / bytes_per_gib
    except (ImportError, AttributeError, RuntimeError):
        accelerator.print(f'[{tag}] mps memory stats unavailable.')
    else:
        accelerator.print(f'[{tag}] mps allocated={stats:.2f} GB')

## 7. Training with Early Stopping

In [None]:
# Output locations for this run; created up front so later writes cannot fail
# on a missing directory.
checkpoint_cfg = train_cfg['checkpointing']
metrics_dir = Path(train_cfg['artifacts']['metrics_dir'])
metrics_dir.mkdir(parents=True, exist_ok=True)
eval_dir = Path(train_cfg['artifacts']['eval_dir'])
eval_dir.mkdir(parents=True, exist_ok=True)
run_config_path = Path(train_cfg['artifacts']['run_config_path'])
run_config_path.parent.mkdir(parents=True, exist_ok=True)

history = {'epoch': [], 'train_loss': [], 'val_loss': []}

# Early-stopping state.
best_val = float('inf')
best_checkpoint_path = None
# CPU copy of the best weights, restored after training. Held in memory so the
# restore works for both full fine-tuning and LoRA (where save_pretrained
# writes adapter files rather than a full model checkpoint).
best_state = None
patience = train_cfg['training']['patience']
min_delta = train_cfg['training']['min_delta']
wait = 0

epochs_total = train_cfg['training']['epochs']
save_steps = checkpoint_cfg['save_steps']
max_grad_norm = train_cfg['training']['max_grad_norm']
# The logging section is optional; without it memory logging is disabled.
log_steps = (train_cfg.get('logging') or {}).get('log_steps')

for epoch in range(epochs_total):
    model.train()
    accelerator.print(f'==== Epoch {epoch+1}/{epochs_total} ====')
    progress = tqdm(total=len(train_loader), disable=not accelerator.is_local_main_process)
    step_losses = []
    for step, batch in enumerate(train_loader, start=1):
        with accelerator.accumulate(model):
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            # Clip only when gradients are fully accumulated; clipping partial
            # sums on intermediate micro-batches would distort the update.
            if accelerator.sync_gradients and max_grad_norm:
                accelerator.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            # total_steps counts optimizer updates, so advance the schedule
            # once per update rather than once per micro-batch.
            if accelerator.sync_gradients:
                lr_scheduler.step()
            optimizer.zero_grad()
        # Read the scalar once; each .item() forces a device synchronization.
        loss_value = loss.detach().item()
        step_losses.append(loss_value)
        global_step = epoch * len(train_loader) + step
        progress.set_description(f'loss={loss_value:.4f}')
        progress.update(1)

        if log_steps and global_step % log_steps == 0 and accelerator.is_main_process:
            log_gpu_memory(f'step {global_step}')

        if save_steps and global_step % save_steps == 0:
            ckpt_dir = Path(checkpoint_cfg['output_dir']) / f'step_epoch{epoch+1}_step{global_step}'
            if accelerator.is_main_process:
                ckpt_dir.mkdir(parents=True, exist_ok=True)
                accelerator.unwrap_model(model).save_pretrained(ckpt_dir)
                tokenizer.save_pretrained(ckpt_dir / 'tokenizer')
            # Keep every process in step while the main process writes.
            accelerator.wait_for_everyone()
            free_cuda()

    progress.close()
    train_loss = float(np.mean(step_losses))

    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in val_loader:
            # The eval collator attaches `anomaly_label`, which is not a model
            # input; forward only the remaining fields.
            model_inputs = {key: value for key, value in batch.items() if key != 'anomaly_label'}
            outputs = model(**model_inputs)
            val_losses.append(accelerator.gather(outputs.loss.detach()).mean().item())
    val_loss = float(np.mean(val_losses))

    history['epoch'].append(epoch+1)
    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    accelerator.print(f'Epoch {epoch+1}: train_loss={train_loss:.4f} val_loss={val_loss:.4f}')

    # An epoch counts as an improvement only if it beats the best by min_delta.
    if val_loss + min_delta < best_val:
        best_val = val_loss
        wait = 0
        best_checkpoint_path = Path(checkpoint_cfg['output_dir']) / 'best'
        # Snapshot the weights on every process so each can restore them later.
        best_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in accelerator.unwrap_model(model).state_dict().items()
        }
        if accelerator.is_main_process:
            best_checkpoint_path.mkdir(parents=True, exist_ok=True)
            accelerator.unwrap_model(model).save_pretrained(best_checkpoint_path)
            tokenizer.save_pretrained(best_checkpoint_path / 'tokenizer')
        accelerator.wait_for_everyone()
        free_cuda()
        accelerator.print('Best checkpoint updated.')
    else:
        wait += 1
        accelerator.print(f'No improvement, patience {wait}/{patience}')
        if wait >= patience:
            accelerator.print('Early stopping triggered.')
            break

free_cuda()

# Restore the best-performing weights before evaluation and export. They match
# what was written to best_checkpoint_path at the same epoch.
if best_state is not None:
    accelerator.print(f'Loading best checkpoint from {best_checkpoint_path}')
    accelerator.unwrap_model(model).load_state_dict(best_state)
    del best_state
    free_cuda()
    accelerator.wait_for_everyone()

## 8. Evaluation

In [None]:
model.eval()


@torch.no_grad()
def collect_scores(dataloader) -> Tuple[np.ndarray, np.ndarray]:
    """Compute one anomaly score per sample together with its label.

    The score is the mean masked-token cross-entropy of the sample, so a
    higher value means the sequence was harder to reconstruct.

    Args:
        dataloader: Loader whose batches carry ``labels`` (MLM targets) and
            ``anomaly_label`` in addition to the model inputs.

    Returns:
        ``(scores, labels)`` as 1-D arrays of equal length.
    """
    scores, labels = [], []
    for batch in dataloader:
        anomaly = batch['anomaly_label']
        mlm_labels = batch['labels']
        # `anomaly_label` is not a model input, and `labels` is withheld so the
        # loss can be reduced per sample instead of over the whole batch.
        model_inputs = {
            key: value for key, value in batch.items()
            if key not in ('anomaly_label', 'labels')
        }
        logits = model(**model_inputs).logits.float()
        # Positions carrying the ignore index (-100, i.e. tokens the MLM
        # collator left unmasked) contribute zero loss.
        token_loss = torch.nn.functional.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            mlm_labels.reshape(-1),
            ignore_index=-100,
            reduction='none',
        ).reshape(mlm_labels.shape)
        # A sample with no masked token gets a score of 0 (clamp avoids 0/0).
        masked_counts = (mlm_labels != -100).sum(dim=1).clamp(min=1)
        sample_loss = token_loss.sum(dim=1) / masked_counts
        # gather_for_metrics drops samples duplicated to pad the last batch
        # in distributed runs.
        sample_loss, anomaly = accelerator.gather_for_metrics((sample_loss, anomaly))
        scores.append(sample_loss.cpu().numpy())
        labels.append(anomaly.cpu().numpy())
    if not scores:
        return np.empty(0, dtype=np.float32), np.empty(0, dtype=np.int64)
    return np.concatenate(scores), np.concatenate(labels)


val_scores, val_labels = collect_scores(val_loader)
test_scores, test_labels = collect_scores(test_loader)

# Candidate thresholds: 25 percentiles of the validation scores from 50 to 99.
threshold_candidates = np.percentile(val_scores, np.linspace(50, 99, 25))
best_threshold, best_f1 = threshold_candidates[-1], 0.0
for candidate in threshold_candidates:
    preds = (val_scores >= candidate).astype(int)
    f1 = f1_score(val_labels, preds, zero_division=0)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = candidate

# exp(-loss) grows as the loss shrinks, i.e. it ranks *normal* samples highest.
# Kept for inspection only; the ranking metrics below use the raw loss so they
# agree with the "score >= threshold means anomaly" decision rule.
val_probs = np.exp(-val_scores)
test_probs = np.exp(-test_scores)

pred_test = (test_scores >= best_threshold).astype(int)
f1 = f1_score(test_labels, pred_test, zero_division=0)
roc_auc = roc_auc_score(test_labels, test_scores)
pr_auc = average_precision_score(test_labels, test_scores)

metrics = {
    'val_threshold_loss': float(best_threshold),
    'val_best_f1': float(best_f1),
    'test_f1': float(f1),
    'test_roc_auc': float(roc_auc),
    'test_pr_auc': float(pr_auc)
}
metrics_path = metrics_dir / 'openstack_metrics.json'
# Write artifacts from the main process only to avoid concurrent writes.
if accelerator.is_main_process:
    metrics_path.write_text(json.dumps(metrics, indent=2))
accelerator.print(json.dumps(metrics, indent=2))

# Fix the label set so the matrix stays 2x2 even if one class never appears.
cm = confusion_matrix(test_labels, pred_test, labels=[0, 1])
cm_path = eval_dir / 'confusion_matrix.png'
if accelerator.is_main_process:
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.tight_layout()
    fig.savefig(cm_path)
    plt.close(fig)
accelerator.print(f'Confusion matrix saved to {cm_path}')

## 9. Export TorchScript and ONNX

In [None]:
export_dir = Path(train_cfg['export']['output_dir'])
export_dir.mkdir(parents=True, exist_ok=True)

accelerator.wait_for_everyone()
# NOTE: .cpu() moves the underlying model in place; this cell is expected to
# run after all training and evaluation.
model_to_export = accelerator.unwrap_model(model).cpu()
# With LoRA enabled, fold the adapter weights into the base model so the
# exported graph carries no adapter wrappers.
if hasattr(model_to_export, 'merge_and_unload'):
    model_to_export = model_to_export.merge_and_unload()
model_to_export.eval()


class _LogitsOnly(torch.nn.Module):
    """Wrap the MLM model so tracing sees a plain tensor output.

    The model returns a dict-like output object by default, which
    ``torch.jit.trace`` rejects in strict mode; this wrapper requests tuple
    outputs and returns only the logits.
    """

    def __init__(self, mlm_model):
        super().__init__()
        self.mlm_model = mlm_model

    def forward(self, input_ids, attention_mask):
        return self.mlm_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )[0]


export_module = _LogitsOnly(model_to_export).eval()

# Dummy inputs fix the traced sequence length to the configured maximum.
seq_len = train_cfg['sequence']['max_length']
dummy_ids = torch.ones((1, seq_len), dtype=torch.long)
dummy_mask = torch.ones((1, seq_len), dtype=torch.long)

ts_path = export_dir / train_cfg['export']['torchscript_filename']
onnx_path = export_dir / train_cfg['export']['onnx_filename']

# Export from the main process only so processes do not race on the files.
if accelerator.is_main_process:
    with torch.no_grad():
        traced = torch.jit.trace(export_module, (dummy_ids, dummy_mask))
        traced.save(str(ts_path))
    print(f'TorchScript saved to {ts_path}')

    # Only the batch axis is dynamic; the sequence axis stays at seq_len.
    torch.onnx.export(
        export_module,
        (dummy_ids, dummy_mask),
        str(onnx_path),
        input_names=['input_ids', 'attention_mask'],
        output_names=['logits'],
        dynamic_axes={'input_ids': {0: 'batch'}, 'attention_mask': {0: 'batch'}, 'logits': {0: 'batch'}},
        opset_version=train_cfg['export']['opset']
    )
    print(f'ONNX saved to {onnx_path}')
accelerator.wait_for_everyone()

## 10. Model Card Snippet

In [None]:
model_card_path = Path(train_cfg['checkpointing']['output_dir']) / 'MODEL_CARD.md'
# Build the card line by line and join with explicit newlines; a raw line
# break inside a single-quoted f-string is a syntax error.
model_card_lines = [
    "# LogBERT OpenStack Fine-tune",
    "",
    f"- Base checkpoint: {train_cfg['base_checkpoint_dir']}",
    f"- Sequence length: {train_cfg['sequence']['max_length']}",
    f"- Replay enabled: {train_cfg['replay']['enabled']}",
    f"- LoRA enabled: {train_cfg['peft']['lora_enabled']}",
    "",
    "## Eval Metrics",
    f"- F1: {metrics['test_f1']:.4f}",
    f"- ROC-AUC: {metrics['test_roc_auc']:.4f}",
    f"- PR-AUC: {metrics['test_pr_auc']:.4f}",
    f"- Threshold (loss): {metrics['val_threshold_loss']:.6f}",
    "",
    "## Artifacts",
    f"- TorchScript: {train_cfg['export']['torchscript_filename']}",
    f"- ONNX: {train_cfg['export']['onnx_filename']}",
]
model_card = "\n".join(model_card_lines) + "\n"
# The checkpoint directory may not exist yet on a fresh run.
model_card_path.parent.mkdir(parents=True, exist_ok=True)
model_card_path.write_text(model_card, encoding='utf-8')
print(f'Model card snippet written to {model_card_path}')

## 11. Persist Run Config

In [None]:
# Snapshot of the Accelerate runtime this run executed under.
state_summary = dict(
    num_processes=accelerator.state.num_processes,
    device=str(accelerator.device),
    mixed_precision=accelerator.state.mixed_precision,
)

# Everything needed to reproduce or audit the run, in one JSON document.
run_payload = dict(
    train_openstack=train_cfg,
    data_config=data_cfg,
    accelerator_state=state_summary,
    metrics=metrics,
    is_mps=IS_MPS,
)

_run_payload_json = json.dumps(run_payload, indent=2)
run_config_path.write_text(_run_payload_json)
print(f'Run config stored at {run_config_path}')

## Artifacts Produced
- Fine-tuned checkpoints -> `artifacts/logbert-mlm-os/`
- Metrics JSON and confusion matrix -> `artifacts/metrics/openstack/openstack_metrics.json`, `artifacts/eval/confusion_matrix.png`
- Exported models -> `artifacts/exported_models/`
- Model card snippet -> `artifacts/logbert-mlm-os/MODEL_CARD.md`

Pipeline complete.