# 01 · Pretrain DistilBERT on HDFS (MLM)

Masked language modeling pretraining with Hugging Face Accelerate on two RTX 6000 GPUs (or Apple MPS fallback). This notebook consumes the artifacts from `00_prepare_data.ipynb`, configures multi-device training, and saves checkpoints plus diagnostics for fine-tuning.

## Notebook Goals
- Load training hyperparameters from `configs/train_hdfs.yaml` and dataset metadata.
- Auto-generate `accelerate_config.yaml` for multi-GPU Linux or skip when running on Apple Silicon MPS.
- Build PyTorch DataLoaders backed by tokenized Parquet splits with dynamic masking.
- Run a custom Accelerate training loop with throughput EMA, p95 step latency, VRAM logging, and gradient-norm clipping.
- Save checkpoints every N steps and at epoch end while calling `free_cuda()` for memory hygiene.
- Persist run metadata (`run_config.json`) and validation metrics for downstream fine-tuning.

## 1. Imports and Configuration

In [None]:
import json
import math
import os
import time
import gc
from collections import deque
from pathlib import Path
from typing import Dict

import torch
from torch.utils.data import DataLoader
from accelerate import Accelerator
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    get_scheduler
)
import numpy as np
from tqdm.auto import tqdm
import yaml

### Load YAML configs

In [None]:
def load_yaml(path: Path) -> Dict:
    """Parse the YAML file at ``path`` and return its top-level mapping.

    Args:
        path: Location of the YAML document to read.

    Returns:
        The parsed document as a dictionary.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the document is empty or its top level is not a mapping.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with path.open('r', encoding='utf-8') as fh:
        loaded = yaml.safe_load(fh)
    # safe_load yields None for an empty document and a list/scalar for other
    # shapes; fail here with a clear message instead of at the first subscript.
    if not isinstance(loaded, dict):
        raise ValueError(
            f'Expected a YAML mapping in {path}, got {type(loaded).__name__}'
        )
    return loaded

# Name both config locations once so the loader calls and the printed summary
# cannot drift apart.
data_cfg_file = Path('configs/data.yaml')
train_cfg_file = Path('configs/train_hdfs.yaml')

data_cfg = load_yaml(data_cfg_file)
train_cfg = load_yaml(train_cfg_file)

# Where the generated Accelerate config lives is itself part of the training config.
accelerate_config_path = Path(train_cfg['accelerate']['config_path'])

config_locations = {
    'data_cfg': str(data_cfg_file),
    'train_cfg': str(train_cfg_file),
    'accelerate_cfg': str(accelerate_config_path)
}
print(json.dumps(config_locations, indent=2))

### Device detection

In [None]:
# Flag read by later cells to choose between the Apple MPS path and the
# CUDA/CPU path.
IS_MPS = torch.backends.mps.is_available()
if not IS_MPS:
    print('MPS not available; defaulting to CUDA/CPU configuration.')
else:
    # setdefault keeps any value the user already exported for this variable.
    os.environ.setdefault('ACCELERATE_USE_MPS_DEVICE', '1')
    print('Apple Silicon (MPS) detected. Accelerate will target the MPS backend; multi-GPU config will be skipped.')

### Auto-generate Accelerate configuration

In [None]:
def build_accelerate_config(cfg: Dict, is_mps: bool) -> Dict:
    """Assemble the mapping that gets dumped to the Accelerate config file.

    Args:
        cfg: The ``accelerate`` section of the training config. The MPS
            variant reads only ``gradient_accumulation_steps``; the multi-GPU
            variant also reads ``mixed_precision``, ``num_processes`` and
            ``zero_stage2_toggle``.
        is_mps: When true, build the single-process MPS variant; otherwise
            build the multi-GPU variant.

    Returns:
        A new dictionary. Key insertion order is deliberate because the
        result is serialized without key sorting.
    """
    # NOTE(review): confirm these keys match the schema expected by the
    # installed Accelerate version.
    loopback_ip = '127.0.0.1'
    rendezvous_port = 29500

    if not is_mps:
        settings = dict(
            compute_environment='LOCAL_MACHINE',
            distributed_type='MULTI_GPU',
            mixed_precision=cfg['mixed_precision'],
            num_processes=cfg['num_processes'],
            machine_rank=0,
            main_process_ip=loopback_ip,
            main_process_port=rendezvous_port,
            deepspeed_config=cfg['zero_stage2_toggle'],
            dynamo_backend='NO',
            gradient_accumulation_steps=cfg['gradient_accumulation_steps']
        )
    else:
        # MPS runs as one process without mixed precision.
        settings = dict(
            compute_environment='LOCAL_MACHINE',
            distributed_type='MPS',
            mixed_precision='no',
            num_processes=1,
            gradient_accumulation_steps=cfg['gradient_accumulation_steps'],
            main_process_ip=loopback_ip,
            main_process_port=rendezvous_port,
            gpu_ids='0'
        )
    return settings

# The config file is (re)generated only off MPS. On MPS an existing file is
# left untouched and the user is told it will be ignored.
if not IS_MPS:
    payload = build_accelerate_config(train_cfg['accelerate'], IS_MPS)
    accelerate_config_path.parent.mkdir(parents=True, exist_ok=True)
    with accelerate_config_path.open('w') as fh:
        yaml.safe_dump(payload, fh, sort_keys=False)
    print(f'Accelerate config written to {accelerate_config_path}')
    # Echo the file as written so the cell output shows what is on disk.
    print(accelerate_config_path.read_text())
elif accelerate_config_path.exists():
    print('MPS mode: skipping accelerate_config.yaml regeneration (existing file will be ignored).')

## 2. Dataset Loading

In [None]:
preprocessing_cfg = data_cfg['preprocessing']

# The metadata file is the marker that the data-preparation notebook has run.
metadata_path = Path(preprocessing_cfg['dataset_metadata'])
if not metadata_path.exists():
    raise FileNotFoundError(f'Metadata not found: {metadata_path}. Run 00_prepare_data.ipynb first.')
with metadata_path.open() as fh:
    metadata = json.load(fh)
print(json.dumps(metadata, indent=2))

parquet_dir = Path(preprocessing_cfg['parquet_dir'])


def _load_split(split_name):
    """Load the saved dataset directory for one split from ``parquet_dir``."""
    return load_from_disk(str(parquet_dir / f'hdfs_{split_name}_hf'))


train_dataset = _load_split('train')
val_dataset = _load_split('val')
test_dataset = _load_split('test')
print(train_dataset)

## 3. Tokenizer and Model

In [None]:
# Load the tokenizer produced earlier in the pipeline and register the
# project's special tokens.
tokenizer_dir = Path(train_cfg['artifacts']['tokenizer_dir'])
if not tokenizer_dir.exists():
    raise FileNotFoundError(f'Tokenizer directory missing: {tokenizer_dir}')

tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=True)
tokenizer.add_special_tokens({'additional_special_tokens': data_cfg['tokens']['special']})

model_name = train_cfg['model']['name']
# Keep the checkpoint's own vocab size while loading. Overriding
# config.vocab_size first would give the embedding matrices a different shape
# from the stored weights and make from_pretrained fail with a size mismatch.
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name, config=config)
# Resize after loading; this also updates the model config's vocab size.
model.resize_token_embeddings(len(tokenizer))
# Use the explicit model API; setting the flag on the config is deprecated.
if train_cfg['model']['gradient_checkpointing']:
    model.gradient_checkpointing_enable()
print(f'Model {model_name} initialized with vocab size {len(tokenizer)}')

## 4. DataLoaders

In [None]:
seq_cfg = train_cfg['sequence']
mlm_probability = seq_cfg['mlm_probability']

# MLM collator: masking is applied when a batch is collated, with the masking
# rate taken from the sequence section of the training config.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_probability)

def collate_without_anomaly(examples):
    """Collate a batch for MLM after dropping the ``anomaly_label`` field.

    Args:
        examples: Sequence of per-sample mappings. Assumed to be plain dicts
            as yielded by the dataset; verify if the dataset type changes.

    Returns:
        Whatever the module-level ``collator`` returns for the filtered batch.
    """
    # Build filtered copies instead of popping in place, so the caller's
    # example dicts are never mutated by batching.
    filtered = [
        {key: value for key, value in example.items() if key != 'anomaly_label'}
        for example in examples
    ]
    return collator(filtered)

batch_cfg = train_cfg['training']
per_device_train_bs = batch_cfg['train_batch_size_per_device']
per_device_eval_bs = batch_cfg['eval_batch_size_per_device']


def _build_loader(dataset, batch_size, shuffle):
    """Create a DataLoader that batches ``dataset`` with the shared MLM collate function."""
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_without_anomaly)


# Only the training split is shuffled; evaluation splits keep dataset order.
train_dataloader = _build_loader(train_dataset, per_device_train_bs, True)
val_dataloader = _build_loader(val_dataset, per_device_eval_bs, False)
test_dataloader = _build_loader(test_dataset, per_device_eval_bs, False)

## 5. Accelerator Setup

In [None]:
# Mixed precision is forced off on MPS; otherwise it comes from the training config.
mixed_precision = 'no' if IS_MPS else train_cfg['precision']['mixed_precision']
accelerator = Accelerator(
    gradient_accumulation_steps=train_cfg['training']['grad_accumulation_steps'],
    mixed_precision=mixed_precision
)
print(accelerator.state)

optimizer_cfg = train_cfg['optimizer']
# PyYAML loads exponent literals written without a decimal point (for example
# `5e-5` or `1e-8`) as strings, which AdamW rejects. Coerce every numeric
# hyperparameter explicitly so either spelling works in the config.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=float(optimizer_cfg['lr']),
    betas=tuple(float(b) for b in optimizer_cfg['betas']),
    eps=float(optimizer_cfg['eps']),
    weight_decay=float(optimizer_cfg['weight_decay'])
)

model, optimizer, train_dataloader, val_dataloader, test_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, val_dataloader, test_dataloader
)

# Computed after prepare(), so len(train_dataloader) is the length of the
# prepared loader. The step budget is expressed in optimizer updates: batches
# divided by the accumulation factor, rounded up, times epochs.
num_update_steps = math.ceil(len(train_dataloader) / train_cfg['training']['grad_accumulation_steps'])
total_steps = num_update_steps * train_cfg['training']['epochs']
print(f'Total training steps: {total_steps}')

# The scheduler is created on the prepared optimizer and is not itself passed
# through accelerator.prepare().
lr_scheduler = get_scheduler(
    name=optimizer_cfg['scheduler'],
    optimizer=optimizer,
    num_warmup_steps=optimizer_cfg['warmup_steps'],
    num_training_steps=total_steps
)

## 6. Memory Utilities

In [None]:
def free_cuda():
    """Drop unreachable Python objects, then release cached accelerator memory.

    Works on CUDA and, when the installed torch exposes it, on Apple MPS.
    On CPU-only machines only the garbage collection runs.

    Returns:
        None.
    """
    # Collect first: tensors kept alive only by reference cycles must be
    # freed before the allocator cache is emptied, otherwise their memory
    # simply stays in the cache.
    gc.collect()
    if torch.cuda.is_available():
        # Wait for queued kernels so their memory is actually releasable.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        return
    # Older torch builds may lack the MPS backend or torch.mps.empty_cache,
    # so probe for both before calling.
    mps_backend = getattr(torch.backends, 'mps', None)
    mps_module = getattr(torch, 'mps', None)
    if (
        mps_backend is not None
        and mps_backend.is_available()
        and hasattr(mps_module, 'empty_cache')
    ):
        mps_module.empty_cache()


def log_gpu_memory(tag: str):
    """Print current accelerator memory usage, prefixed with ``tag``.

    On CUDA this reports allocated and reserved memory; on MPS (per the
    module-level ``IS_MPS`` flag) it reports allocated memory. With neither
    backend nothing is printed. Values are bytes divided by 1024**3.

    Args:
        tag: Label placed in square brackets at the start of the log line.
    """
    # No function-scope `import torch.mps` here: that statement would bind
    # `torch` as a local name for the whole function, so the
    # `torch.cuda.is_available()` check below would raise UnboundLocalError
    # before the import ever ran.
    bytes_per_gib = 1024 ** 3
    if torch.cuda.is_available():
        alloc = torch.cuda.memory_allocated() / bytes_per_gib
        reserved = torch.cuda.memory_reserved() / bytes_per_gib
        accelerator.print(f'[{tag}] gpu allocated={alloc:.2f} GB reserved={reserved:.2f} GB')
    elif IS_MPS:
        try:
            stats = torch.mps.current_allocated_memory() / bytes_per_gib
        except Exception:
            # Deliberately broad: memory logging is best-effort and must not
            # interrupt training, e.g. when this torch build has no torch.mps.
            accelerator.print(f'[{tag}] mps memory stats unavailable.')
        else:
            accelerator.print(f'[{tag}] mps allocated={stats:.2f} GB')

## 7. Training Loop

In [None]:
checkpoint_cfg = train_cfg['checkpointing']

# Output locations for metrics and the run-config dump; create them up front.
metrics_dir = Path(train_cfg['artifacts']['metrics_dir'])
os.makedirs(metrics_dir, exist_ok=True)
run_config_path = Path(train_cfg['artifacts']['run_config_path'])
os.makedirs(run_config_path.parent, exist_ok=True)

# Per-step series recorded by the training loop.
history = {series: [] for series in ('step', 'loss', 'lr', 'throughput')}
# Bounded window of recent step durations, used for the p95 latency readout.
step_times = deque(maxlen=200)
# Exponential moving average of throughput; None until the first step seeds it.
ema_throughput = None


def save_checkpoint(model, optimizer, scheduler, epoch: int, step: int, tag: str):
    """Write model, tokenizer, optimizer and scheduler state to a checkpoint directory.

    The directory is ``<output_dir>/<tag>_epoch<epoch>_step<step>``. Only the
    main process writes; all processes synchronize before and after the
    write, then release cached memory via ``free_cuda``.

    Args:
        model: Prepared model; it is unwrapped before saving.
        optimizer: Optimizer whose ``state_dict`` is saved to ``optimizer.pt``.
        scheduler: Scheduler whose ``state_dict`` is saved to ``scheduler.pt``.
        epoch: Epoch number recorded in the directory name and state file.
        step: Step number recorded in the directory name and state file.
        tag: Prefix of the checkpoint directory name.
    """
    accelerator.wait_for_everyone()
    target_dir = Path(checkpoint_cfg['output_dir']) / f'{tag}_epoch{epoch}_step{step}'
    if accelerator.is_main_process:
        target_dir.mkdir(parents=True, exist_ok=True)
        accelerator.unwrap_model(model).save_pretrained(target_dir)
        tokenizer.save_pretrained(target_dir / 'tokenizer')
        for stateful, filename in ((optimizer, 'optimizer.pt'), (scheduler, 'scheduler.pt')):
            torch.save(stateful.state_dict(), target_dir / filename)
        progress_marker = {'epoch': epoch, 'step': step}
        (target_dir / 'training_state.json').write_text(json.dumps(progress_marker))
        accelerator.print(f'[checkpoint] saved -> {target_dir}')
    # Keep the other ranks from moving on while the main process is still writing.
    accelerator.wait_for_everyone()
    free_cuda()

train_epochs = train_cfg['training']['epochs']
save_steps = checkpoint_cfg['save_steps']
max_grad_norm = train_cfg['training']['max_grad_norm']
# Loop-invariant settings, read once instead of on every step.
log_steps = train_cfg['logging']['log_steps']
beta = train_cfg['logging']['throughput_ema_beta']
steps_per_epoch = len(train_dataloader)

for epoch in range(train_epochs):
    model.train()
    accelerator.print(f'==== Epoch {epoch+1}/{train_epochs} ====')
    progress = tqdm(total=steps_per_epoch, disable=not accelerator.is_local_main_process)
    for step, batch in enumerate(train_dataloader, start=1):
        start_time = time.perf_counter()
        with accelerator.accumulate(model):
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            # Clip only when the accumulated gradient is complete; clipping
            # partial gradients on every micro-batch distorts the update.
            if max_grad_norm and accelerator.sync_gradients:
                accelerator.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            # lr_scheduler is not wrapped by Accelerate and its step budget
            # (total_steps) is counted in optimizer updates, so advance it
            # only on real updates. Stepping on every micro-batch would finish
            # the schedule early when gradient accumulation is above 1.
            if accelerator.sync_gradients:
                lr_scheduler.step()
            optimizer.zero_grad()
        duration = time.perf_counter() - start_time
        step_times.append(duration)

        # global_step counts dataloader batches (micro-batches), not optimizer
        # updates; logging and checkpoint cadence are expressed in this unit.
        global_step = epoch * steps_per_epoch + step
        tokens_processed = batch['input_ids'].numel()
        throughput = tokens_processed / max(duration, 1e-6)
        ema_throughput = throughput if ema_throughput is None else (beta * ema_throughput + (1 - beta) * throughput)

        # Read the loss from the device once and reuse the Python float.
        loss_value = float(loss.detach().item())
        history['step'].append(int(global_step))
        history['loss'].append(loss_value)
        history['lr'].append(float(lr_scheduler.get_last_lr()[0]))
        history['throughput'].append(float(ema_throughput))

        if accelerator.is_local_main_process:
            # p95 needs a few samples to mean anything; fall back to the last duration.
            p95 = float(np.percentile(step_times, 95)) if len(step_times) >= 5 else float(duration)
            progress.set_description(f'loss={loss_value:.4f} ema_tok_s={ema_throughput:,.0f} p95={p95:.3f}s')
        progress.update(1)

        if log_steps and global_step % log_steps == 0 and accelerator.is_main_process:
            log_gpu_memory(f'step {global_step}')

        if save_steps and global_step % save_steps == 0:
            save_checkpoint(model, optimizer, lr_scheduler, epoch+1, global_step, tag='step')

    progress.close()
    epoch_steps = (epoch + 1) * steps_per_epoch
    save_checkpoint(model, optimizer, lr_scheduler, epoch+1, epoch_steps, tag='epoch')

    # End-of-epoch validation: average the per-batch loss, itself averaged
    # across processes.
    model.eval()
    losses = []
    with torch.no_grad():
        for batch in val_dataloader:
            outputs = model(**batch)
            losses.append(accelerator.gather(outputs.loss.detach()).float().mean().item())
    val_loss = float(np.mean(losses))
    accelerator.print(f'Validation loss after epoch {epoch+1}: {val_loss:.4f}')

accelerator.wait_for_everyone()
free_cuda()

## 8. Evaluation and Metrics

In [None]:
model.eval()

@torch.no_grad()
def evaluate_loader(dataloader) -> float:
    """Return the mean per-batch loss of the module-level ``model`` over ``dataloader``.

    Each batch loss is gathered across processes, and every gathered value
    contributes equally to the mean.

    Args:
        dataloader: Iterable of batches that can be passed to the model as
            keyword arguments.

    Returns:
        The mean loss as a float, or NaN when the loader yields no batches.
    """
    losses = []
    for batch in dataloader:
        outputs = model(**batch)
        gathered = accelerator.gather(outputs.loss)
        # The gathered loss may be 0-dim (e.g. when running as a single
        # process). Flatten it so it can always be extended into the list;
        # extending with a 0-d array raises TypeError.
        losses.extend(gathered.detach().float().reshape(-1).cpu().tolist())
    if not losses:
        return float('nan')
    return float(np.mean(losses))

# Final evaluation on the held-out splits; perplexity is exp of the test loss.
val_loss = evaluate_loader(val_dataloader)
test_loss = evaluate_loader(test_dataloader)
perplexity = math.exp(test_loss)

metrics = {
    'val_loss': val_loss,
    'test_loss': test_loss,
    'test_perplexity': perplexity,
    'steps_tracked': len(history['step'])
}
metrics_path = metrics_dir / 'hdfs_pretraining_metrics.json'
metrics_json = json.dumps(metrics, indent=2)
# Write from the main process only, matching save_checkpoint, so several
# ranks never write the same file concurrently.
if accelerator.is_main_process:
    metrics_path.write_text(metrics_json)
accelerator.print(metrics_json)

## 9. Persist Run Config

In [None]:
# Snapshot of the Accelerate runtime state, stored next to the configs so a
# later stage can see how this run was executed.
state_summary = {
    'num_processes': accelerator.state.num_processes,
    'process_index': accelerator.state.process_index,
    'local_process_index': accelerator.state.local_process_index,
    'device': str(accelerator.device),
    'mixed_precision': accelerator.state.mixed_precision,
    'distributed_type': str(accelerator.state.distributed_type)
}
run_payload = {
    'train_config': train_cfg,
    'data_config': data_cfg,
    'accelerator_state': state_summary,
    'is_mps': IS_MPS
}
# Write from the main process only: the payload contains per-rank fields, and
# unguarded writes from several ranks would race on the same file.
if accelerator.is_main_process:
    run_config_path.write_text(json.dumps(run_payload, indent=2))
accelerator.print(f'Run configuration written to {run_config_path}')

## Artifacts Produced
- Checkpoints saved under `artifacts/logbert-mlm-hdfs/`
- Validation/test metrics stored at `artifacts/metrics/hdfs/hdfs_pretraining_metrics.json`
- Run configuration captured at `artifacts/logbert-mlm-hdfs/run_config.json`

Next step in the pipeline: open `02_finetune_openstack.ipynb`.