# 00 · Prepare Data for LogBERT Pipeline

This notebook downloads public log datasets, applies regex normalization, mines templates with Drain3, and materializes tokenized Parquet splits ready for training and evaluation.

## Notebook Goals
- Install pinned dependencies for the workstation environment.
- Fetch HDFS and OpenStack log corpora with checksum verification and automatic mirror fallback.
- Apply regex-based normalization rules from `configs/data.yaml` and preview before/after examples.
- Mine log templates in streaming mode with Drain3 and persist template transitions as Parquet.
- Build Hugging Face datasets with BERT-compatible tokenization, time-based splits (80/10/10), and truncation stats.
- Save artifacts (tokenizer, processed Parquet, metadata) for downstream notebooks.

## 1. Environment Setup

In [None]:
import os, sys, subprocess
from pathlib import Path

if os.environ.get('SKIP_REQUIREMENTS', '0') == '1':
    print('SKIP_REQUIREMENTS=1 -> skipping pip install from requirements.txt')
else:
    req_path = Path('requirements.txt')
    if req_path.exists():
        print('[setup] Installing dependencies from requirements.txt ...')
        completed = subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', str(req_path)])
        if completed.returncode != 0:
            raise RuntimeError('pip installation failed; inspect output above.')
    else:
        print('requirements.txt not found; skipping installation')

In [1]:
import json
import math
import tarfile
import hashlib
import shutil
import textwrap
import gc
from pathlib import Path
from datetime import datetime
from typing import Dict, Iterable, List, Optional, Tuple

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig
from transformers import AutoTokenizer

import yaml
import requests

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

### Load Configuration and Prepare Folders

In [5]:
!ls

00_prepare_data.ipynb       02_finetune_openstack.ipynb
01_pretrain_hdfs.ipynb


In [25]:
def load_yaml(path: Path) -> Dict:
    with path.open('r') as fh:
        return yaml.safe_load(fh)

data_config = load_yaml(Path('../configs/data.yaml'))
RAW_HDFS_DIR = Path(data_config['raw_paths']['hdfs'])
RAW_OPENSTACK_DIR = Path(data_config['raw_paths']['openstack'])
ARTIFACTS_DIR = Path(data_config['artifacts_dir'])
TOKENIZER_DIR = ARTIFACTS_DIR / 'tokenizer'
PARQUET_DIR = Path(data_config['preprocessing']['parquet_dir'])
METADATA_PATH = Path(data_config['preprocessing']['dataset_metadata'])
for folder in [RAW_HDFS_DIR, RAW_OPENSTACK_DIR, ARTIFACTS_DIR, TOKENIZER_DIR, PARQUET_DIR, METADATA_PATH.parent]:
    folder.mkdir(parents=True, exist_ok=True)
print('[setup] Configuration loaded and folders prepared.')

[setup] Configuration loaded and folders prepared.


## 2. Download Public Datasets

In [18]:
def stream_download(url: str, destination: Path) -> bool:
    chunk = 1 << 20
    try:
        with requests.get(url, stream=True, timeout=60) as resp:
            resp.raise_for_status()
            total = int(resp.headers.get('content-length', 0))
            with destination.open('wb') as fh, tqdm(total=total, unit='B', unit_scale=True, desc=f'download {destination.name}') as bar:
                for part in resp.iter_content(chunk_size=chunk):
                    if part:
                        fh.write(part)
                        bar.update(len(part))
        return True
    except Exception as exc:
        print(f'[warn] download failed from {url}: {exc}')
        return False

def sha256sum(path: Path) -> str:
    hash_obj = hashlib.sha256()
    with path.open('rb') as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b''):
            hash_obj.update(chunk)
    return hash_obj.hexdigest()

def ensure_download(urls: List[str], target: Path, expected_sha: Optional[str] = None) -> Path:
    for url in urls:
        if stream_download(url, target):
            break
    else:
        raise RuntimeError(f'all download mirrors failed for {target.name}')
    checksum = sha256sum(target)
    if expected_sha and checksum != expected_sha:
        raise ValueError(f'sha mismatch for {target.name}: expected {expected_sha}, got {checksum}')
    return target

def maybe_update_sha(config_section: Dict, key: str, computed_sha: str) -> None:
    current = config_section.get(key)
    if isinstance(current, str) and current:
        return
    config_section[key] = computed_sha
    with Path('../configs/data.yaml').open('w') as fh:
        yaml.safe_dump(data_config, fh, sort_keys=False)
    print(f'[sha] recorded SHA256 for {key}: {computed_sha}')

In [19]:
hdfs_cfg = data_config['datasets']['hdfs']
archive_path = RAW_HDFS_DIR / hdfs_cfg['archive_name']
archive_path.parent.mkdir(parents=True, exist_ok=True)
print('[download] HDFS corpus')
ensure_download(hdfs_cfg['urls'], archive_path, hdfs_cfg.get('sha256') or None)
computed_sha = sha256sum(archive_path)
maybe_update_sha(hdfs_cfg, 'sha256', computed_sha)

if not (RAW_HDFS_DIR / hdfs_cfg['log_file']).exists():
    print('[extract] HDFS archive')
    with tarfile.open(archive_path, 'r:gz') as tf:
        tf.extractall(RAW_HDFS_DIR)
else:
    print('[extract] HDFS log already present')

hdfs_log_path = RAW_HDFS_DIR / hdfs_cfg['log_file']
print(f'[ready] HDFS log at {hdfs_log_path}')

[download] HDFS corpus


download HDFS_1.tar.gz:   0%|          | 0.00/162M [00:00<?, ?B/s]

[sha] recorded SHA256 for sha256: 6ca6c5bc2671c66afecee9369a2fdac606bf33997a2494ac66aa411fe3e95169
[extract] HDFS log already present
[ready] HDFS log at data/hdfs/raw/HDFS.log


In [27]:
openstack_cfg = data_config['datasets']['openstack']
RAW_OPENSTACK_DIR.mkdir(parents=True, exist_ok=True)
paths: Dict[str, Path] = {}
print('[download] OpenStack normal logs')
normal_target = RAW_OPENSTACK_DIR / 'openstack_normal.log'
normal_urls = openstack_cfg['normal_urls']
normal_sha_cfg = (openstack_cfg.get('sha256') or {}).get('normal')
normal_acquired = False
if normal_target.exists():
    print('[cache] Detected existing openstack_normal.log, skipping download.')
    normal_acquired = True
    normal_urls = []
    maybe_update_sha(openstack_cfg['sha256'], 'normal', sha256sum(normal_target))
if not normal_acquired:
    for url in normal_urls:
        try:
            if url.endswith('.tar.gz'):
                tmp_tar = RAW_OPENSTACK_DIR / 'openstack_bundle.tar.gz'
                ensure_download([url], tmp_tar, None)
                with tarfile.open(tmp_tar, 'r:gz') as tf:
                    member = next((m for m in tf.getmembers() if m.name.endswith('openstack_normal.log')), None)
                    if member is None:
                        raise ValueError('openstack_normal.log not found inside archive')
                    tf.extract(member, RAW_OPENSTACK_DIR)
                    extracted_path = RAW_OPENSTACK_DIR / member.name
                    final_path = RAW_OPENSTACK_DIR / 'openstack_normal.log'
                    final_path.write_bytes(extracted_path.read_bytes())
                    if extracted_path.exists():
                        extracted_path.unlink()
                    extracted_dir = extracted_path.parent
                    if extracted_dir != RAW_OPENSTACK_DIR and extracted_dir.exists():
                        shutil.rmtree(extracted_dir, ignore_errors=True)
                tmp_tar.unlink(missing_ok=True)
                normal_acquired = True
            else:
                ensure_download([url], normal_target, normal_sha_cfg)
                normal_acquired = True
            if normal_acquired:
                maybe_update_sha(openstack_cfg['sha256'], 'normal', sha256sum(normal_target))
                break
        except Exception as exc:
            print(f'[warn] fallback download failed from {url}: {exc}')
if not normal_acquired:
    raise RuntimeError('All download mirrors failed for OpenStack normal logs')
paths['normal'] = normal_target

print('[download] OpenStack abnormal logs')
abnormal_target = RAW_OPENSTACK_DIR / 'openstack_abnormal.log'
if abnormal_target.exists():
    print('[cache] Detected existing openstack_abnormal.log, skipping download.')
else:
    ensure_download(openstack_cfg['abnormal_urls'], abnormal_target, (openstack_cfg.get('sha256') or {}).get('abnormal'))
    maybe_update_sha(openstack_cfg['sha256'], 'abnormal', sha256sum(abnormal_target))
paths['abnormal'] = abnormal_target
print(f'[ready] OpenStack logs -> {paths}')


[download] OpenStack normal logs
[cache] Detected existing openstack_normal.log, skipping download.
[sha] recorded SHA256 for normal: 81bc1a64d6788efe1a6f30b9a9958b14eb53c1bfe6a2d166bf23c821d8b77b71
[download] OpenStack abnormal logs
[cache] Detected existing openstack_abnormal.log, skipping download.
[ready] OpenStack logs -> {'normal': PosixPath('data/openstack/raw/openstack_normal.log'), 'abnormal': PosixPath('data/openstack/raw/openstack_abnormal.log')}


## 3. Load and Inspect Raw Logs

In [None]:
def read_hdfs_log(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, header=None, names=['raw'], sep='', engine='python')
    df['timestamp'] = pd.to_datetime(df['raw'].str.slice(0, 19), errors='coerce')
    df['content'] = df['raw']
    return df.dropna(subset=['timestamp']).reset_index(drop=True)

def read_openstack_logs(paths: Dict[str, Path]) -> pd.DataFrame:
    frames = []
    for label, path in paths.items():
        df = pd.read_csv(path, header=None, names=['raw'], sep='', engine='python')
        df['timestamp'] = pd.to_datetime(df['raw'].str.extract(r'(\d{4}-\d{2}-\d{2} [^ ]+)')[0], errors='coerce')
        df['label'] = 1 if label == 'abnormal' else 0
        df['content'] = df['raw']
        frames.append(df.dropna(subset=['timestamp']))
    return pd.concat(frames, ignore_index=True).sort_values('timestamp').reset_index(drop=True)

hdfs_df = read_hdfs_log(hdfs_log_path)
openstack_df = read_openstack_logs(paths)
print(f'[dataset] HDFS records: {len(hdfs_df):,} | OpenStack records: {len(openstack_df):,}')

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x107fda990>>
Traceback (most recent call last):
  File "/Users/ashrafshahreyar/miniconda3/envs/logAnom311/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [None]:
display(hdfs_df[['timestamp','content']].head())
display(openstack_df[['timestamp','label','content']].head())

## 4. Regex Normalization

In [None]:
import re

class LogNormalizer:
    def __init__(self, rules: Iterable[Dict[str, str]]):
        self.rules = [(rule['name'], re.compile(rule['pattern']), rule['replace']) for rule in rules]

    def apply(self, text: str) -> str:
        result = text
        for _, pattern, repl in self.rules:
            result = pattern.sub(repl, result)
        return result

    def normalize_series(self, series: pd.Series) -> pd.Series:
        return series.astype(str).apply(self.apply)

normalizer = LogNormalizer(data_config['normalizer']['rules'])
for df in (hdfs_df, openstack_df):
    df['normalized'] = normalizer.normalize_series(df['content'])

preview = pd.DataFrame({'original': hdfs_df['content'].head(5), 'normalized': hdfs_df['normalized'].head(5)})
display(preview)

## 5. Template Mining with Drain3

In [None]:
drain_cfg = data_config['drain3']
config_text = f"[Drain]
depth = {drain_cfg['depth']}
st = {drain_cfg['st']}
max_children = {drain_cfg['max_children']}
extra_delimiters = {','.join(drain_cfg['extra_delimiters'])}"
template_config = TemplateMinerConfig()
template_config.load_from_string(config_text)
template_miner = TemplateMiner(template_config=template_config)

records = []
transition_counts = {}
ngram = drain_cfg.get('template_transition_ngram', 3)
print('[drain3] streaming normalized HDFS logs')
cluster_trace: List[str] = []
for row in tqdm(hdfs_df.to_dict('records'), desc='Drain3 HDFS'):
    result = template_miner.add_log_message(row['normalized'])
    cluster_id = result['cluster_id']
    records.append({
        'timestamp': row['timestamp'],
        'template_id': cluster_id,
        'template': result['template_mined'],
        'content': row['normalized']
    })
    cluster_trace.append(cluster_id)

for idx in range(len(cluster_trace) - ngram + 1):
    key = tuple(cluster_trace[idx: idx + ngram])
    transition_counts[key] = transition_counts.get(key, 0) + 1

transition_output = Path(drain_cfg['transition_output'])
transition_output.parent.mkdir(parents=True, exist_ok=True)

template_df = pd.DataFrame(records)
index_path = transition_output.with_name('template_index.parquet')
template_df.to_parquet(index_path, index=False)

if transition_counts:
    transition_df = pd.DataFrame([
        {'ngram': '->'.join(key), 'count': count} for key, count in transition_counts.items()
    ])
    transition_df.to_parquet(transition_output, index=False)
    display(transition_df.sort_values('count', ascending=False).head(10))
else:
    print('[drain3] no transitions recorded; dataset may be small')

## 6. Tokenizer with Special Tokens

In [None]:
tokenizer_cfg = data_config['tokenizer']
tokenizer = AutoTokenizer.from_pretrained(tokenizer_cfg['base_model'], use_fast=True)
special_tokens = data_config['tokens']['special']
added = tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})
TOKENIZER_DIR.mkdir(parents=True, exist_ok=True)
tokenizer.save_pretrained(TOKENIZER_DIR)
print(f'[tokenizer] added {added} special tokens and saved to {TOKENIZER_DIR}')
print('sample tokens:', tokenizer.tokenize(hdfs_df['normalized'].iloc[0])[:30])

## 7. Build Tokenized Parquet Splits

In [None]:
from datasets import Dataset

def tokenize_dataframe(df: pd.DataFrame, max_length: int) -> Tuple[pd.DataFrame, float]:
    encodings = tokenizer(
        list(df['normalized']),
        padding=False,
        truncation=True,
        max_length=max_length,
        return_attention_mask=True
    )
    truncated = sum(len(ids) == max_length for ids in encodings['input_ids'])
    tokens = pd.DataFrame({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': encodings['input_ids'],
        'template_id': df.get('template_id', pd.Series([''] * len(df)))
    })
    if 'label' in df.columns:
        tokens['anomaly_label'] = df['label'].astype(int).values
    else:
        tokens['anomaly_label'] = 0
    tokens['timestamp'] = df['timestamp'].values
    tokens['raw'] = df['content'].values
    tokens['normalized'] = df['normalized'].values
    trunc_rate = truncated / max(len(df), 1)
    return tokens, trunc_rate

def time_splits(df: pd.DataFrame, splits_cfg: Dict[str, float]) -> Dict[str, pd.DataFrame]:
    df_sorted = df.sort_values('timestamp').reset_index(drop=True)
    n = len(df_sorted)
    train_end = int(n * splits_cfg['train'])
    val_end = train_end + int(n * splits_cfg['val'])
    return {'train': df_sorted.iloc[:train_end],'val': df_sorted.iloc[train_end:val_end],'test': df_sorted.iloc[val_end:]}

splits_cfg = data_config['splits']
hdfs_df = hdfs_df.merge(template_df[['timestamp','template_id']], on='timestamp', how='left')
hdfs_splits = time_splits(hdfs_df, splits_cfg)

hdfs_stats = {}
for split_name, split_df in hdfs_splits.items():
    tokens_df, trunc = tokenize_dataframe(split_df, data_config['tokenizer']['max_length'])
    file_path = PARQUET_DIR / f'hdfs_{split_name}.parquet'
    tokens_df.to_parquet(file_path, index=False)
    ds = Dataset.from_pandas(tokens_df.drop(columns=['raw','normalized']), preserve_index=False)
    ds.save_to_disk(str(PARQUET_DIR / f'hdfs_{split_name}_hf'))
    hdfs_stats[split_name] = {
        'count': len(tokens_df),
        'avg_length': float(tokens_df['input_ids'].map(len).mean()),
        'truncation_rate': round(trunc, 4)
    }
    print(f'[dataset] saved HDFS {split_name} split -> {file_path}')

display(pd.DataFrame(hdfs_stats).T)

In [None]:
openstack_splits = time_splits(openstack_df, splits_cfg)
openstack_stats = {}
for split_name, split_df in openstack_splits.items():
    tokens_df, trunc = tokenize_dataframe(split_df, data_config['tokenizer']['max_length'])
    file_path = PARQUET_DIR / f'openstack_{split_name}.parquet'
    tokens_df.to_parquet(file_path, index=False)
    ds = Dataset.from_pandas(tokens_df.drop(columns=['raw','normalized']), preserve_index=False)
    ds.save_to_disk(str(PARQUET_DIR / f'openstack_{split_name}_hf'))
    openstack_stats[split_name] = {
        'count': len(tokens_df),
        'avg_length': float(tokens_df['input_ids'].map(len).mean()),
        'truncation_rate': round(trunc, 4)
    }
    print(f'[dataset] saved OpenStack {split_name} split -> {file_path}')

display(pd.DataFrame(openstack_stats).T)

## 8. Persist Metadata

In [None]:
metadata = {
    'generated_at': datetime.utcnow().isoformat() + 'Z',
    'hdfs': hdfs_stats,
    'openstack': openstack_stats,
    'tokenizer_dir': str(TOKENIZER_DIR),
    'template_index': str(Path(data_config['drain3']['transition_output']).with_name('template_index.parquet')),
    'template_transition': str(Path(data_config['drain3']['transition_output']))
}
METADATA_PATH.write_text(json.dumps(metadata, indent=2))
print(f'[metadata] saved to {METADATA_PATH}')

## Artifacts Produced
- Tokenizer with special tokens -> `artifacts/tokenizer/`
- HDFS Parquet splits and HF datasets -> `artifacts/datasets/hdfs_*.parquet`, `*_hf/`
- OpenStack Parquet splits and HF datasets -> `artifacts/datasets/openstack_*.parquet`, `*_hf/`
- Drain3 template index and transitions -> `artifacts/drain3/`
- Dataset metadata summary -> `artifacts/metadata/datasets.json`

Continue with notebook `01_pretrain_hdfs.ipynb` for multi-GPU MLM pretraining.