From 55fb6b721df341ef68f33a3aedea0f7c6b9a1863 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Tue, 4 Nov 2025 17:45:46 -0800 Subject: [PATCH 01/66] bd-resilient-kv: add versioned store Adds a new variant of the resilient-kv that supports automatically rotating versioned entries --- bd-resilient-kv/{CLAUDE.md => AGENTS.md} | 0 bd-resilient-kv/VERSIONED_FORMAT.md | 237 ++++++++ .../src/kv_journal/memmapped_versioned.rs | 223 ++++++++ bd-resilient-kv/src/kv_journal/mod.rs | 4 + bd-resilient-kv/src/kv_journal/versioned.rs | 532 ++++++++++++++++++ bd-resilient-kv/src/lib.rs | 11 +- bd-resilient-kv/src/tests/mod.rs | 1 + .../src/tests/versioned_kv_store_test.rs | 396 +++++++++++++ bd-resilient-kv/src/versioned_kv_store.rs | 364 ++++++++++++ 9 files changed, 1767 insertions(+), 1 deletion(-) rename bd-resilient-kv/{CLAUDE.md => AGENTS.md} (100%) create mode 100644 bd-resilient-kv/VERSIONED_FORMAT.md create mode 100644 bd-resilient-kv/src/kv_journal/memmapped_versioned.rs create mode 100644 bd-resilient-kv/src/kv_journal/versioned.rs create mode 100644 bd-resilient-kv/src/tests/versioned_kv_store_test.rs create mode 100644 bd-resilient-kv/src/versioned_kv_store.rs diff --git a/bd-resilient-kv/CLAUDE.md b/bd-resilient-kv/AGENTS.md similarity index 100% rename from bd-resilient-kv/CLAUDE.md rename to bd-resilient-kv/AGENTS.md diff --git a/bd-resilient-kv/VERSIONED_FORMAT.md b/bd-resilient-kv/VERSIONED_FORMAT.md new file mode 100644 index 00000000..5a896786 --- /dev/null +++ b/bd-resilient-kv/VERSIONED_FORMAT.md @@ -0,0 +1,237 @@ +# Versioned Journal Format Design + +## Overview + +This document describes the versioned journal format (VERSION 2) that enables point-in-time recovery by tracking write versions for each operation. + +## Goals + +1. **Version Tracking**: Each write operation gets a unique, monotonically increasing version number +2. **Point-in-Time Recovery**: Ability to reconstruct exact state at any version +3. **Journal Rotation**: Periodic compaction with self-contained state in each journal +4. **Backward Compatible**: New format coexists with existing VERSION 1 + +## Design Philosophy + +Unlike traditional journal systems that use separate snapshot files, this design uses a **unified format** where: +- Each journal is self-contained with complete state embedded as regular entries +- No special "snapshot entry" format needed +- First N entries in a rotated journal are just regular versioned entries (all at same version) +- Simpler file structure and uniform entry format throughout + +## File Types + +### 1. Active Journal (`my_store.jrn`) +The current active journal receiving new writes. + +### 2. Archived Journals (`my_store.jrn.v00020000`, `my_store.jrn.v00030000`, etc.) +Previous journals, archived during rotation. Each contains complete state at rotation version plus subsequent incremental writes. The version number in the filename indicates the rotation/snapshot version. + +## Format Specification + +### Journal Format (VERSION 2) + +``` +| Position | Data | Type | Size | +|----------|--------------------------|----------------|---------| +| 0 | Format Version | u64 | 8 bytes | +| 8 | Position | u64 | 8 bytes | +| 16 | Type Code: Array Start | u8 | 1 byte | +| 17 | Metadata Object | BONJSON Object | varies | +| ... | Versioned Journal Entry | BONJSON Object | varies | +| ... 
| Versioned Journal Entry | BONJSON Object | varies | +``` + +**Metadata Object** (first entry in array): +```json +{ + "initialized": , + "format_version": 2, + "base_version": +} +``` + +**Versioned Journal Entry**: +```json +{ + "v": , + "t": , + "k": "", + "o": +} +``` + +Fields: +- `v` (version): Monotonic write version number +- `t` (timestamp): When the write occurred (ns since UNIX epoch) +- `k` (key): The key being written +- `o` (operation): The value (for SET) or null (for DELETE) + +## Journal Structure + +### Initial Journal +When first created with base version 1: +```json +{"initialized": 1699564800000000000, "format_version": 2, "base_version": 1} +{"v": 2, "t": 1699564801000000000, "k": "key1", "o": "value1"} +{"v": 3, "t": 1699564802000000000, "k": "key2", "o": "value2"} +... +``` + +### Rotated Journal +After rotation at version 30000, the new journal contains: +```json +{"initialized": 1699564900000000000, "format_version": 2, "base_version": 30000} +{"v": 30000, "t": 1699564900000000000, "k": "key1", "o": "value1"} // Compacted state +{"v": 30000, "t": 1699564900000000000, "k": "key2", "o": "value2"} // Compacted state +{"v": 30000, "t": 1699564900000000000, "k": "key3", "o": "value3"} // Compacted state +{"v": 30001, "t": 1699564901000000000, "k": "key4", "o": "value4"} // New write +{"v": 30002, "t": 1699564902000000000, "k": "key1", "o": "updated1"} // New write +... +``` + +Key observations: +- All compacted state entries have the same version (30000) +- These are regular journal entries, not a special format +- Incremental writes continue with version 30001+ +- Each rotated journal is self-contained and can be read independently + +## Rotation Process + +When high water mark is reached at version N: + +1. **Create New Journal**: Initialize fresh journal file (e.g., `my_store.jrn.tmp`) +2. **Write Compacted State**: Write all current key-value pairs as versioned entries at version N +3. **Archive Old Journal**: Rename `my_store.jrn` → `my_store.jrn.v{N}` +4. **Activate New Journal**: Rename `my_store.jrn.tmp` → `my_store.jrn` +5. **Callback**: Notify application for upload/cleanup of archived journal + +Example: +``` +Before rotation at v30000: + my_store.jrn # Active, base_version=20000, contains v20000-v30000 + +After rotation: + my_store.jrn # Active, base_version=30000, contains compacted state at v30000 + my_store.jrn.v30000 # Archived, contains v20000-v30000 +``` + +## Recovery Process + +### Current State Recovery +Simply read the active journal (`my_store.jrn`) and replay all entries. + +### Point-in-Time Recovery + +To recover state at target version T: + +1. **Find Correct Journal**: + - Check active journal's base_version and current_version range + - If T is in active journal range, use active journal + - Otherwise, find archived journal with appropriate version range + +2. **Replay Entries**: + - Read all entries from the journal + - Apply entries with version <= T + - Stop when reaching entries with version > T + +3. **Result**: Exact state at version T + +### Example Recovery Scenarios + +**File Structure:** +``` +my_store.jrn # Active, base_version=30000, current=35000 +my_store.jrn.v30000 # Archived, contains v20000-v30000 +my_store.jrn.v20000 # Archived, contains v10000-v20000 +``` + +**Recover at v25000:** +1. Load `my_store.jrn.v30000` (archived journal) +2. Replay entries with version <= 25000 +3. Result: State at v25000 + +**Recover at v30000:** +1. Load `my_store.jrn.v30000` (archived journal) +2. Replay all entries up to v30000 +3. 
Result: State at v30000 + +**Recover at v32000:** +1. Load `my_store.jrn` (active journal, base_version=30000) +2. Replay entries with version <= 32000 +3. Result: State at v32000 + +## Storage Efficiency + +**Space Requirements:** +- Active journal: Compacted state + recent writes since rotation +- Archived journals: Full history for their version ranges + +**Benefits of Unified Format:** +- Simpler file management (no separate snapshot + journal pairs) +- Each archived journal is self-contained +- Uniform entry format reduces code complexity +- Easy to understand and debug + +**Cleanup Strategy:** +- Keep N most recent archived journals for recovery +- Upload archived journals to remote storage +- Delete old archived journals after successful upload + +## API Usage + +### Basic Operations + +```rust +use bd_resilient_kv::VersionedKVStore; +use bd_bonjson::Value; + +// Create or open store +let mut store = VersionedKVStore::new("mystore.jrn", 1024 * 1024, None)?; + +// Writes return version numbers +let v1 = store.insert("key1".to_string(), Value::from(42))?; +let v2 = store.insert("key2".to_string(), Value::from("hello"))?; + +// Point-in-time recovery +let state_at_v1 = store.as_hashmap_at_version(v1)?; +``` + +### Rotation Callback + +```rust +// Set callback for rotation events +store.set_rotation_callback(Box::new(|old_path, new_path, version| { + println!("Rotated at version {}", version); + println!("Archived journal: {:?}", old_path); + println!("New active journal: {:?}", new_path); + // Upload old_path to remote storage... +})); +``` + +### Manual Rotation + +```rust +// Automatic rotation on high water mark +let version = store.insert("key".to_string(), Value::from("value"))?; +// Rotation happens automatically if high water mark exceeded + +// Or manually trigger rotation +store.rotate_journal()?; +``` + +## Migration from VERSION 1 + +VERSION 1 journals (without versioning) can coexist with VERSION 2: +- Existing VERSION 1 files continue to work with current `KVStore` +- New `VersionedKVStore` creates VERSION 2 journals +- No automatic migration (opt-in by using `VersionedKVStore`) + +## Implementation Notes + +1. **Version Counter Persistence**: Stored in metadata, initialized from journal on restart +2. **Atomicity**: Version increments are atomic with writes +3. **Monotonicity**: Versions never decrease or skip +4. **Concurrency**: Not thread-safe by design (same as current implementation) +5. **Format Field Names**: Use short names (`v`, `t`, `k`, `o`) to minimize storage overhead +6. **Self-Contained Journals**: Each rotated journal can be read independently without dependencies diff --git a/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs b/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs new file mode 100644 index 00000000..5eb55cff --- /dev/null +++ b/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs @@ -0,0 +1,223 @@ +// shared-core - bitdrift's common client/server libraries +// Copyright Bitdrift, Inc. All rights reserved. +// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + +use super::versioned::VersionedKVJournal; +use ahash::AHashMap; +use bd_bonjson::Value; +use memmap2::{MmapMut, MmapOptions}; +use std::fs::OpenOptions; +use std::path::Path; + +/// Memory-mapped implementation of a versioned key-value journal. 
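+/// See `VERSIONED_FORMAT.md` for the on-disk layout this journal writes.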
+/// +/// This implementation uses memory-mapped files to provide persistence while maintaining +/// the efficiency of in-memory operations. All changes are automatically synced to disk. +/// Each write operation receives a unique version number for point-in-time recovery. +/// +/// # Safety +/// During construction, we unsafely declare mmap's internal buffer as having a static +/// lifetime, but it's actually tied to the lifetime of `versioned_kv`. This works because +/// nothing external holds a reference to the buffer. +#[derive(Debug)] +pub struct MemMappedVersionedKVJournal { + // Note: mmap MUST de-init AFTER versioned_kv because mmap uses it. + mmap: MmapMut, + versioned_kv: VersionedKVJournal<'static>, +} + +impl MemMappedVersionedKVJournal { + /// Create a memory-mapped buffer from a file and convert it to a static lifetime slice. + /// + /// # Safety + /// The returned slice has a static lifetime, but it's actually tied to the lifetime of the + /// `MmapMut`. This is safe as long as the `MmapMut` is kept alive for the entire lifetime of + /// the slice usage. + #[allow(clippy::needless_pass_by_value)] + unsafe fn create_mmap_buffer( + file: std::fs::File, + ) -> anyhow::Result<(MmapMut, &'static mut [u8])> { + let mut mmap = unsafe { MmapOptions::new().map_mut(&file)? }; + + // Convert the mmap slice to a static lifetime slice + // This is safe because we keep the mmap alive for the lifetime of the struct + let buffer: &'static mut [u8] = + unsafe { std::slice::from_raw_parts_mut(mmap.as_mut_ptr(), mmap.len()) }; + + Ok((mmap, buffer)) + } + + /// Create a new memory-mapped versioned KV journal using the provided file path. + /// + /// The file will be created if it doesn't exist, or opened if it does. + /// The file will be resized to the specified size if it's different. + /// + /// # Arguments + /// * `file_path` - Path to the file to use for storage + /// * `size` - Minimum size of the file in bytes + /// * `base_version` - The starting version for this journal + /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 + /// + /// # Errors + /// Returns an error if the file cannot be created/opened or memory-mapped. + pub fn new>( + file_path: P, + size: usize, + base_version: u64, + high_water_mark_ratio: Option, + ) -> anyhow::Result { + let file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(false) + .open(file_path)?; + + let file_len = file.metadata()?.len(); + if file_len != size as u64 { + file.set_len(size as u64)?; + } + + let (mmap, buffer) = unsafe { Self::create_mmap_buffer(file)? }; + + let versioned_kv = VersionedKVJournal::new(buffer, base_version, high_water_mark_ratio)?; + + Ok(Self { mmap, versioned_kv }) + } + + /// Create a new memory-mapped versioned KV journal from an existing file. + /// + /// The file must already exist and contain a properly formatted versioned KV journal. + /// The file will be resized to the specified size if it's different. + /// + /// # Arguments + /// * `file_path` - Path to the existing file + /// * `size` - Size to resize the file to in bytes + /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 + /// + /// # Errors + /// Returns an error if the file cannot be opened, memory-mapped, or contains invalid data. + /// Note: If the new size is smaller than the current file size, data may be truncated. 
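+ /// The journal's base version and existing entries are read back from the file on open.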
+ pub fn from_file>( + file_path: P, + size: usize, + high_water_mark_ratio: Option, + ) -> anyhow::Result { + let file = OpenOptions::new().read(true).write(true).open(file_path)?; + + let file_len = file.metadata()?.len(); + if file_len != size as u64 { + file.set_len(size as u64)?; + } + + let (mmap, buffer) = unsafe { Self::create_mmap_buffer(file)? }; + + let versioned_kv = VersionedKVJournal::from_buffer(buffer, high_water_mark_ratio)?; + + Ok(Self { mmap, versioned_kv }) + } + + /// Set a key-value pair with automatic version increment. + /// + /// Returns the version number assigned to this write. + /// + /// # Errors + /// Returns an error if the journal entry cannot be written. + pub fn set_versioned(&mut self, key: &str, value: &Value) -> anyhow::Result { + self.versioned_kv.set_versioned(key, value) + } + + /// Delete a key with automatic version increment. + /// + /// Returns the version number assigned to this deletion. + /// + /// # Errors + /// Returns an error if the journal entry cannot be written. + pub fn delete_versioned(&mut self, key: &str) -> anyhow::Result { + self.versioned_kv.delete_versioned(key) + } + + /// Get the current version number. + #[must_use] + pub fn current_version(&self) -> u64 { + self.versioned_kv.current_version() + } + + /// Get the base version (first version in this journal). + #[must_use] + pub fn base_version(&self) -> u64 { + self.versioned_kv.base_version() + } + + /// Get the current high water mark position. + #[must_use] + pub fn high_water_mark(&self) -> usize { + self.versioned_kv.high_water_mark() + } + + /// Check if the high water mark has been triggered. + #[must_use] + pub fn is_high_water_mark_triggered(&self) -> bool { + self.versioned_kv.is_high_water_mark_triggered() + } + + /// Get the current buffer usage as a percentage (0.0 to 1.0). + #[must_use] + pub fn buffer_usage_ratio(&self) -> f32 { + self.versioned_kv.buffer_usage_ratio() + } + + /// Get the time when the journal was initialized (nanoseconds since UNIX epoch). + #[must_use] + pub fn get_init_time(&self) -> u64 { + self.versioned_kv.get_init_time() + } + + /// Reconstruct the hashmap by replaying all journal entries. + /// + /// # Errors + /// Returns an error if the buffer cannot be decoded. + pub fn as_hashmap(&self) -> anyhow::Result> { + self.versioned_kv.as_hashmap() + } + + /// Reconstruct the hashmap at a specific version by replaying entries up to that version. + /// + /// # Errors + /// Returns an error if the buffer cannot be decoded. + pub fn as_hashmap_at_version( + &self, + target_version: u64, + ) -> anyhow::Result> { + self.versioned_kv.as_hashmap_at_version(target_version) + } + + /// Synchronize changes to disk. + /// + /// This forces any changes in the memory-mapped region to be written to the underlying file. + /// Note that changes are typically synced automatically by the OS, but this provides + /// explicit control when needed. + /// + /// # Errors + /// Returns an error if the sync operation fails. + pub fn sync(&self) -> anyhow::Result<()> { + self.mmap.flush()?; + Ok(()) + } + + /// Get the size of the underlying file in bytes. 
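+ /// This is the length of the mapped region, which matches the size requested at construction.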
+ #[must_use] + pub fn file_size(&self) -> usize { + self.mmap.len() + } + + /// Get a copy of the buffer for testing purposes + #[cfg(test)] + #[must_use] + pub fn buffer_copy(&self) -> Vec { + self.versioned_kv.buffer_copy() + } +} diff --git a/bd-resilient-kv/src/kv_journal/mod.rs b/bd-resilient-kv/src/kv_journal/mod.rs index 050d030e..866f5c27 100644 --- a/bd-resilient-kv/src/kv_journal/mod.rs +++ b/bd-resilient-kv/src/kv_journal/mod.rs @@ -94,7 +94,11 @@ pub trait KVJournal { pub mod double_buffered; pub mod in_memory; pub mod memmapped; +pub mod memmapped_versioned; +pub mod versioned; pub use double_buffered::DoubleBufferedKVJournal; pub use in_memory::InMemoryKVJournal; pub use memmapped::MemMappedKVJournal; +pub use memmapped_versioned::MemMappedVersionedKVJournal; +pub use versioned::VersionedKVJournal; diff --git a/bd-resilient-kv/src/kv_journal/versioned.rs b/bd-resilient-kv/src/kv_journal/versioned.rs new file mode 100644 index 00000000..6277af17 --- /dev/null +++ b/bd-resilient-kv/src/kv_journal/versioned.rs @@ -0,0 +1,532 @@ +// shared-core - bitdrift's common client/server libraries +// Copyright Bitdrift, Inc. All rights reserved. +// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + +use ahash::AHashMap; +use bd_bonjson::Value; +use bd_bonjson::decoder::from_slice; +use bd_bonjson::encoder::encode_into_buf; +use bd_bonjson::serialize_primitives::serialize_array_begin; +use bd_client_common::error::InvariantError; +use bytes::BufMut; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::{SystemTime, UNIX_EPOCH}; + +/// Versioned implementation of a key-value journaling system that tracks write versions +/// for point-in-time recovery. +/// +/// Each write operation is assigned a monotonically increasing version number, enabling +/// exact state reconstruction at any historical version. +#[derive(Debug)] +pub struct VersionedKVJournal<'a> { + #[allow(dead_code)] + format_version: u64, + position: usize, + buffer: &'a mut [u8], + high_water_mark: usize, + high_water_mark_triggered: bool, + initialized_at_unix_time_ns: u64, + current_version: AtomicU64, + base_version: u64, // First version in this journal +} + +// Versioned KV files have the following structure: +// | Position | Data | Type | +// |----------|--------------------------|----------------| +// | 0 | Format Version | u64 | +// | 8 | Position | u64 | +// | 16 | Type Code: Array Start | u8 | +// | 17 | Metadata Object | BONJSON Object | +// | ... | Versioned Journal Entry | BONJSON Object | +// | ... | Versioned Journal Entry | BONJSON Object | +// +// Metadata object: {"initialized": , "format_version": 2, "base_version": } +// Journal entries: {"v": , "t": , "k": "", "o": } + +const VERSION: u64 = 2; // The versioned format version +const INVALID_VERSION: u64 = 0; // 0 will never be a valid version + +const HEADER_SIZE: usize = 16; +const ARRAY_BEGIN: usize = 16; +const METADATA_OFFSET: usize = 17; + +// Minimum buffer size for a valid journal +const MIN_BUFFER_SIZE: usize = HEADER_SIZE + 4; + +/// Get current timestamp in nanoseconds since UNIX epoch. 
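+/// Fails if the system clock is before the UNIX epoch or the nanosecond count overflows `u64`.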
+fn current_timestamp() -> anyhow::Result { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|_| InvariantError::Invariant.into()) + .and_then(|d| u64::try_from(d.as_nanos()).map_err(|_| InvariantError::Invariant.into())) +} + +fn read_version(buffer: &[u8]) -> anyhow::Result { + let version_bytes: [u8; 8] = buffer[.. 8].try_into()?; + let version = u64::from_le_bytes(version_bytes); + if version != VERSION { + anyhow::bail!("Unsupported version: {version}, expected {VERSION}"); + } + Ok(version) +} + +/// Write to the version field of a journal buffer. +fn write_version_field(buffer: &mut [u8], version: u64) { + let version_bytes = version.to_le_bytes(); + buffer[0 .. 8].copy_from_slice(&version_bytes); +} + +/// Write the version to a journal buffer. +fn write_version(buffer: &mut [u8]) { + write_version_field(buffer, VERSION); +} + +/// Invalidate the version field of a journal buffer. +fn invalidate_version(buffer: &mut [u8]) { + write_version_field(buffer, INVALID_VERSION); +} + +fn read_position(buffer: &[u8]) -> anyhow::Result { + let position_bytes: [u8; 8] = buffer[8 .. 16].try_into()?; + let position_u64 = u64::from_le_bytes(position_bytes); + let position = usize::try_from(position_u64) + .map_err(|_| anyhow::anyhow!("Position {position_u64} too large for usize"))?; + let buffer_len = buffer.len(); + if position >= buffer_len { + anyhow::bail!("Invalid position: {position}, buffer size: {buffer_len}",); + } + Ok(position) +} + +/// Write the position to a journal buffer. +fn write_position(buffer: &mut [u8], position: usize) { + let position_bytes = (position as u64).to_le_bytes(); + buffer[8 .. 16].copy_from_slice(&position_bytes); +} + +/// Read the bonjson payload in this buffer. +fn read_bonjson_payload(buffer: &[u8]) -> anyhow::Result { + let position = read_position(buffer)?; + let slice_to_decode = &buffer[ARRAY_BEGIN .. position]; + + match from_slice(slice_to_decode) { + Ok((_, decoded)) => Ok(decoded), + Err(bd_bonjson::decoder::DecodeError::Partial { partial_value, .. }) => Ok(partial_value), + Err(e) => anyhow::bail!("Failed to decode buffer: {e:?}"), + } +} + +/// Create and write the metadata section of a versioned journal. +fn write_metadata(buffer: &mut [u8], timestamp: u64, base_version: u64) -> anyhow::Result { + let buffer_len = buffer.len(); + let mut cursor = &mut buffer[METADATA_OFFSET ..]; + + // Create metadata object + let mut metadata = AHashMap::new(); + metadata.insert("initialized".to_string(), Value::Unsigned(timestamp)); + metadata.insert("format_version".to_string(), Value::Unsigned(VERSION)); + metadata.insert("base_version".to_string(), Value::Unsigned(base_version)); + + // Write metadata object + encode_into_buf(&mut cursor, &Value::Object(metadata)) + .map_err(|e| anyhow::anyhow!("Failed to encode metadata object: {e:?}"))?; + + Ok(buffer_len - cursor.remaining_mut()) +} + +/// Extract metadata from the buffer. 
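+/// Returns `(initialized_timestamp_ns, base_version)`; `base_version` falls back to 0 when the field is absent.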
+fn extract_metadata_from_buffer(buffer: &[u8]) -> anyhow::Result<(u64, u64)> { + let array = read_bonjson_payload(buffer)?; + if let Value::Array(entries) = array + && let Some(Value::Object(obj)) = entries.first() + { + let timestamp = if let Some(Value::Unsigned(ts)) = obj.get("initialized") { + *ts + } else if let Some(Value::Signed(ts)) = obj.get("initialized") { + #[allow(clippy::cast_sign_loss)] + (*ts as u64) + } else { + anyhow::bail!("No initialized timestamp found in metadata"); + }; + + let base_version = if let Some(Value::Unsigned(bv)) = obj.get("base_version") { + *bv + } else if let Some(Value::Signed(bv)) = obj.get("base_version") { + #[allow(clippy::cast_sign_loss)] + (*bv as u64) + } else { + 0 // Default to 0 if not found (for compatibility) + }; + + return Ok((timestamp, base_version)); + } + anyhow::bail!("No valid metadata found"); +} + +fn validate_buffer_len(buffer: &[u8]) -> anyhow::Result { + let buffer_len = buffer.len(); + if buffer_len < MIN_BUFFER_SIZE { + anyhow::bail!( + "Buffer too small: {buffer_len} bytes, but need at least {MIN_BUFFER_SIZE} bytes" + ); + } + Ok(buffer_len) +} + +/// Validate high water mark ratio and calculate the position from buffer length. +fn calculate_high_water_mark( + buffer_len: usize, + high_water_mark_ratio: Option, +) -> anyhow::Result { + let ratio = high_water_mark_ratio.unwrap_or(0.8); + if !(0.0 ..= 1.0).contains(&ratio) { + anyhow::bail!("High water mark ratio must be between 0.0 and 1.0, got: {ratio}"); + } + + #[allow( + clippy::cast_precision_loss, + clippy::cast_possible_truncation, + clippy::cast_sign_loss + )] + let high_water_mark = (buffer_len as f32 * ratio) as usize; + Ok(high_water_mark) +} + +impl<'a> VersionedKVJournal<'a> { + /// Create a new versioned journal using the provided buffer as storage space. + /// + /// # Arguments + /// * `buffer` - The storage buffer + /// * `base_version` - The starting version for this journal + /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 + /// + /// # Errors + /// Returns an error if serialization fails or if `high_water_mark_ratio` is invalid. + pub fn new( + buffer: &'a mut [u8], + base_version: u64, + high_water_mark_ratio: Option, + ) -> anyhow::Result { + // If this operation gets interrupted, the buffer must be considered invalid. + invalidate_version(buffer); + + let buffer_len = validate_buffer_len(buffer)?; + let high_water_mark = calculate_high_water_mark(buffer_len, high_water_mark_ratio)?; + + // Write array begin marker right after header + let mut cursor = &mut buffer[HEADER_SIZE ..]; + serialize_array_begin(&mut cursor).map_err(|_| InvariantError::Invariant)?; + + // Write metadata with current timestamp and base version + let timestamp = current_timestamp()?; + let position = write_metadata(buffer, timestamp, base_version)?; + + write_position(buffer, position); + write_version(buffer); + + Ok(Self { + format_version: VERSION, + position, + buffer, + high_water_mark, + high_water_mark_triggered: false, + initialized_at_unix_time_ns: timestamp, + current_version: AtomicU64::new(base_version), + base_version, + }) + } + + /// Create a new versioned journal with state loaded from the provided buffer. + /// + /// # Arguments + /// * `buffer` - The storage buffer containing existing versioned KV data + /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. 
Default: 0.8 + /// + /// # Errors + /// Returns an error if the buffer is invalid, corrupted, or if `high_water_mark_ratio` is + /// invalid. + pub fn from_buffer( + buffer: &'a mut [u8], + high_water_mark_ratio: Option, + ) -> anyhow::Result { + let buffer_len = validate_buffer_len(buffer)?; + let format_version = read_version(buffer)?; + let position = read_position(buffer)?; + let (init_timestamp, base_version) = extract_metadata_from_buffer(buffer)?; + let high_water_mark = calculate_high_water_mark(buffer_len, high_water_mark_ratio)?; + + // Find the highest version in the journal + let highest_version = Self::find_highest_version(buffer)?; + let current_version = highest_version.unwrap_or(base_version); + + Ok(Self { + format_version, + position, + buffer, + high_water_mark, + high_water_mark_triggered: position >= high_water_mark, + initialized_at_unix_time_ns: init_timestamp, + current_version: AtomicU64::new(current_version), + base_version, + }) + } + + /// Find the highest version number in the journal by scanning all entries. + fn find_highest_version(buffer: &[u8]) -> anyhow::Result> { + let array = read_bonjson_payload(buffer)?; + let mut max_version: Option = None; + + if let Value::Array(entries) = array { + for (index, entry) in entries.iter().enumerate() { + // Skip metadata (first entry) + if index == 0 { + continue; + } + + if let Value::Object(obj) = entry { + if let Some(Value::Unsigned(v)) = obj.get("v") { + max_version = Some(max_version.map_or(*v, |current| current.max(*v))); + } else if let Some(Value::Signed(v)) = obj.get("v") { + #[allow(clippy::cast_sign_loss)] + let version = *v as u64; + max_version = Some(max_version.map_or(version, |current| current.max(version))); + } + } + } + } + + Ok(max_version) + } + + /// Get the current version number. + #[must_use] + pub fn current_version(&self) -> u64 { + self.current_version.load(Ordering::SeqCst) + } + + /// Get the base version (first version in this journal). + #[must_use] + pub fn base_version(&self) -> u64 { + self.base_version + } + + fn set_position(&mut self, position: usize) { + self.position = position; + write_position(self.buffer, position); + self.check_high_water_mark(); + } + + fn check_high_water_mark(&mut self) { + if self.position >= self.high_water_mark { + self.trigger_high_water(); + } + } + + fn trigger_high_water(&mut self) { + self.high_water_mark_triggered = true; + } + + /// Write a versioned journal entry. + fn write_versioned_entry( + &mut self, + version: u64, + key: &str, + value: &Value, + ) -> anyhow::Result<()> { + let buffer_len = self.buffer.len(); + let mut cursor = &mut self.buffer[self.position ..]; + + // Create entry object: {"v": version, "t": timestamp, "k": key, "o": value} + let timestamp = current_timestamp()?; + let mut entry = AHashMap::new(); + entry.insert("v".to_string(), Value::Unsigned(version)); + entry.insert("t".to_string(), Value::Unsigned(timestamp)); + entry.insert("k".to_string(), Value::String(key.to_string())); + entry.insert("o".to_string(), value.clone()); + + encode_into_buf(&mut cursor, &Value::Object(entry)) + .map_err(|e| anyhow::anyhow!("Failed to encode versioned entry: {e:?}"))?; + + let remaining = cursor.remaining_mut(); + self.set_position(buffer_len - remaining); + Ok(()) + } + + /// Set a key-value pair with automatic version increment. 
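+ /// Returns the newly assigned version, i.e. the previous current version plus one.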
+ pub fn set_versioned(&mut self, key: &str, value: &Value) -> anyhow::Result { + let version = self.current_version.fetch_add(1, Ordering::SeqCst) + 1; + self.write_versioned_entry(version, key, value)?; + Ok(version) + } + + /// Delete a key with automatic version increment. + pub fn delete_versioned(&mut self, key: &str) -> anyhow::Result { + let version = self.current_version.fetch_add(1, Ordering::SeqCst) + 1; + self.write_versioned_entry(version, key, &Value::Null)?; + Ok(version) + } + + /// Get the high water mark position. + #[must_use] + pub fn high_water_mark(&self) -> usize { + self.high_water_mark + } + + /// Check if the high water mark has been triggered. + #[must_use] + pub fn is_high_water_mark_triggered(&self) -> bool { + self.high_water_mark_triggered + } + + /// Get the current buffer usage as a percentage (0.0 to 1.0). + #[must_use] + pub fn buffer_usage_ratio(&self) -> f32 { + #[allow(clippy::cast_precision_loss)] + let position_f32 = self.position as f32; + #[allow(clippy::cast_precision_loss)] + let buffer_len_f32 = self.buffer.len() as f32; + position_f32 / buffer_len_f32 + } + + /// Get the initialization timestamp. + #[must_use] + pub fn get_init_time(&self) -> u64 { + self.initialized_at_unix_time_ns + } + + /// Reconstruct the hashmap by replaying all journal entries. + pub fn as_hashmap(&self) -> anyhow::Result> { + let array = read_bonjson_payload(self.buffer)?; + let mut map = AHashMap::new(); + + if let Value::Array(entries) = array { + for (index, entry) in entries.iter().enumerate() { + // Skip metadata (first entry) + if index == 0 { + continue; + } + + if let Value::Object(obj) = entry { + // Extract key and operation from versioned entry + if let Some(Value::String(key)) = obj.get("k") + && let Some(operation) = obj.get("o") + { + if operation.is_null() { + map.remove(key); + } else { + map.insert(key.clone(), operation.clone()); + } + } + } + } + } + + Ok(map) + } + + /// Reconstruct the hashmap at a specific version by replaying entries up to that version. + pub fn as_hashmap_at_version( + &self, + target_version: u64, + ) -> anyhow::Result> { + let array = read_bonjson_payload(self.buffer)?; + let mut map = AHashMap::new(); + + if let Value::Array(entries) = array { + for (index, entry) in entries.iter().enumerate() { + // Skip metadata (first entry) + if index == 0 { + continue; + } + + if let Value::Object(obj) = entry { + // Check version + let entry_version = if let Some(Value::Unsigned(v)) = obj.get("v") { + *v + } else if let Some(Value::Signed(v)) = obj.get("v") { + #[allow(clippy::cast_sign_loss)] + (*v as u64) + } else { + continue; // Skip entries without version + }; + + // Only apply entries up to target version + if entry_version > target_version { + break; + } + + // Extract key and operation + if let Some(Value::String(key)) = obj.get("k") + && let Some(operation) = obj.get("o") + { + if operation.is_null() { + map.remove(key); + } else { + map.insert(key.clone(), operation.clone()); + } + } + } + } + } + + Ok(map) + } + + /// Get a copy of the buffer for testing purposes + #[cfg(test)] + #[must_use] + pub fn buffer_copy(&self) -> Vec { + self.buffer.to_vec() + } +} + +/// Rotation utilities for creating new journals with compacted state +impl<'a> VersionedKVJournal<'a> { + /// Create a new journal initialized with the compacted state from a snapshot version. 
+ /// + /// The new journal will have all current key-value pairs written as versioned entries + /// at the `snapshot_version`, followed by the ability to continue with incremental writes. + /// + /// # Arguments + /// * `buffer` - The buffer to write the new journal to + /// * `snapshot_version` - The version to assign to all compacted state entries + /// * `state` - The current key-value state to write + /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark + /// + /// # Errors + /// Returns an error if serialization fails or buffer is too small. + pub fn create_rotated_journal( + buffer: &'a mut [u8], + snapshot_version: u64, + state: &AHashMap, + high_water_mark_ratio: Option, + ) -> anyhow::Result { + // Create a new journal with the snapshot version as the base + let mut journal = Self::new(buffer, snapshot_version, high_water_mark_ratio)?; + + // Write all current state as versioned entries at the snapshot version + let timestamp = current_timestamp()?; + for (key, value) in state { + let buffer_len = journal.buffer.len(); + let mut cursor = &mut journal.buffer[journal.position ..]; + + // Create entry object: {"v": version, "t": timestamp, "k": key, "o": value} + let mut entry = AHashMap::new(); + entry.insert("v".to_string(), Value::Unsigned(snapshot_version)); + entry.insert("t".to_string(), Value::Unsigned(timestamp)); + entry.insert("k".to_string(), Value::String(key.clone())); + entry.insert("o".to_string(), value.clone()); + + encode_into_buf(&mut cursor, &Value::Object(entry)) + .map_err(|e| anyhow::anyhow!("Failed to encode state entry: {e:?}"))?; + + let remaining = cursor.remaining_mut(); + journal.set_position(buffer_len - remaining); + } + + Ok(journal) + } +} diff --git a/bd-resilient-kv/src/lib.rs b/bd-resilient-kv/src/lib.rs index dc911eec..e9e399c3 100644 --- a/bd-resilient-kv/src/lib.rs +++ b/bd-resilient-kv/src/lib.rs @@ -19,6 +19,15 @@ mod tests; pub mod kv_journal; pub mod kv_store; +pub mod versioned_kv_store; -pub use kv_journal::{DoubleBufferedKVJournal, InMemoryKVJournal, KVJournal, MemMappedKVJournal}; +pub use kv_journal::{ + DoubleBufferedKVJournal, + InMemoryKVJournal, + KVJournal, + MemMappedKVJournal, + MemMappedVersionedKVJournal, + VersionedKVJournal, +}; pub use kv_store::KVStore; +pub use versioned_kv_store::{RotationCallback, VersionedKVStore}; diff --git a/bd-resilient-kv/src/tests/mod.rs b/bd-resilient-kv/src/tests/mod.rs index 58b0b4fd..6e2af215 100644 --- a/bd-resilient-kv/src/tests/mod.rs +++ b/bd-resilient-kv/src/tests/mod.rs @@ -33,3 +33,4 @@ pub mod error_handling_test; pub mod kv_store_test; pub mod kv_test; pub mod memmapped_test; +pub mod versioned_kv_store_test; diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs new file mode 100644 index 00000000..5ae93d70 --- /dev/null +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -0,0 +1,396 @@ +// shared-core - bitdrift's common client/server libraries +// Copyright Bitdrift, Inc. All rights reserved. 
+// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + +use crate::VersionedKVStore; +use bd_bonjson::Value; +use std::sync::{Arc, Mutex}; +use tempfile::TempDir; + +#[test] +fn test_versioned_store_new() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let store = VersionedKVStore::new(&file_path, 4096, None)?; + + // Should start empty + assert!(store.is_empty()); + assert_eq!(store.len(), 0); + assert_eq!(store.base_version(), 1); // Base version starts at 1 + assert_eq!(store.current_version(), 1); + + Ok(()) +} + +#[test] +fn test_versioned_store_basic_operations() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + // Test insert with version tracking + let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + assert_eq!(v1, 2); // First write is version 2 (base is 1) + + let retrieved = store.get("key1"); + assert_eq!(retrieved, Some(&Value::String("value1".to_string()))); + + // Test overwrite + let v2 = store.insert("key1".to_string(), Value::String("value2".to_string()))?; + assert_eq!(v2, 3); // Second write is version 3 + assert!(v2 > v1); + + let retrieved = store.get("key1"); + assert_eq!(retrieved, Some(&Value::String("value2".to_string()))); + + // Test contains_key + assert!(store.contains_key("key1")); + assert!(!store.contains_key("nonexistent")); + + // Test len and is_empty + assert_eq!(store.len(), 1); + assert!(!store.is_empty()); + + // Current version should track latest write + assert_eq!(store.current_version(), v2); + + Ok(()) +} + +#[test] +fn test_versioned_store_remove() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + // Insert some values + let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + let v2 = store.insert("key2".to_string(), Value::String("value2".to_string()))?; + + assert_eq!(store.len(), 2); + assert!(v2 > v1); + + // Remove a key + let v3 = store.remove("key1")?; + assert!(v3.is_some()); + assert!(v3.unwrap() > v2); + + assert_eq!(store.len(), 1); + assert!(!store.contains_key("key1")); + assert!(store.contains_key("key2")); + + // Remove non-existent key + let removed = store.remove("nonexistent")?; + assert!(removed.is_none()); + + Ok(()) +} + +#[test] +fn test_point_in_time_recovery() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + // Create a sequence of writes + let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + let v2 = store.insert("key2".to_string(), Value::String("value2".to_string()))?; + let v3 = store.insert("key1".to_string(), Value::String("updated1".to_string()))?; + let v4 = store.remove("key2")?; + + // Current state should have key1=updated1, key2 deleted + assert_eq!( + store.get("key1"), + Some(&Value::String("updated1".to_string())) + ); + assert_eq!(store.get("key2"), None); + assert_eq!(store.len(), 1); + + // Recover at v1: should have key1=value1 + let state_v1 = store.as_hashmap_at_version(v1)?; + assert_eq!(state_v1.len(), 1); 
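+ // key2 had not been written yet at v1, so only key1 is present.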
+ assert_eq!( + state_v1.get("key1"), + Some(&Value::String("value1".to_string())) + ); + + // Recover at v2: should have key1=value1, key2=value2 + let state_v2 = store.as_hashmap_at_version(v2)?; + assert_eq!(state_v2.len(), 2); + assert_eq!( + state_v2.get("key1"), + Some(&Value::String("value1".to_string())) + ); + assert_eq!( + state_v2.get("key2"), + Some(&Value::String("value2".to_string())) + ); + + // Recover at v3: should have key1=updated1, key2=value2 + let state_v3 = store.as_hashmap_at_version(v3)?; + assert_eq!(state_v3.len(), 2); + assert_eq!( + state_v3.get("key1"), + Some(&Value::String("updated1".to_string())) + ); + assert_eq!( + state_v3.get("key2"), + Some(&Value::String("value2".to_string())) + ); + + // Recover at v4: should have key1=updated1, key2 deleted + let state_v4 = store.as_hashmap_at_version(v4.unwrap())?; + assert_eq!(state_v4.len(), 1); + assert_eq!( + state_v4.get("key1"), + Some(&Value::String("updated1".to_string())) + ); + assert!(!state_v4.contains_key("key2")); + + Ok(()) +} + +#[test] +fn test_persistence_and_reload() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let v1; + let v2; + + // Create store and write some data + { + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + v2 = store.insert("key2".to_string(), Value::Signed(42))?; + store.sync()?; + } + + // Reopen and verify data persisted + { + let store = VersionedKVStore::open_existing(&file_path, 4096, None)?; + assert_eq!(store.len(), 2); + assert_eq!( + store.get("key1"), + Some(&Value::String("value1".to_string())) + ); + assert_eq!(store.get("key2"), Some(&Value::Signed(42))); + + // Version numbers should be preserved + assert_eq!(store.current_version(), v2); + + // Point-in-time recovery should still work + let state_v1 = store.as_hashmap_at_version(v1)?; + assert_eq!(state_v1.len(), 1); + assert_eq!( + state_v1.get("key1"), + Some(&Value::String("value1".to_string())) + ); + } + + Ok(()) +} + +#[test] +fn test_null_value_is_deletion() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + // Insert a value + store.insert("key1".to_string(), Value::String("value1".to_string()))?; + assert!(store.contains_key("key1")); + + // Insert null to delete + store.insert("key1".to_string(), Value::Null)?; + assert!(!store.contains_key("key1")); + assert_eq!(store.len(), 0); + + Ok(()) +} + +#[test] +fn test_rotation_callback() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + // Use a small buffer and low high water mark to trigger rotation easily + let mut store = VersionedKVStore::new(&file_path, 1024, Some(0.3))?; + + // Set up callback to track rotation events + let callback_data = Arc::new(Mutex::new(Vec::new())); + let callback_data_clone = Arc::clone(&callback_data); + + store.set_rotation_callback(Box::new(move |old_path, new_path, version| { + let mut data = callback_data_clone.lock().unwrap(); + data.push((old_path.to_path_buf(), new_path.to_path_buf(), version)); + })); + + // Write enough data to trigger rotation + let mut last_version = 0; + for i in 0 .. 
100 { + let key = format!("key{}", i); + let value = Value::String(format!("value_{}_with_some_extra_padding", i)); + last_version = store.insert(key, value)?; + + // Rotation happens automatically inside insert when high water mark is triggered + let data = callback_data.lock().unwrap(); + if !data.is_empty() { + break; + } + } + + // Check that callback was invoked + let data = callback_data.lock().unwrap(); + assert!(data.len() >= 1, "Expected at least one rotation event"); + + let (old_path, new_path, rotation_version) = &data[0]; + assert!(old_path.to_string_lossy().contains(".v")); + assert_eq!(new_path, &file_path); + assert!(*rotation_version <= last_version); + + Ok(()) +} + +#[test] +fn test_manual_rotation() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + // Insert some data + let _v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + let v2 = store.insert("key2".to_string(), Value::String("value2".to_string()))?; + + // Manually trigger rotation + let rotation_version = store.current_version(); + store.rotate_journal()?; + + // Verify archived file exists + let archived_path = temp_dir + .path() + .join(format!("test.jrn.v{}", rotation_version)); + assert!(archived_path.exists()); + + // Verify active journal still works + let v3 = store.insert("key3".to_string(), Value::String("value3".to_string()))?; + assert!(v3 > v2); + assert_eq!(store.len(), 3); + + // Verify data is intact + assert_eq!( + store.get("key1"), + Some(&Value::String("value1".to_string())) + ); + assert_eq!( + store.get("key2"), + Some(&Value::String("value2".to_string())) + ); + assert_eq!( + store.get("key3"), + Some(&Value::String("value3".to_string())) + ); + + // New journal should have base version at rotation point + assert_eq!(store.base_version(), rotation_version); + + Ok(()) +} + +#[test] +fn test_rotation_preserves_state() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + // Create complex state + store.insert("key1".to_string(), Value::String("value1".to_string()))?; + store.insert("key2".to_string(), Value::Signed(42))?; + store.insert("key3".to_string(), Value::Bool(true))?; + store.insert("key4".to_string(), Value::Float(3.14159))?; + + let pre_rotation_state = store.as_hashmap().clone(); + let pre_rotation_version = store.current_version(); + + // Rotate + store.rotate_journal()?; + + // Verify state is preserved exactly + let post_rotation_state = store.as_hashmap(); + assert_eq!(&pre_rotation_state, post_rotation_state); + assert_eq!(store.len(), 4); + + // Verify we can continue writing + let v_new = store.insert("key5".to_string(), Value::String("value5".to_string()))?; + assert!(v_new > pre_rotation_version); + assert_eq!(store.len(), 5); + + Ok(()) +} + +#[test] +fn test_empty_store_operations() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + // Operations on empty store + assert_eq!(store.get("nonexistent"), None); + assert!(!store.contains_key("nonexistent")); + assert_eq!(store.remove("nonexistent")?, None); + assert!(store.is_empty()); + assert_eq!(store.len(), 0); + + // Point-in-time recovery of empty state + let state = store.as_hashmap_at_version(1)?; + 
assert!(state.is_empty()); + + Ok(()) +} + +#[test] +fn test_version_monotonicity() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let mut last_version = store.current_version(); + + // Perform various operations and ensure version always increases + for i in 0 .. 20 { + let op_version = if i % 3 == 0 { + store.insert(format!("key{}", i), Value::Signed(i as i64))? + } else if i % 3 == 1 { + store.insert( + format!("key{}", i / 3), + Value::String(format!("updated{}", i)), + )? + } else { + store + .remove(&format!("key{}", i / 3))? + .unwrap_or(last_version) + }; + + assert!( + op_version >= last_version, + "Version should be monotonically increasing" + ); + last_version = op_version; + } + + assert_eq!(store.current_version(), last_version); + + Ok(()) +} diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs new file mode 100644 index 00000000..bc0cd761 --- /dev/null +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -0,0 +1,364 @@ +// shared-core - bitdrift's common client/server libraries +// Copyright Bitdrift, Inc. All rights reserved. +// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + +use crate::kv_journal::{MemMappedVersionedKVJournal, VersionedKVJournal}; +use ahash::AHashMap; +use bd_bonjson::Value; +use std::path::{Path, PathBuf}; + +/// Callback invoked when journal rotation occurs. +/// +/// The callback receives: +/// - `old_journal_path`: The path to the archived journal file that was just rotated out +/// - `new_journal_path`: The path to the new active journal file +/// - `rotation_version`: The version at which rotation occurred (snapshot version) +/// +/// This callback can be used to trigger asynchronous upload of archived journals to remote +/// storage, perform cleanup, or other post-rotation operations. +pub type RotationCallback = Box; + +/// A persistent key-value store with version tracking for point-in-time recovery. +/// +/// `VersionedKVStore` provides HashMap-like semantics backed by a versioned journal that +/// assigns a monotonically increasing version number to each write operation. This enables: +/// - Point-in-time recovery to any historical version +/// - Automatic journal rotation when high water mark is reached +/// - Optional callbacks for post-rotation operations (e.g., remote backup) +/// +/// For performance optimization, `VersionedKVStore` maintains an in-memory cache of the +/// current key-value data to provide O(1) read operations and avoid expensive journal +/// decoding on every access. +/// +/// # Rotation Strategy +/// When the journal reaches its high water mark, the store automatically: +/// 1. Creates a new journal file with a rotated name (e.g., `store.jrn.v12345`) +/// 2. Writes the current state as versioned entries at the rotation version +/// 3. Archives the old journal for potential upload/cleanup +/// 4. 
Continues normal operations in the new journal +/// +/// # Example +/// ```ignore +/// use bd_resilient_kv::VersionedKVStore; +/// use bd_bonjson::Value; +/// +/// let mut store = VersionedKVStore::new("mystore.jrn", 1024 * 1024, None)?; +/// +/// // Insert with version tracking +/// let v1 = store.insert("key1".to_string(), Value::from(42))?; +/// let v2 = store.insert("key2".to_string(), Value::from("hello"))?; +/// +/// // Point-in-time recovery +/// let state_at_v1 = store.as_hashmap_at_version(v1)?; +/// ``` +pub struct VersionedKVStore { + journal: MemMappedVersionedKVJournal, + cached_map: AHashMap, + base_path: PathBuf, + buffer_size: usize, + high_water_mark_ratio: Option, + rotation_callback: Option, +} + +impl VersionedKVStore { + /// Create a new `VersionedKVStore` with the specified path and buffer size. + /// + /// If the file already exists, it will be loaded with its existing contents. + /// If the specified size is larger than an existing file, it will be resized while preserving + /// data. If the specified size is smaller and the existing data doesn't fit, a fresh journal + /// will be created. + /// + /// # Arguments + /// * `file_path` - Path for the journal file + /// * `buffer_size` - Size in bytes for the journal buffer + /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 + /// + /// # Errors + /// Returns an error if the journal file cannot be created/opened or if initialization fails. + pub fn new>( + file_path: P, + buffer_size: usize, + high_water_mark_ratio: Option, + ) -> anyhow::Result { + let path = file_path.as_ref(); + let base_path = path.to_path_buf(); + + let journal = if path.exists() { + // Try to open existing journal + MemMappedVersionedKVJournal::from_file(path, buffer_size, high_water_mark_ratio).or_else( + |_| { + // Data is corrupt or unreadable, create fresh with base version 1 + MemMappedVersionedKVJournal::new(path, buffer_size, 1, high_water_mark_ratio) + }, + )? + } else { + // Create new journal with base version 1 + MemMappedVersionedKVJournal::new(path, buffer_size, 1, high_water_mark_ratio)? + }; + + let cached_map = journal.as_hashmap()?; + + Ok(Self { + journal, + cached_map, + base_path, + buffer_size, + high_water_mark_ratio, + rotation_callback: None, + }) + } + + /// Open an existing `VersionedKVStore` from a pre-existing journal file. + /// + /// Unlike `new()`, this method requires the journal file to exist and will fail if it's + /// missing. + /// + /// # Arguments + /// * `file_path` - Path to the existing journal file + /// * `buffer_size` - Size in bytes for the journal buffer + /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 + /// + /// # Errors + /// Returns an error if: + /// - The journal file does not exist + /// - The journal file cannot be opened + /// - The journal file contains invalid data + /// - Initialization fails + pub fn open_existing>( + file_path: P, + buffer_size: usize, + high_water_mark_ratio: Option, + ) -> anyhow::Result { + let path = file_path.as_ref(); + let base_path = path.to_path_buf(); + + let journal = MemMappedVersionedKVJournal::from_file(path, buffer_size, high_water_mark_ratio)?; + let cached_map = journal.as_hashmap()?; + + Ok(Self { + journal, + cached_map, + base_path, + buffer_size, + high_water_mark_ratio, + rotation_callback: None, + }) + } + + /// Set a callback to be invoked when journal rotation occurs. 
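+ /// Only one callback is kept at a time; setting a new one replaces the previous callback.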
+ /// + /// The callback receives the path to the archived journal file, the new active journal file, + /// and the rotation version. This can be used to trigger asynchronous upload of archived + /// journals to remote storage. + pub fn set_rotation_callback(&mut self, callback: RotationCallback) { + self.rotation_callback = Some(callback); + } + + /// Get a value by key. + /// + /// This operation is O(1) as it reads from the in-memory cache. + #[must_use] + pub fn get(&self, key: &str) -> Option<&Value> { + self.cached_map.get(key) + } + + /// Insert a value for a key, returning the version number assigned to this write. + /// + /// Note: Inserting `Value::Null` is equivalent to removing the key. + /// + /// # Errors + /// Returns an error if the value cannot be written to the journal. + pub fn insert(&mut self, key: String, value: Value) -> anyhow::Result { + let version = if matches!(value, Value::Null) { + // Inserting null is equivalent to deletion + let version = self.journal.delete_versioned(&key)?; + self.cached_map.remove(&key); + version + } else { + let version = self.journal.set_versioned(&key, &value)?; + self.cached_map.insert(key, value); + version + }; + + // Check if rotation is needed + if self.journal.is_high_water_mark_triggered() { + self.rotate_journal()?; + } + + Ok(version) + } + + /// Remove a key and return the version number assigned to this deletion. + /// + /// Returns `None` if the key didn't exist, otherwise returns the version number. + /// + /// # Errors + /// Returns an error if the deletion cannot be written to the journal. + pub fn remove(&mut self, key: &str) -> anyhow::Result> { + if !self.cached_map.contains_key(key) { + return Ok(None); + } + + let version = self.journal.delete_versioned(key)?; + self.cached_map.remove(key); + + // Check if rotation is needed + if self.journal.is_high_water_mark_triggered() { + self.rotate_journal()?; + } + + Ok(Some(version)) + } + + /// Check if the store contains a key. + /// + /// This operation is O(1) as it reads from the in-memory cache. + #[must_use] + pub fn contains_key(&self, key: &str) -> bool { + self.cached_map.contains_key(key) + } + + /// Get the number of key-value pairs in the store. + /// + /// This operation is O(1) as it reads from the in-memory cache. + #[must_use] + pub fn len(&self) -> usize { + self.cached_map.len() + } + + /// Check if the store is empty. + /// + /// This operation is O(1) as it reads from the in-memory cache. + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Get a reference to the current hash map. + /// + /// This operation is O(1) as it reads from the in-memory cache. + #[must_use] + pub fn as_hashmap(&self) -> &AHashMap { + &self.cached_map + } + + /// Reconstruct the hashmap at a specific version by replaying entries up to that version. + /// + /// This allows point-in-time recovery to any historical version in the current journal. + /// + /// # Errors + /// Returns an error if the journal cannot be decoded or the version is out of range. + pub fn as_hashmap_at_version( + &self, + target_version: u64, + ) -> anyhow::Result> { + self.journal.as_hashmap_at_version(target_version) + } + + /// Get the current version number. + #[must_use] + pub fn current_version(&self) -> u64 { + self.journal.current_version() + } + + /// Get the base version (first version in this journal). + #[must_use] + pub fn base_version(&self) -> u64 { + self.journal.base_version() + } + + /// Synchronize changes to disk. 
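+ /// This delegates to the underlying memory-mapped journal, which flushes the mapped region to the file.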
+ /// + /// # Errors + /// Returns an error if the sync operation fails. + pub fn sync(&self) -> anyhow::Result<()> { + self.journal.sync() + } + + /// Get the current buffer usage ratio (0.0 to 1.0). + #[must_use] + pub fn buffer_usage_ratio(&self) -> f32 { + self.journal.buffer_usage_ratio() + } + + /// Check if the high water mark has been triggered. + #[must_use] + pub fn is_high_water_mark_triggered(&self) -> bool { + self.journal.is_high_water_mark_triggered() + } + + /// Manually trigger journal rotation. + /// + /// This will create a new journal with the current state compacted and archive the old journal. + /// Rotation typically happens automatically when the high water mark is reached, but this + /// method allows manual control when needed. + /// + /// # Errors + /// Returns an error if rotation fails. + pub fn rotate_journal(&mut self) -> anyhow::Result<()> { + let rotation_version = self.journal.current_version(); + + // Generate archived journal path with rotation version + let archived_path = self.generate_archived_path(rotation_version); + + // Create new journal with rotated state + let new_journal = self.create_rotated_journal(rotation_version)?; + + // Replace old journal with new one + let old_journal = std::mem::replace(&mut self.journal, new_journal); + + // Move old journal to archived location + drop(old_journal); // Release mmap before moving file + std::fs::rename(&self.base_path, &archived_path)?; + + // Rename new journal to base path + let temp_path = self.base_path.with_extension("jrn.tmp"); + std::fs::rename(&temp_path, &self.base_path)?; + + // Invoke rotation callback if set + if let Some(ref mut callback) = self.rotation_callback { + callback(&archived_path, &self.base_path, rotation_version); + } + + Ok(()) + } + + /// Generate the archived journal path for a given rotation version. + fn generate_archived_path(&self, rotation_version: u64) -> PathBuf { + let mut path = self.base_path.clone(); + if let Some(file_name) = path.file_name() { + let new_name = format!("{}.v{}", file_name.to_string_lossy(), rotation_version); + path.set_file_name(new_name); + } + path + } + + /// Create a new rotated journal with compacted state. 
+ fn create_rotated_journal( + &self, + rotation_version: u64, + ) -> anyhow::Result { + // Create temporary journal file + let temp_path = self.base_path.with_extension("jrn.tmp"); + + // Create in-memory buffer for new journal + let mut buffer = vec![0u8; self.buffer_size]; + + // Use VersionedKVJournal to create rotated journal in memory + let _rotated = VersionedKVJournal::create_rotated_journal( + &mut buffer, + rotation_version, + &self.cached_map, + self.high_water_mark_ratio, + )?; + + // Write buffer to temporary file + std::fs::write(&temp_path, &buffer)?; + + // Open as memory-mapped journal + MemMappedVersionedKVJournal::from_file(&temp_path, self.buffer_size, self.high_water_mark_ratio) + } +} From 773c679322dcffb256430cdedc1dd5b6fa7c3318 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Tue, 4 Nov 2025 19:27:41 -0800 Subject: [PATCH 02/66] add versioned recovery, zlib compression --- Cargo.lock | 1 + bd-resilient-kv/AGENTS.md | 76 ++- bd-resilient-kv/Cargo.toml | 1 + bd-resilient-kv/README.md | 275 +++++++++- bd-resilient-kv/VERSIONED_FORMAT.md | 142 +++-- .../src/kv_journal/memmapped_versioned.rs | 18 +- bd-resilient-kv/src/kv_journal/mod.rs | 2 +- bd-resilient-kv/src/kv_journal/versioned.rs | 92 +++- bd-resilient-kv/src/lib.rs | 2 + bd-resilient-kv/src/tests/mod.rs | 1 + .../src/tests/versioned_kv_store_test.rs | 314 +++++++++--- .../src/tests/versioned_recovery_test.rs | 485 ++++++++++++++++++ bd-resilient-kv/src/versioned_kv_store.rs | 95 +++- bd-resilient-kv/src/versioned_recovery.rs | 321 ++++++++++++ 14 files changed, 1643 insertions(+), 182 deletions(-) create mode 100644 bd-resilient-kv/src/tests/versioned_recovery_test.rs create mode 100644 bd-resilient-kv/src/versioned_recovery.rs diff --git a/Cargo.lock b/Cargo.lock index ac6857f3..c6278fd9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1108,6 +1108,7 @@ dependencies = [ "bd-client-common", "bd-workspace-hack", "bytes", + "flate2", "memmap2", "tempfile", ] diff --git a/bd-resilient-kv/AGENTS.md b/bd-resilient-kv/AGENTS.md index 8cf65344..09e3fd80 100644 --- a/bd-resilient-kv/AGENTS.md +++ b/bd-resilient-kv/AGENTS.md @@ -4,6 +4,11 @@ This document provides insights and understanding about the `bd-resilient-kv` jo ## Core Architecture +The `bd-resilient-kv` library provides two storage models: + +1. **KVStore**: Standard double-buffered key-value store with automatic compaction +2. **VersionedKVStore**: Version-tracked store with point-in-time recovery and automatic rotation + ### KVJournal Trait The `KVJournal` trait is the foundation of the system, providing: - **Append-only semantics**: Journals accumulate entries over time without removing old data @@ -16,6 +21,9 @@ The `KVJournal` trait is the foundation of the system, providing: 1. **InMemoryKVJournal**: Core implementation backed by byte buffers 2. **MemMappedKVJournal**: File-backed implementation wrapping InMemoryKVJournal 3. **DoubleBufferedKVJournal**: High-level wrapper providing automatic compaction and retry logic +4. **VersionedKVJournal**: Versioned journal with entry-level version tracking +5. **MemMappedVersionedKVJournal**: Memory-mapped wrapper for versioned journals +6. 
**VersionedKVStore**: High-level API for versioned key-value storage with automatic rotation ### Bulk Operations Architecture @@ -31,14 +39,74 @@ The system provides efficient bulk operations through a consistent pattern: - Optimized for batch processing scenarios - Automatic timestamp synchronization for related entries +### Versioned Storage Architecture + +The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJournal`: + +**Key Components**: +- **VersionedKVJournal**: Low-level journal that tracks version numbers for each entry +- **MemMappedVersionedKVJournal**: Memory-mapped persistence layer +- **VersionedKVStore**: High-level HashMap-like API with automatic rotation + +**Version Tracking**: +- Every write operation (`insert`, `remove`) returns a monotonically increasing version number +- Version numbers start at 1 (base version), first write is version 2 +- Entries with `Value::Null` are treated as deletions but still versioned + +**Rotation Strategy**: +- Automatic rotation when journal size exceeds high water mark +- Current state is compacted into a new journal as versioned entries +- Old journal is archived with `.v{version}.zz` suffix +- Archived journals are automatically compressed using zlib (RFC 1950, level 3) +- Optional callback invoked with archived path and version +- Application controls upload/cleanup of archived journals + +**Compression**: +- All archived journals are automatically compressed during rotation +- Active journals remain uncompressed for write performance +- Compression uses zlib format (RFC 1950) with level 3 for balanced speed/ratio +- Typical compression achieves >50% size reduction for text-based data +- File extension `.zz` indicates compressed archives +- Recovery transparently decompresses archived journals when needed + +**Note on Point-in-Time Recovery**: +The `VersionedKVJournal` trait provides `as_hashmap_at_version()` for replaying entries within a single journal. However, `VersionedKVStore` does not expose this functionality because it only works within the current journal - once rotation occurs, historical versions in archived journals cannot be accessed. For true point-in-time recovery across rotations, applications would need to implement their own mechanism to load and replay archived journal files. + ## Critical Design Insights -### 1. Compaction Efficiency +### 1. Two Storage Models + +**KVStore (Double-Buffered)**: +- Best for: General-purpose key-value storage, configuration, caches +- Architecture: Two journals with automatic switching +- Compaction: Compresses entire state into inactive journal +- No version tracking + +**VersionedKVStore (Single Journal with Rotation)**: +- Best for: Audit logs, state history, remote backup +- Architecture: Single journal with archived versions +- Rotation: Creates new journal with compacted state +- Version tracking: Every write returns a version number + +### 2. Compaction Efficiency **Key Insight**: Compaction via `reinit_from()` is already maximally efficient. It writes data in the most compact possible serialized form (hashmap → bytes). If even this compact representation exceeds high water marks, then the data volume itself is the limiting factor, not inefficient storage. **Implication**: Never assume compaction can always solve high water mark issues. Sometimes both buffers are legitimately full. -### 2. Bulk Operations and Retry Logic +### 3. 
Versioned Store Rotation vs KVStore Compaction + +**Key Differences**: +- **KVStore**: Switches between two buffers, old buffer is reset and reused +- **VersionedKVStore**: Archives old journal with `.v{version}` suffix, creates new journal +- **Callback**: Only `VersionedKVStore` supports rotation callbacks for upload/cleanup +- **Version Preservation**: Archived journals preserve complete history for recovery + +**When Rotation Occurs**: +- Triggered during `insert()` or `remove()` when journal size exceeds high water mark +- Can be manually triggered via `rotate()` +- Automatic and transparent to the caller (except for callback) + +### 4. Bulk Operations and Retry Logic The system includes sophisticated retry logic specifically for bulk operations: **`set_multiple` Intelligence**: The `set_multiple` method in `DoubleBufferedKVJournal` implements a two-phase approach: @@ -51,7 +119,7 @@ The system includes sophisticated retry logic specifically for bulk operations: - A retry immediately after might succeed on the now-compacted journal - High water mark flag accurately reflects whether retry is worthwhile -### 3. Simplified High Water Mark Detection +### 5. Simplified High Water Mark Detection The system uses a straightforward approach to high water mark detection: ```rust @@ -66,7 +134,7 @@ if journal.is_high_water_mark_triggered() { - No callback complexity or thread safety concerns - Direct control over when to check status -### 3. Double Buffered Journal Logic +### 6. Double Buffered Journal Logic The `DoubleBufferedKVJournal` implements automatic switching with sophisticated retry logic: 1. **Normal Operations**: Forward to active journal, switch if high water mark triggered diff --git a/bd-resilient-kv/Cargo.toml b/bd-resilient-kv/Cargo.toml index fdb83557..cad895f6 100644 --- a/bd-resilient-kv/Cargo.toml +++ b/bd-resilient-kv/Cargo.toml @@ -18,4 +18,5 @@ bd-bonjson = { path = "../bd-bonjson" } bd-client-common = { path = "../bd-client-common" } bd-workspace-hack.workspace = true bytes.workspace = true +flate2.workspace = true memmap2.workspace = true diff --git a/bd-resilient-kv/README.md b/bd-resilient-kv/README.md index f9bf82f3..caa9ca87 100644 --- a/bd-resilient-kv/README.md +++ b/bd-resilient-kv/README.md @@ -15,6 +15,7 @@ A crash-resilient key-value store library for Rust with automatic persistence, c - **🔄 Self-Managing**: Automatic high water mark detection and buffer switching - **🎯 Simple API**: HashMap-like interface that's easy to use - **🏗️ JSON-like Values**: Built on `bd-bonjson` for flexible value types +- **📊 Version Tracking**: Optional versioned store with point-in-time recovery and automatic journal rotation ## Quick Start @@ -26,6 +27,17 @@ bd-resilient-kv = { path = "path/to/bd-resilient-kv" } bd-bonjson = { path = "path/to/bd-bonjson" } ``` +### Choosing Between KVStore and VersionedKVStore + +**KVStore**: Use for general key-value storage with automatic compaction +- Best for: Configuration storage, caches, general-purpose persistence +- Features: Double-buffered journaling, automatic compaction, high performance + +**VersionedKVStore**: Use when you need version tracking +- Best for: Audit logs, state history, remote backup +- Features: Every write operation returns a version number, automatic rotation with callbacks +- See: [VERSIONED_FORMAT.md](./VERSIONED_FORMAT.md) for detailed format documentation + ### Basic Usage ```rust @@ -174,9 +186,103 @@ fn main() -> anyhow::Result<()> { } ``` +## Versioned Key-Value Store + +For applications that require 
version tracking, audit logs, or point-in-time recovery, use `VersionedKVStore`: + +```rust +use bd_resilient_kv::VersionedKVStore; +use bd_bonjson::Value; + +fn main() -> anyhow::Result<()> { + // Create a versioned store with automatic rotation at 1MB + let mut store = VersionedKVStore::new( + "versioned_store.jrn", + 1024 * 1024, // Rotate when journal reaches 1MB + None // Optional rotation callback + )?; + + // All write operations return version numbers + let v1 = store.insert("config".to_string(), Value::String("v1".to_string()))?; + println!("Inserted at version: {}", v1); + + let v2 = store.insert("config".to_string(), Value::String("v2".to_string()))?; + println!("Updated at version: {}", v2); + + // Read current state (O(1) from cache) + assert_eq!(store.get("config"), Some(&Value::String("v2".to_string()))); + + // Removing a key also returns a version + let v3 = store.remove("config")?; + if let Some(version) = v3 { + println!("Removed at version: {}", version); + } + + Ok(()) +} +``` + +### Versioned Store with Rotation Callback + +Monitor journal rotation events for remote backup or cleanup: + +```rust +use bd_resilient_kv::{VersionedKVStore, RotationCallback}; +use bd_bonjson::Value; +use std::sync::Arc; + +fn upload_to_remote(path: &str, version: u64) { + println!("Uploading archived journal {} at version {}", path, version); + // Upload to S3, backup server, etc. +} + +fn main() -> anyhow::Result<()> { + let callback: RotationCallback = Arc::new(|archived_path, version| { + upload_to_remote(archived_path, version); + }); + + let mut store = VersionedKVStore::new( + "my_store.jrn", + 512 * 1024, // 512KB rotation threshold + Some(callback) + )?; + + // When high water mark is reached during insert/remove, + // the callback will be invoked with archived journal path + for i in 0..10000 { + store.insert(format!("key_{}", i), Value::Integer(i as i64))?; + // Automatic rotation happens when journal exceeds 512KB + } + + // Manual rotation is also supported + store.rotate()?; + + Ok(()) +} +``` + +### Key Features of VersionedKVStore + +- **Version Tracking**: Every `insert()` and `remove()` returns a monotonically increasing version number +- **Timestamp Preservation**: Write timestamps are internally tracked and preserved during journal rotation for recovery purposes +- **Automatic Rotation**: When the journal exceeds the high water mark, it automatically: + - Creates a new journal with the current state as versioned entries (compaction) + - Preserves original timestamps from the initial writes + - Archives the old journal with `.v{version}.zz` suffix + - Compresses the archived journal using zlib (RFC 1950, level 3) + - Invokes the rotation callback (if provided) for upload/cleanup +- **Automatic Compression**: Archived journals are automatically compressed to save disk space + - Active journals remain uncompressed for write performance + - Typically achieves >50% size reduction for text-based data + - Transparent decompression during recovery operations +- **O(1) Reads**: In-memory cache provides constant-time access to current state +- **Persistent**: Uses memory-mapped journals for crash-resilient storage + +See [VERSIONED_FORMAT.md](./VERSIONED_FORMAT.md) for detailed format documentation and recovery scenarios. + ## API Reference -### KVStore +### KVStore (Standard Key-Value Store) The main interface for the key-value store. 
@@ -212,11 +318,64 @@ pub fn remove(&mut self, key: &str) -> anyhow::Result> pub fn clear(&mut self) -> anyhow::Result<()> ``` +### VersionedKVStore (Version-Tracked Key-Value Store) + +A higher-level store that tracks versions for every write operation and supports point-in-time recovery. + +#### Constructor + +```rust +pub fn new>( + journal_path: P, + high_water_mark: usize, + rotation_callback: Option +) -> anyhow::Result +``` + +- `journal_path`: Path to the journal file (e.g., "my_store.jrn") +- `high_water_mark`: Size threshold for automatic rotation (in bytes) +- `rotation_callback`: Optional callback invoked when journal is rotated + - Signature: `Arc` + - Parameters: `(archived_journal_path, version_at_rotation)` + +#### Core Methods + +```rust +// Read operations (O(1) from cache) +pub fn get(&self, key: &str) -> Option<&Value> +pub fn contains_key(&self, key: &str) -> bool +pub fn len(&self) -> usize +pub fn is_empty(&self) -> bool +pub fn as_hashmap(&self) -> HashMap + +// Write operations (return version numbers) +pub fn insert(&mut self, key: String, value: Value) -> anyhow::Result +pub fn remove(&mut self, key: &str) -> anyhow::Result> + +// Manual rotation +pub fn rotate(&mut self) -> anyhow::Result<()> + +// Version information +pub fn current_version(&self) -> u64 +``` + +**Internal Timestamp Tracking**: The store internally tracks timestamps for all writes and preserves them during journal rotation. These timestamps are used for recovery and point-in-time operations but are not exposed in the primary API. For advanced use cases requiring timestamp access, the `get_with_timestamp()` method is available. + +#### Type Aliases + +```rust +pub type RotationCallback = Arc; +``` + ## Architecture -### Double-Buffered Journaling +### Storage Models -The store uses a double-buffered approach with two journal files: +The library provides two storage architectures: + +#### 1. Double-Buffered Journaling (KVStore) + +The standard store uses a double-buffered approach with two journal files: 1. **Active Journal**: Receives new writes 2. **Inactive Journal**: Standby for compression @@ -225,6 +384,34 @@ The store uses a double-buffered approach with two journal files: - Switches the inactive journal to become the new active journal - Resets the old active journal for future use +#### 2. Versioned Single-Journal (VersionedKVStore) + +The versioned store uses a different architecture optimized for version tracking: + +1. **Single Active Journal**: All writes go to one journal file +2. **Version Tracking**: Every entry includes a monotonically increasing version number +3. **Automatic Rotation**: When the journal reaches the high water mark: + - Current state is serialized as versioned entries into a new journal + - Old journal is archived with `.v{version}` suffix (e.g., `store.jrn.v123`) + - Optional callback is invoked for remote upload/cleanup +4. 
**Point-in-Time Recovery**: Journal can be replayed up to any previous version + +**Rotation Strategy**: +``` +Before rotation: + my_store.jrn (1MB, versions 1-1000) + +After rotation: + my_store.jrn (compacted, starts at version 1001) + my_store.jrn.v1000.zz (archived, compressed, readonly) +``` + +**Compression**: +- Archived journals are automatically compressed using zlib (RFC 1950, level 3) +- Active journals remain uncompressed for optimal write performance +- Decompression is handled transparently during recovery +- File extension `.zz` indicates compressed archives + ### Memory-Mapped I/O - Uses `memmap2` for efficient file operations @@ -233,13 +420,21 @@ The store uses a double-buffered approach with two journal files: ### Caching Strategy +Both `KVStore` and `VersionedKVStore` use the same caching approach: + - Maintains an in-memory `HashMap` cache of all key-value pairs - Cache is always kept in sync with the persistent state - Provides O(1) read performance - Write operations update both cache and journal +**VersionedKVStore Additions**: +- Maintains current version counter +- Can reconstruct state at any historical version by replaying journal entries + ## Performance Characteristics +### KVStore (Standard) + | Operation | Time Complexity | Notes | |------------------|-----------------|---------------------------------| | `get()` | O(1) | Reads from in-memory cache | @@ -250,6 +445,19 @@ The store uses a double-buffered approach with two journal files: | `as_hashmap()` | O(1) | Returns reference to cache | | `clear()` | O(1) | Efficient journal clearing | +### VersionedKVStore (With Version Tracking) + +| Operation | Time Complexity | Notes | +|--------------------|-----------------|-------------------------------------| +| `get()` | O(1) | Reads from in-memory cache | +| `insert()` | O(1) amortized | Journal write + cache + version | +| `remove()` | O(1) amortized | Journal write + cache + version | +| `contains_key()` | O(1) | Cache lookup | +| `len()` | O(1) | Cache size | +| `as_hashmap()` | O(n) | Creates temporary map of values | +| `rotate()` | O(n) | Serializes current state to new journal | +| `current_version()`| O(1) | Returns version counter | + ## Error Handling All write operations return `anyhow::Result` for comprehensive error handling, while read operations return values directly from the cache: @@ -272,6 +480,8 @@ fn main() -> anyhow::Result<()> { ## File Management +### KVStore Files + The library automatically manages journal files: - **Creation**: Files are created if they don't exist @@ -286,9 +496,26 @@ my_store.jrna # Journal A my_store.jrnb # Journal B ``` +### VersionedKVStore Files + +The versioned store manages a single journal with archived versions: + +- **Active Journal**: Current journal file (e.g., `my_store.jrn`) +- **Archived Journals**: Previous versions with `.v{version}` suffix +- **Automatic Archival**: Old journals are preserved during rotation +- **Callback Integration**: Application controls upload/cleanup of archived journals + +Example file structure after multiple rotations: +``` +my_store.jrn # Active journal (current, uncompressed) +my_store.jrn.v1000.zz # Archived at version 1000 (compressed) +my_store.jrn.v2500.zz # Archived at version 2500 (compressed) +my_store.jrn.v4000.zz # Archived at version 4000 (compressed) +``` + ## Thread Safety -`KVStore` is **not** thread-safe by design for maximum performance. 
For concurrent access, wrap it in appropriate synchronization primitives: +Both `KVStore` and `VersionedKVStore` are **not** thread-safe by design for maximum performance. For concurrent access, wrap them in appropriate synchronization primitives: ```rust use std::sync::{Arc, Mutex}; @@ -303,6 +530,46 @@ let store = Arc::new(Mutex::new( ## Advanced Usage +### Archived Journal Compression + +**VersionedKVStore** automatically compresses archived journals to save disk space: + +```rust +use bd_resilient_kv::VersionedKVStore; +use bd_bonjson::Value; + +fn main() -> anyhow::Result<()> { + let mut store = VersionedKVStore::new( + "my_store.jrn", + 512 * 1024, // 512KB rotation threshold + None + )?; + + // Write data that will trigger rotation + for i in 0..10000 { + store.insert(format!("key_{}", i), Value::Integer(i as i64))?; + } + + // After rotation, archived journals are automatically compressed: + // - my_store.jrn (active, uncompressed) + // - my_store.jrn.v10000.zz (archived, compressed with zlib) + + Ok(()) +} +``` + +**Compression Details**: +- **Format**: zlib (RFC 1950) with compression level 3 +- **Performance**: Balanced speed/compression ratio +- **Transparency**: Recovery automatically detects and decompresses archived journals +- **Naming**: `.zz` extension indicates compressed archives +- **Typical Savings**: >50% size reduction for text-based data + +**Active vs Archived**: +- Active journals remain **uncompressed** for maximum write performance +- Only archived journals are compressed during rotation +- No configuration needed - compression is automatic + ### Custom Buffer Sizes Choose buffer sizes based on your use case: diff --git a/bd-resilient-kv/VERSIONED_FORMAT.md b/bd-resilient-kv/VERSIONED_FORMAT.md index 5a896786..38a724e2 100644 --- a/bd-resilient-kv/VERSIONED_FORMAT.md +++ b/bd-resilient-kv/VERSIONED_FORMAT.md @@ -2,13 +2,13 @@ ## Overview -This document describes the versioned journal format (VERSION 2) that enables point-in-time recovery by tracking write versions for each operation. +This document describes the versioned journal format (VERSION 2) that enables version tracking for audit logs and remote backup by tracking write versions for each operation. ## Goals 1. **Version Tracking**: Each write operation gets a unique, monotonically increasing version number -2. **Point-in-Time Recovery**: Ability to reconstruct exact state at any version -3. **Journal Rotation**: Periodic compaction with self-contained state in each journal +2. **Journal Rotation**: Periodic compaction with self-contained state in each journal +3. **Remote Backup**: Archived journals can be uploaded to remote storage 4. **Backward Compatible**: New format coexists with existing VERSION 1 ## Design Philosophy @@ -22,11 +22,13 @@ Unlike traditional journal systems that use separate snapshot files, this design ## File Types ### 1. Active Journal (`my_store.jrn`) -The current active journal receiving new writes. +The current active journal receiving new writes. Active journals are **not compressed** for performance reasons. -### 2. Archived Journals (`my_store.jrn.v00020000`, `my_store.jrn.v00030000`, etc.) +### 2. Archived Journals (`my_store.jrn.v00020000.zz`, `my_store.jrn.v00030000.zz`, etc.) Previous journals, archived during rotation. Each contains complete state at rotation version plus subsequent incremental writes. The version number in the filename indicates the rotation/snapshot version. 
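Offline tooling that scans a directory of archives typically needs to recover that rotation version from the filename before deciding which archive to replay. A minimal sketch of such a parser is shown below, assuming the naming convention above; `parse_rotation_version` is a hypothetical helper, not part of the crate:

```rust
/// Hypothetical helper: extract the rotation version from an archived journal
/// filename such as "my_store.jrn.v00030000.zz".
fn parse_rotation_version(file_name: &str) -> Option<u64> {
    // Drop the ".zz" compression suffix if present, then split on the ".v" marker.
    let stem = file_name.strip_suffix(".zz").unwrap_or(file_name);
    let (_, version) = stem.rsplit_once(".v")?;
    version.parse().ok()
}

fn main() {
    assert_eq!(parse_rotation_version("my_store.jrn.v00030000.zz"), Some(30000));
    assert_eq!(parse_rotation_version("my_store.jrn"), None);
}
```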
+**Archived journals are automatically compressed using zlib** (indicated by the `.zz` extension) to reduce storage space and bandwidth requirements for remote backup. Compression is mandatory and occurs automatically during rotation. + ## Format Specification ### Journal Format (VERSION 2) @@ -102,9 +104,11 @@ When high water mark is reached at version N: 1. **Create New Journal**: Initialize fresh journal file (e.g., `my_store.jrn.tmp`) 2. **Write Compacted State**: Write all current key-value pairs as versioned entries at version N -3. **Archive Old Journal**: Rename `my_store.jrn` → `my_store.jrn.v{N}` +3. **Archive Old Journal**: Rename `my_store.jrn` → `my_store.jrn.old` (temporary) 4. **Activate New Journal**: Rename `my_store.jrn.tmp` → `my_store.jrn` -5. **Callback**: Notify application for upload/cleanup of archived journal +5. **Compress Archive**: Compress `my_store.jrn.old` → `my_store.jrn.v{N}.zz` using zlib +6. **Delete Temporary**: Remove uncompressed `my_store.jrn.old` +7. **Callback**: Notify application for upload/cleanup of compressed archived journal Example: ``` @@ -113,53 +117,109 @@ Before rotation at v30000: After rotation: my_store.jrn # Active, base_version=30000, contains compacted state at v30000 - my_store.jrn.v30000 # Archived, contains v20000-v30000 + my_store.jrn.v30000.zz # Compressed archive, contains v20000-v30000 ``` -## Recovery Process +### Compression + +Archived journals are automatically compressed using zlib (compression level 3) during rotation: +- **Format**: Standard zlib format (RFC 1950) +- **Extension**: `.zz` indicates zlib compression +- **Transparency**: `VersionedRecovery` automatically decompresses archives when reading +- **Benefits**: Reduced storage space and bandwidth for remote backups +- **Performance**: Compression level 3 provides good balance between speed and compression ratio + +## Recovery and Audit ### Current State Recovery -Simply read the active journal (`my_store.jrn`) and replay all entries. +Simply read the active journal (`my_store.jrn`) and replay all entries to reconstruct the current state. -### Point-in-Time Recovery +### Audit and Analysis +While `VersionedKVStore` does not support point-in-time recovery through its API, archived journals contain complete historical data that can be used for: -To recover state at target version T: +- **Audit Logging**: Review what changes were made and when +- **Offline Analysis**: Process archived journals to understand historical patterns +- **Remote Backup**: Upload archived journals to remote storage for disaster recovery +- **Compliance**: Maintain immutable records of all changes with version tracking -1. **Find Correct Journal**: - - Check active journal's base_version and current_version range - - If T is in active journal range, use active journal - - Otherwise, find archived journal with appropriate version range +The version numbers in each entry allow you to understand the exact sequence of operations and build custom tooling for analyzing historical data. -2. **Replay Entries**: - - Read all entries from the journal - - Apply entries with version <= T - - Stop when reaching entries with version > T +### Point-in-Time Recovery with VersionedRecovery -3. **Result**: Exact state at version T +While `VersionedKVStore` is designed for active operation and does not support point-in-time recovery through its API, the `VersionedRecovery` utility provides a way to reconstruct state at arbitrary historical versions from raw journal bytes. 
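Because an archived journal is a plain zlib stream, tooling that does not go through `VersionedRecovery` (which handles decompression itself) can inflate a `.zz` archive with `flate2` before inspecting its entries. A minimal sketch; the helper name is illustrative and not part of the crate:

```rust
use std::io::Read;

use flate2::read::ZlibDecoder;

/// Illustrative helper: inflate a `.zz` archive back into raw VERSION 2 journal bytes.
fn inflate_archive(compressed: &[u8]) -> std::io::Result<Vec<u8>> {
    let mut decoder = ZlibDecoder::new(compressed);
    let mut journal_bytes = Vec::new();
    decoder.read_to_end(&mut journal_bytes)?;
    Ok(journal_bytes)
}
```

The inflated bytes are ordinary VERSION 2 journal data and can be inspected directly or handed to custom analysis tooling.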
-### Example Recovery Scenarios +#### Overview -**File Structure:** -``` -my_store.jrn # Active, base_version=30000, current=35000 -my_store.jrn.v30000 # Archived, contains v20000-v30000 -my_store.jrn.v20000 # Archived, contains v10000-v20000 +`VersionedRecovery` is a separate utility that: +- Works with raw journal bytes (`&[u8]`), not file paths +- Does not perform any file I/O operations +- Can process multiple journals for cross-rotation recovery +- Designed for offline analysis, server-side tooling, and audit systems +- Completely independent from `VersionedKVStore` + +#### Use Cases + +- **Server-Side Analysis**: Reconstruct state at specific versions for debugging or investigation +- **Audit Tooling**: Build custom audit systems that analyze historical changes +- **Cross-Rotation Recovery**: Recover state spanning multiple archived journals +- **Compliance**: Extract state at specific points in time for regulatory requirements +- **Testing**: Validate that state at historical versions matches expectations + +#### API Methods + +```rust +// Create recovery utility from journal byte slices (oldest to newest) +let recovery = VersionedRecovery::new(vec![&archived_journal, &active_journal])?; + +// Recover state at specific version +let state = recovery.recover_at_version(25000)?; + +// Get current state (latest version) +let current = recovery.recover_current()?; + +// Get available version range +if let Some((min, max)) = recovery.version_range() { + println!("Can recover versions {min} to {max}"); +} ``` -**Recover at v25000:** -1. Load `my_store.jrn.v30000` (archived journal) -2. Replay entries with version <= 25000 -3. Result: State at v25000 +#### Example: Cross-Rotation Recovery + +```rust +use bd_resilient_kv::VersionedRecovery; +use std::fs; + +// Load archived journals from remote storage or local disk +let journal_v20000 = fs::read("store.jrn.v20000")?; +let journal_v30000 = fs::read("store.jrn.v30000")?; +let journal_active = fs::read("store.jrn")?; + +// Create recovery utility with all journals +let recovery = VersionedRecovery::new(vec![ + &journal_v20000, + &journal_v30000, + &journal_active, +])?; + +// Recover state at version 25000 (in archived journal) +let state_at_25000 = recovery.recover_at_version(25000)?; + +// Recover state at version 35000 (across rotation boundary) +let state_at_35000 = recovery.recover_at_version(35000)?; + +// Process the recovered state +for (key, value) in state_at_25000 { + println!("{key} = {value:?}"); +} +``` -**Recover at v30000:** -1. Load `my_store.jrn.v30000` (archived journal) -2. Replay all entries up to v30000 -3. Result: State at v30000 +#### Implementation Details -**Recover at v32000:** -1. Load `my_store.jrn` (active journal, base_version=30000) -2. Replay entries with version <= 32000 -3. 
Result: State at v32000 +- **No File I/O**: Works purely with byte slices, caller is responsible for loading data +- **Chronological Order**: Journals should be provided oldest to newest +- **Efficient Replay**: Automatically skips journals outside the target version range +- **Cross-Rotation**: Seamlessly handles recovery across multiple archived journals +- **Version Tracking**: Replays all entries up to and including the target version ## Storage Efficiency @@ -193,8 +253,8 @@ let mut store = VersionedKVStore::new("mystore.jrn", 1024 * 1024, None)?; let v1 = store.insert("key1".to_string(), Value::from(42))?; let v2 = store.insert("key2".to_string(), Value::from("hello"))?; -// Point-in-time recovery -let state_at_v1 = store.as_hashmap_at_version(v1)?; +// Read current values +let value = store.get("key1")?; ``` ### Rotation Callback diff --git a/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs b/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs index 5eb55cff..f6ddc09a 100644 --- a/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs +++ b/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs @@ -5,7 +5,7 @@ // LICENSE file or at: // https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt -use super::versioned::VersionedKVJournal; +use super::versioned::{TimestampedValue, VersionedKVJournal}; use ahash::AHashMap; use bd_bonjson::Value; use memmap2::{MmapMut, MmapOptions}; @@ -122,21 +122,21 @@ impl MemMappedVersionedKVJournal { /// Set a key-value pair with automatic version increment. /// - /// Returns the version number assigned to this write. + /// Returns a tuple of (version, timestamp). /// /// # Errors /// Returns an error if the journal entry cannot be written. - pub fn set_versioned(&mut self, key: &str, value: &Value) -> anyhow::Result { + pub fn set_versioned(&mut self, key: &str, value: &Value) -> anyhow::Result<(u64, u64)> { self.versioned_kv.set_versioned(key, value) } /// Delete a key with automatic version increment. /// - /// Returns the version number assigned to this deletion. + /// Returns a tuple of (version, timestamp). /// /// # Errors /// Returns an error if the journal entry cannot be written. - pub fn delete_versioned(&mut self, key: &str) -> anyhow::Result { + pub fn delete_versioned(&mut self, key: &str) -> anyhow::Result<(u64, u64)> { self.versioned_kv.delete_versioned(key) } @@ -184,6 +184,14 @@ impl MemMappedVersionedKVJournal { self.versioned_kv.as_hashmap() } + /// Reconstruct the hashmap with timestamps by replaying all journal entries. + /// + /// # Errors + /// Returns an error if the buffer cannot be decoded. + pub fn as_hashmap_with_timestamps(&self) -> anyhow::Result> { + self.versioned_kv.as_hashmap_with_timestamps() + } + /// Reconstruct the hashmap at a specific version by replaying entries up to that version. 
/// /// # Errors diff --git a/bd-resilient-kv/src/kv_journal/mod.rs b/bd-resilient-kv/src/kv_journal/mod.rs index 866f5c27..b9799927 100644 --- a/bd-resilient-kv/src/kv_journal/mod.rs +++ b/bd-resilient-kv/src/kv_journal/mod.rs @@ -101,4 +101,4 @@ pub use double_buffered::DoubleBufferedKVJournal; pub use in_memory::InMemoryKVJournal; pub use memmapped::MemMappedKVJournal; pub use memmapped_versioned::MemMappedVersionedKVJournal; -pub use versioned::VersionedKVJournal; +pub use versioned::{TimestampedValue, VersionedKVJournal}; diff --git a/bd-resilient-kv/src/kv_journal/versioned.rs b/bd-resilient-kv/src/kv_journal/versioned.rs index 6277af17..89a7a4fd 100644 --- a/bd-resilient-kv/src/kv_journal/versioned.rs +++ b/bd-resilient-kv/src/kv_journal/versioned.rs @@ -15,6 +15,15 @@ use bytes::BufMut; use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{SystemTime, UNIX_EPOCH}; +/// Represents a value with its associated timestamp. +#[derive(Debug, Clone, PartialEq)] +pub struct TimestampedValue { + /// The value stored in the key-value store. + pub value: Value, + /// The timestamp (in nanoseconds since UNIX epoch) when this value was last written. + pub timestamp: u64, +} + /// Versioned implementation of a key-value journaling system that tracks write versions /// for point-in-time recovery. /// @@ -329,13 +338,13 @@ impl<'a> VersionedKVJournal<'a> { self.high_water_mark_triggered = true; } - /// Write a versioned journal entry. + /// Write a versioned journal entry and return the timestamp. fn write_versioned_entry( &mut self, version: u64, key: &str, value: &Value, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { let buffer_len = self.buffer.len(); let mut cursor = &mut self.buffer[self.position ..]; @@ -352,21 +361,23 @@ impl<'a> VersionedKVJournal<'a> { let remaining = cursor.remaining_mut(); self.set_position(buffer_len - remaining); - Ok(()) + Ok(timestamp) } /// Set a key-value pair with automatic version increment. - pub fn set_versioned(&mut self, key: &str, value: &Value) -> anyhow::Result { + /// Returns a tuple of (version, timestamp). + pub fn set_versioned(&mut self, key: &str, value: &Value) -> anyhow::Result<(u64, u64)> { let version = self.current_version.fetch_add(1, Ordering::SeqCst) + 1; - self.write_versioned_entry(version, key, value)?; - Ok(version) + let timestamp = self.write_versioned_entry(version, key, value)?; + Ok((version, timestamp)) } /// Delete a key with automatic version increment. - pub fn delete_versioned(&mut self, key: &str) -> anyhow::Result { + /// Returns a tuple of (version, timestamp). + pub fn delete_versioned(&mut self, key: &str) -> anyhow::Result<(u64, u64)> { let version = self.current_version.fetch_add(1, Ordering::SeqCst) + 1; - self.write_versioned_entry(version, key, &Value::Null)?; - Ok(version) + let timestamp = self.write_versioned_entry(version, key, &Value::Null)?; + Ok((version, timestamp)) } /// Get the high water mark position. @@ -427,6 +438,52 @@ impl<'a> VersionedKVJournal<'a> { Ok(map) } + /// Reconstruct the hashmap with timestamps by replaying all journal entries. 
+ pub fn as_hashmap_with_timestamps(&self) -> anyhow::Result> { + let array = read_bonjson_payload(self.buffer)?; + let mut map = AHashMap::new(); + + if let Value::Array(entries) = array { + for (index, entry) in entries.iter().enumerate() { + // Skip metadata (first entry) + if index == 0 { + continue; + } + + if let Value::Object(obj) = entry { + // Extract key, operation, and timestamp from versioned entry + if let Some(Value::String(key)) = obj.get("k") + && let Some(operation) = obj.get("o") + { + // Extract timestamp (default to 0 if not found) + let timestamp = if let Some(Value::Unsigned(t)) = obj.get("t") { + *t + } else if let Some(Value::Signed(t)) = obj.get("t") { + #[allow(clippy::cast_sign_loss)] + (*t as u64) + } else { + 0 + }; + + if operation.is_null() { + map.remove(key); + } else { + map.insert( + key.clone(), + TimestampedValue { + value: operation.clone(), + timestamp, + }, + ); + } + } + } + } + } + + Ok(map) + } + /// Reconstruct the hashmap at a specific version by replaying entries up to that version. pub fn as_hashmap_at_version( &self, @@ -488,12 +545,12 @@ impl<'a> VersionedKVJournal<'a> { /// Create a new journal initialized with the compacted state from a snapshot version. /// /// The new journal will have all current key-value pairs written as versioned entries - /// at the `snapshot_version`, followed by the ability to continue with incremental writes. + /// at the `snapshot_version`, using their original timestamps to preserve historical accuracy. /// /// # Arguments /// * `buffer` - The buffer to write the new journal to /// * `snapshot_version` - The version to assign to all compacted state entries - /// * `state` - The current key-value state to write + /// * `state` - The current key-value state with timestamps to write /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark /// /// # Errors @@ -501,24 +558,27 @@ impl<'a> VersionedKVJournal<'a> { pub fn create_rotated_journal( buffer: &'a mut [u8], snapshot_version: u64, - state: &AHashMap, + state: &AHashMap, high_water_mark_ratio: Option, ) -> anyhow::Result { // Create a new journal with the snapshot version as the base let mut journal = Self::new(buffer, snapshot_version, high_water_mark_ratio)?; // Write all current state as versioned entries at the snapshot version - let timestamp = current_timestamp()?; - for (key, value) in state { + // Use the original timestamp from each entry to preserve historical accuracy + for (key, timestamped_value) in state { let buffer_len = journal.buffer.len(); let mut cursor = &mut journal.buffer[journal.position ..]; // Create entry object: {"v": version, "t": timestamp, "k": key, "o": value} let mut entry = AHashMap::new(); entry.insert("v".to_string(), Value::Unsigned(snapshot_version)); - entry.insert("t".to_string(), Value::Unsigned(timestamp)); + entry.insert( + "t".to_string(), + Value::Unsigned(timestamped_value.timestamp), + ); entry.insert("k".to_string(), Value::String(key.clone())); - entry.insert("o".to_string(), value.clone()); + entry.insert("o".to_string(), timestamped_value.value.clone()); encode_into_buf(&mut cursor, &Value::Object(entry)) .map_err(|e| anyhow::anyhow!("Failed to encode state entry: {e:?}"))?; diff --git a/bd-resilient-kv/src/lib.rs b/bd-resilient-kv/src/lib.rs index e9e399c3..2b4c1732 100644 --- a/bd-resilient-kv/src/lib.rs +++ b/bd-resilient-kv/src/lib.rs @@ -20,6 +20,7 @@ mod tests; pub mod kv_journal; pub mod kv_store; pub mod versioned_kv_store; +pub mod versioned_recovery; pub use kv_journal::{ 
DoubleBufferedKVJournal, @@ -31,3 +32,4 @@ pub use kv_journal::{ }; pub use kv_store::KVStore; pub use versioned_kv_store::{RotationCallback, VersionedKVStore}; +pub use versioned_recovery::VersionedRecovery; diff --git a/bd-resilient-kv/src/tests/mod.rs b/bd-resilient-kv/src/tests/mod.rs index 6e2af215..7d838890 100644 --- a/bd-resilient-kv/src/tests/mod.rs +++ b/bd-resilient-kv/src/tests/mod.rs @@ -34,3 +34,4 @@ pub mod kv_store_test; pub mod kv_test; pub mod memmapped_test; pub mod versioned_kv_store_test; +pub mod versioned_recovery_test; diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index 5ae93d70..ad6e0950 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -92,83 +92,17 @@ fn test_versioned_store_remove() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_point_in_time_recovery() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; - - // Create a sequence of writes - let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; - let v2 = store.insert("key2".to_string(), Value::String("value2".to_string()))?; - let v3 = store.insert("key1".to_string(), Value::String("updated1".to_string()))?; - let v4 = store.remove("key2")?; - - // Current state should have key1=updated1, key2 deleted - assert_eq!( - store.get("key1"), - Some(&Value::String("updated1".to_string())) - ); - assert_eq!(store.get("key2"), None); - assert_eq!(store.len(), 1); - - // Recover at v1: should have key1=value1 - let state_v1 = store.as_hashmap_at_version(v1)?; - assert_eq!(state_v1.len(), 1); - assert_eq!( - state_v1.get("key1"), - Some(&Value::String("value1".to_string())) - ); - - // Recover at v2: should have key1=value1, key2=value2 - let state_v2 = store.as_hashmap_at_version(v2)?; - assert_eq!(state_v2.len(), 2); - assert_eq!( - state_v2.get("key1"), - Some(&Value::String("value1".to_string())) - ); - assert_eq!( - state_v2.get("key2"), - Some(&Value::String("value2".to_string())) - ); - - // Recover at v3: should have key1=updated1, key2=value2 - let state_v3 = store.as_hashmap_at_version(v3)?; - assert_eq!(state_v3.len(), 2); - assert_eq!( - state_v3.get("key1"), - Some(&Value::String("updated1".to_string())) - ); - assert_eq!( - state_v3.get("key2"), - Some(&Value::String("value2".to_string())) - ); - - // Recover at v4: should have key1=updated1, key2 deleted - let state_v4 = store.as_hashmap_at_version(v4.unwrap())?; - assert_eq!(state_v4.len(), 1); - assert_eq!( - state_v4.get("key1"), - Some(&Value::String("updated1".to_string())) - ); - assert!(!state_v4.contains_key("key2")); - - Ok(()) -} - #[test] fn test_persistence_and_reload() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); - let v1; let v2; // Create store and write some data { let mut store = VersionedKVStore::new(&file_path, 4096, None)?; - v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + let _v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; v2 = store.insert("key2".to_string(), Value::Signed(42))?; store.sync()?; } @@ -185,14 +119,6 @@ fn test_persistence_and_reload() -> anyhow::Result<()> { // Version numbers should be preserved assert_eq!(store.current_version(), v2); - - // Point-in-time recovery should still work - let state_v1 
= store.as_hashmap_at_version(v1)?; - assert_eq!(state_v1.len(), 1); - assert_eq!( - state_v1.get("key1"), - Some(&Value::String("value1".to_string())) - ); } Ok(()) @@ -275,10 +201,10 @@ fn test_manual_rotation() -> anyhow::Result<()> { let rotation_version = store.current_version(); store.rotate_journal()?; - // Verify archived file exists + // Verify archived file exists (compressed) let archived_path = temp_dir .path() - .join(format!("test.jrn.v{}", rotation_version)); + .join(format!("test.jrn.v{}.zz", rotation_version)); assert!(archived_path.exists()); // Verify active journal still works @@ -319,7 +245,7 @@ fn test_rotation_preserves_state() -> anyhow::Result<()> { store.insert("key3".to_string(), Value::Bool(true))?; store.insert("key4".to_string(), Value::Float(3.14159))?; - let pre_rotation_state = store.as_hashmap().clone(); + let pre_rotation_state = store.as_hashmap(); let pre_rotation_version = store.current_version(); // Rotate @@ -327,7 +253,7 @@ fn test_rotation_preserves_state() -> anyhow::Result<()> { // Verify state is preserved exactly let post_rotation_state = store.as_hashmap(); - assert_eq!(&pre_rotation_state, post_rotation_state); + assert_eq!(pre_rotation_state, post_rotation_state); assert_eq!(store.len(), 4); // Verify we can continue writing @@ -352,10 +278,6 @@ fn test_empty_store_operations() -> anyhow::Result<()> { assert!(store.is_empty()); assert_eq!(store.len(), 0); - // Point-in-time recovery of empty state - let state = store.as_hashmap_at_version(1)?; - assert!(state.is_empty()); - Ok(()) } @@ -394,3 +316,229 @@ fn test_version_monotonicity() -> anyhow::Result<()> { Ok(()) } + +#[test] +fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + // Create store with small buffer to trigger rotation easily + let mut store = VersionedKVStore::new(&file_path, 2048, Some(0.5))?; + + // Insert some keys and capture their timestamps + store.insert("key1".to_string(), Value::String("value1".to_string()))?; + #[allow(clippy::unwrap_used)] + let ts1 = store + .get_with_timestamp("key1") + .map(|tv| tv.timestamp) + .unwrap(); + + // Small sleep to ensure different timestamps + std::thread::sleep(std::time::Duration::from_millis(10)); + + store.insert("key2".to_string(), Value::String("value2".to_string()))?; + #[allow(clippy::unwrap_used)] + let ts2 = store + .get_with_timestamp("key2") + .map(|tv| tv.timestamp) + .unwrap(); + + // Verify timestamps are different + assert_ne!(ts1, ts2, "Timestamps should be different"); + assert!(ts2 > ts1, "Later writes should have later timestamps"); + + // Write enough data to trigger rotation + for i in 0 .. 
50 { + store.insert(format!("fill{i}"), Value::Signed(i))?; + } + + // Verify that after rotation, the original timestamps are preserved + #[allow(clippy::unwrap_used)] + let ts1_after = store + .get_with_timestamp("key1") + .map(|tv| tv.timestamp) + .unwrap(); + + #[allow(clippy::unwrap_used)] + let ts2_after = store + .get_with_timestamp("key2") + .map(|tv| tv.timestamp) + .unwrap(); + + assert_eq!( + ts1, ts1_after, + "key1 timestamp should be preserved during rotation" + ); + assert_eq!( + ts2, ts2_after, + "key2 timestamp should be preserved during rotation" + ); + + // Verify ordering is still correct + assert!(ts2_after > ts1_after, "Timestamp ordering should be preserved"); + + Ok(()) +} + +#[test] +fn test_compression_during_rotation() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + // Insert some data + let data = "x".repeat(1000); // Large value to make compression effective + store.insert("key1".to_string(), Value::String(data.clone()))?; + store.insert("key2".to_string(), Value::String(data.clone()))?; + store.insert("key3".to_string(), Value::String(data))?; + + // Get size of uncompressed journal before rotation + let uncompressed_size = std::fs::metadata(&file_path)?.len(); + + // Get current version before rotation (this is what will be used in the archive name) + let rotation_version = store.current_version(); + + // Trigger rotation + store.rotate_journal()?; + + // Verify compressed archive exists + let archived_path = temp_dir + .path() + .join(format!("test.jrn.v{}.zz", rotation_version)); + assert!( + archived_path.exists(), + "Compressed archive should exist at {:?}", + archived_path + ); + + // Verify compressed size is smaller than original + let compressed_size = std::fs::metadata(&archived_path)?.len(); + assert!( + compressed_size < uncompressed_size, + "Compressed size ({}) should be smaller than uncompressed ({})", + compressed_size, + uncompressed_size + ); + + // Verify uncompressed temporary file was deleted + let temp_archive_path = temp_dir.path().join("test.jrn.old"); + assert!( + !temp_archive_path.exists(), + "Temporary uncompressed archive should be deleted" + ); + + // Verify active journal still works + store.insert("key4".to_string(), Value::String("value4".to_string()))?; + assert_eq!(store.len(), 4); + + Ok(()) +} + +#[test] +fn test_compression_ratio() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 8192, None)?; + + // Insert highly compressible data + let compressible_data = "A".repeat(500); + for i in 0 .. 
10 { + store.insert(format!("key{}", i), Value::String(compressible_data.clone()))?; + } + + let uncompressed_size = std::fs::metadata(&file_path)?.len(); + let rotation_version = store.current_version(); + + store.rotate_journal()?; + + let archived_path = temp_dir + .path() + .join(format!("test.jrn.v{}.zz", rotation_version)); + let compressed_size = std::fs::metadata(&archived_path)?.len(); + + // With highly compressible data, we should get significant compression + // Expecting at least 50% compression ratio for repeated characters + #[allow(clippy::cast_precision_loss)] + let compression_ratio = (compressed_size as f64) / (uncompressed_size as f64); + assert!( + compression_ratio < 0.5, + "Compression ratio should be better than 50% for repeated data, got {:.2}%", + compression_ratio * 100.0 + ); + + Ok(()) +} + +#[test] +fn test_multiple_rotations_with_compression() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let mut rotation_versions = Vec::new(); + + // Perform multiple rotations + for i in 0 .. 3 { + let key = format!("key{}", i); + let value = Value::String(format!("value{}", i)); + let version = store.insert(key, value)?; + rotation_versions.push(version); + store.rotate_journal()?; + } + + // Verify all compressed archives exist + for version in rotation_versions { + let archived_path = temp_dir + .path() + .join(format!("test.jrn.v{}.zz", version)); + assert!( + archived_path.exists(), + "Compressed archive for version {} should exist", + version + ); + } + + Ok(()) +} + +#[test] +fn test_rotation_callback_receives_compressed_path() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let callback_data = Arc::new(Mutex::new(None)); + let callback_data_clone = Arc::clone(&callback_data); + + store.set_rotation_callback(Box::new(move |old_path, _new_path, _version| { + let mut data = callback_data_clone.lock().unwrap(); + *data = Some(old_path.to_path_buf()); + })); + + store.insert("key1".to_string(), Value::String("value1".to_string()))?; + store.rotate_journal()?; + + // Verify callback received compressed path + let data = callback_data.lock().unwrap(); + #[allow(clippy::unwrap_used)] + let archived_path = data.as_ref().unwrap(); + + assert!( + archived_path.to_string_lossy().ends_with(".zz"), + "Callback should receive compressed archive path ending with .zz, got: {:?}", + archived_path + ); + + // Verify the file actually exists + assert!( + archived_path.exists(), + "Compressed archive passed to callback should exist" + ); + + Ok(()) +} + diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs new file mode 100644 index 00000000..e2c48866 --- /dev/null +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -0,0 +1,485 @@ +// shared-core - bitdrift's common client/server libraries +// Copyright Bitdrift, Inc. All rights reserved. 
+// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + +use crate::VersionedKVStore; +use crate::versioned_recovery::VersionedRecovery; +use bd_bonjson::Value; +use tempfile::TempDir; + +#[test] +fn test_recovery_single_journal() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + // Create a store and write some versioned data + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + let v2 = store.insert("key2".to_string(), Value::String("value2".to_string()))?; + let v3 = store.insert("key1".to_string(), Value::String("updated1".to_string()))?; + store.sync()?; + + // Read the journal data + let journal_data = std::fs::read(&file_path)?; + + // Create recovery utility + let recovery = VersionedRecovery::new(vec![&journal_data])?; + + // Verify version range + let version_range = recovery.version_range(); + assert!(version_range.is_some()); + #[allow(clippy::unwrap_used)] + let (min, max) = version_range.unwrap(); + assert_eq!(min, 1); + assert_eq!(max, v3); + + // Recover at v1: should have only key1=value1 + let state_v1 = recovery.recover_at_version(v1)?; + assert_eq!(state_v1.len(), 1); + assert_eq!( + state_v1.get("key1").map(|tv| &tv.value), + Some(&Value::String("value1".to_string())) + ); + + // Recover at v2: should have key1=value1, key2=value2 + let state_v2 = recovery.recover_at_version(v2)?; + assert_eq!(state_v2.len(), 2); + assert_eq!( + state_v2.get("key1").map(|tv| &tv.value), + Some(&Value::String("value1".to_string())) + ); + assert_eq!( + state_v2.get("key2").map(|tv| &tv.value), + Some(&Value::String("value2".to_string())) + ); + + // Recover at v3: should have key1=updated1, key2=value2 + let state_v3 = recovery.recover_at_version(v3)?; + assert_eq!(state_v3.len(), 2); + assert_eq!( + state_v3.get("key1").map(|tv| &tv.value), + Some(&Value::String("updated1".to_string())) + ); + assert_eq!( + state_v3.get("key2").map(|tv| &tv.value), + Some(&Value::String("value2".to_string())) + ); + + // Recover current should match v3 + let current = recovery.recover_current()?; + assert_eq!(current, state_v3); + + Ok(()) +} + +#[test] +fn test_recovery_with_deletions() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + // Create a store with deletions + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + let v2 = store.insert("key2".to_string(), Value::String("value2".to_string()))?; + let v3_opt = store.remove("key1")?; + assert!(v3_opt.is_some()); + #[allow(clippy::unwrap_used)] + let v3 = v3_opt.unwrap(); + store.sync()?; + + // Read the journal data + let journal_data = std::fs::read(&file_path)?; + let recovery = VersionedRecovery::new(vec![&journal_data])?; + + // At v1: key1 exists + let state_v1 = recovery.recover_at_version(v1)?; + assert_eq!(state_v1.len(), 1); + assert!(state_v1.contains_key("key1")); + + // At v2: both keys exist + let state_v2 = recovery.recover_at_version(v2)?; + assert_eq!(state_v2.len(), 2); + + // At v3: only key2 exists (key1 deleted) + let state_v3 = recovery.recover_at_version(v3)?; + assert_eq!(state_v3.len(), 1); + assert!(!state_v3.contains_key("key1")); + 
assert!(state_v3.contains_key("key2")); + + Ok(()) +} + +#[test] +fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + // Create a store with larger buffer to avoid BufferFull errors during test + let mut store = VersionedKVStore::new(&file_path, 2048, None)?; + + // Write data that will trigger rotation + let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + store.insert("key2".to_string(), Value::String("value2".to_string()))?; + + // Write more data to trigger rotation + for i in 0 .. 20 { + store.insert(format!("key{i}"), Value::Signed(i))?; + } + + let v_middle = store.current_version(); + + // Write more after rotation + let v_final = store.insert( + "final".to_string(), + Value::String("final_value".to_string()), + )?; + store.sync()?; + + // Read all journal files + let mut all_journals = Vec::new(); + + // Read archived journals + let archived_files = std::fs::read_dir(temp_dir.path())? + .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.file_name()?.to_str()?.starts_with("test.jrn.v") { + Some(path) + } else { + None + } + }) + .collect::>(); + + for archived_path in archived_files { + all_journals.push(std::fs::read(archived_path)?); + } + + // Read active journal + all_journals.push(std::fs::read(&file_path)?); + + // Create recovery utility with all journals + let journal_refs: Vec<&[u8]> = all_journals.iter().map(std::vec::Vec::as_slice).collect(); + let recovery = VersionedRecovery::new(journal_refs)?; + + // Verify we can recover at early version + let state_v1 = recovery.recover_at_version(v1)?; + assert_eq!(state_v1.len(), 1); + assert!(state_v1.contains_key("key1")); + + // Verify we can recover at middle version (after rotation) + let state_middle = recovery.recover_at_version(v_middle)?; + assert!(state_middle.len() > 2); + assert!(state_middle.contains_key("key1")); + assert!(state_middle.contains_key("key2")); + + // Verify we can recover at final version + let state_final = recovery.recover_at_version(v_final)?; + assert!(state_final.contains_key("final")); + assert_eq!( + state_final.get("final").map(|tv| &tv.value), + Some(&Value::String("final_value".to_string())) + ); + + Ok(()) +} + +#[test] +fn test_recovery_empty_journal() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + // Create an empty store + let store = VersionedKVStore::new(&file_path, 4096, None)?; + store.sync()?; + + let journal_data = std::fs::read(&file_path)?; + let recovery = VersionedRecovery::new(vec![&journal_data])?; + + // Should have version range starting at 1 + let version_range = recovery.version_range(); + assert!(version_range.is_some()); + #[allow(clippy::unwrap_used)] + let (min, _max) = version_range.unwrap(); + assert_eq!(min, 1); + + // Recovering at any version should return empty map + let state = recovery.recover_at_version(1)?; + assert_eq!(state.len(), 0); + + Ok(()) +} + +#[test] +fn test_recovery_version_range() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + store.insert("key1".to_string(), Value::String("value1".to_string()))?; + store.insert("key2".to_string(), Value::String("value2".to_string()))?; + let v3 = store.insert("key3".to_string(), Value::String("value3".to_string()))?; + store.sync()?; + + 
let journal_data = std::fs::read(&file_path)?; + let recovery = VersionedRecovery::new(vec![&journal_data])?; + + let version_range = recovery.version_range(); + assert!(version_range.is_some()); + #[allow(clippy::unwrap_used)] + let (min, max) = version_range.unwrap(); + assert_eq!(min, 1); // base_version defaults to 1 for new stores + assert_eq!(max, v3); + + Ok(()) +} + +#[test] +fn test_recovery_with_overwrites() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + let v1 = store.insert("key".to_string(), Value::Signed(1))?; + let v2 = store.insert("key".to_string(), Value::Signed(2))?; + let v3 = store.insert("key".to_string(), Value::Signed(3))?; + store.sync()?; + + let journal_data = std::fs::read(&file_path)?; + let recovery = VersionedRecovery::new(vec![&journal_data])?; + + // Each version should show the value at that time + let state_v1 = recovery.recover_at_version(v1)?; + assert_eq!(state_v1.get("key").map(|tv| &tv.value), Some(&Value::Signed(1))); + + let state_v2 = recovery.recover_at_version(v2)?; + assert_eq!(state_v2.get("key").map(|tv| &tv.value), Some(&Value::Signed(2))); + + let state_v3 = recovery.recover_at_version(v3)?; + assert_eq!(state_v3.get("key").map(|tv| &tv.value), Some(&Value::Signed(3))); + + Ok(()) +} + +#[test] +fn test_recovery_various_value_types() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + store.insert("string".to_string(), Value::String("hello".to_string()))?; + store.insert("number".to_string(), Value::Signed(42))?; + store.insert("float".to_string(), Value::Float(3.14))?; + store.insert("bool".to_string(), Value::Bool(true))?; + let v_final = store.current_version(); + store.sync()?; + + let journal_data = std::fs::read(&file_path)?; + let recovery = VersionedRecovery::new(vec![&journal_data])?; + + let state = recovery.recover_at_version(v_final)?; + assert_eq!(state.len(), 4); + assert_eq!( + state.get("string").map(|tv| &tv.value), + Some(&Value::String("hello".to_string())) + ); + assert_eq!(state.get("number").map(|tv| &tv.value), Some(&Value::Signed(42))); + assert_eq!(state.get("float").map(|tv| &tv.value), Some(&Value::Float(3.14))); + assert_eq!(state.get("bool").map(|tv| &tv.value), Some(&Value::Bool(true))); + + Ok(()) +} + +#[test] +fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + // Create a store and write some data + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + let v2 = store.insert("key2".to_string(), Value::String("value2".to_string()))?; + store.sync()?; + + // Get the current version before rotation (this will be used in the archive name) + let archive_version = store.current_version(); + + // Rotate to create compressed archive + store.rotate_journal()?; + + // Add more data to active journal + let v3 = store.insert("key3".to_string(), Value::String("value3".to_string()))?; + store.sync()?; + + // Find the compressed archive (using the version at the time of rotation) + let archived_path = temp_dir.path().join(format!("test.jrn.v{}.zz", archive_version)); + assert!(archived_path.exists(), "Compressed archive should exist"); + + // Read both journals + let 
compressed_data = std::fs::read(&archived_path)?; + let active_data = std::fs::read(&file_path)?; + + // Create recovery from both journals (compressed first, then active) + let recovery = VersionedRecovery::new(vec![&compressed_data, &active_data])?; + + // Verify version range spans both journals + let version_range = recovery.version_range(); + assert!(version_range.is_some()); + #[allow(clippy::unwrap_used)] + let (min, max) = version_range.unwrap(); + assert_eq!(min, 1); + assert_eq!(max, v3); + + // Recover at v1 (should be in compressed archive) + let state_v1 = recovery.recover_at_version(v1)?; + assert_eq!(state_v1.len(), 1); + assert_eq!( + state_v1.get("key1").map(|tv| &tv.value), + Some(&Value::String("value1".to_string())) + ); + + // Recover at v2 (should be in compressed archive) + let state_v2 = recovery.recover_at_version(v2)?; + assert_eq!(state_v2.len(), 2); + + // Recover at v3 (should include data from both archives and active journal) + let state_v3 = recovery.recover_at_version(v3)?; + assert_eq!(state_v3.len(), 3); + assert_eq!( + state_v3.get("key3").map(|tv| &tv.value), + Some(&Value::String("value3".to_string())) + ); + + Ok(()) +} + +#[test] +fn test_recovery_from_multiple_compressed_archives() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + // Create a store and perform multiple rotations + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + let archive1_version = store.current_version(); + store.rotate_journal()?; + + let v2 = store.insert("key2".to_string(), Value::String("value2".to_string()))?; + let archive2_version = store.current_version(); + store.rotate_journal()?; + + let v3 = store.insert("key3".to_string(), Value::String("value3".to_string()))?; + store.sync()?; + + // Collect all journal data (2 compressed + 1 active) + let archive1_path = temp_dir.path().join(format!("test.jrn.v{}.zz", archive1_version)); + let archive2_path = temp_dir.path().join(format!("test.jrn.v{}.zz", archive2_version)); + + let archive1_data = std::fs::read(&archive1_path)?; + let archive2_data = std::fs::read(&archive2_path)?; + let active_data = std::fs::read(&file_path)?; + + // Create recovery from all journals + let recovery = VersionedRecovery::new(vec![&archive1_data, &archive2_data, &active_data])?; + + // Verify we can recover at any version + let state_v1 = recovery.recover_at_version(v1)?; + assert_eq!(state_v1.len(), 1); + assert!(state_v1.contains_key("key1")); + + let state_v2 = recovery.recover_at_version(v2)?; + assert_eq!(state_v2.len(), 2); + assert!(state_v2.contains_key("key1")); + assert!(state_v2.contains_key("key2")); + + let state_v3 = recovery.recover_at_version(v3)?; + assert_eq!(state_v3.len(), 3); + assert!(state_v3.contains_key("key1")); + assert!(state_v3.contains_key("key2")); + assert!(state_v3.contains_key("key3")); + + Ok(()) +} + +#[test] +fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + // Create initial store and archive (will be compressed) + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + let _v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + store.sync()?; + let archive_version = store.current_version(); + store.rotate_journal()?; + + // Get compressed archive + let compressed_archive_path = 
temp_dir.path().join(format!("test.jrn.v{}.zz", archive_version)); + let compressed_data = std::fs::read(&compressed_archive_path)?; + + // Create uncompressed journal data manually + let mut uncompressed_store = VersionedKVStore::new(&file_path, 4096, None)?; + let v2 = uncompressed_store.insert("key2".to_string(), Value::String("value2".to_string()))?; + uncompressed_store.sync()?; + let uncompressed_data = std::fs::read(&file_path)?; + + // Recovery should handle both compressed and uncompressed + let recovery = VersionedRecovery::new(vec![&compressed_data, &uncompressed_data])?; + + let state_final = recovery.recover_at_version(v2)?; + assert_eq!(state_final.len(), 2); + assert!(state_final.contains_key("key1")); + assert!(state_final.contains_key("key2")); + + Ok(()) +} + +#[test] +fn test_recovery_decompression_transparent() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("test.jrn"); + + // Create store with compressible data + let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + let compressible = "A".repeat(500); + let v1 = store.insert("data".to_string(), Value::String(compressible.clone()))?; + store.sync()?; + + // Create uncompressed recovery baseline + let uncompressed_data = std::fs::read(&file_path)?; + let recovery_uncompressed = VersionedRecovery::new(vec![&uncompressed_data])?; + let state_uncompressed = recovery_uncompressed.recover_at_version(v1)?; + + // Get archive version before rotation + let archive_version = store.current_version(); + + // Rotate to compress + store.rotate_journal()?; + + // Read compressed archive + let compressed_path = temp_dir.path().join(format!("test.jrn.v{}.zz", archive_version)); + let compressed_data = std::fs::read(&compressed_path)?; + + // Verify it's actually compressed (smaller) + assert!(compressed_data.len() < uncompressed_data.len()); + + // Create recovery from compressed data + let recovery_compressed = VersionedRecovery::new(vec![&compressed_data])?; + let state_compressed = recovery_compressed.recover_at_version(v1)?; + + // Both should produce identical results + assert_eq!(state_uncompressed.len(), state_compressed.len()); + assert_eq!( + state_uncompressed.get("data").map(|tv| &tv.value), + state_compressed.get("data").map(|tv| &tv.value) + ); + assert_eq!( + state_uncompressed.get("data").map(|tv| &tv.value), + Some(&Value::String(compressible)) + ); + + Ok(()) +} diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index bc0cd761..3163d556 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -5,9 +5,12 @@ // LICENSE file or at: // https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt -use crate::kv_journal::{MemMappedVersionedKVJournal, VersionedKVJournal}; +use crate::kv_journal::{MemMappedVersionedKVJournal, TimestampedValue, VersionedKVJournal}; use ahash::AHashMap; use bd_bonjson::Value; +use flate2::write::ZlibEncoder; +use flate2::Compression; +use std::io::Write; use std::path::{Path, PathBuf}; /// Callback invoked when journal rotation occurs. @@ -21,11 +24,24 @@ use std::path::{Path, PathBuf}; /// storage, perform cleanup, or other post-rotation operations. pub type RotationCallback = Box; -/// A persistent key-value store with version tracking for point-in-time recovery. +/// Compress an archived journal using zlib. 
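+///
+/// Reads the source file into memory, compresses it with zlib at level 3 (favoring speed over
+/// maximum compression), and writes the compressed bytes to `dest`.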
+fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result<()> { + let journal_bytes = std::fs::read(source)?; + + let mut encoder = ZlibEncoder::new(Vec::new(), Compression::new(3)); + encoder.write_all(&journal_bytes)?; + let compressed = encoder.finish()?; + + std::fs::write(dest, compressed)?; + + Ok(()) +} + +/// A persistent key-value store with version tracking. /// /// `VersionedKVStore` provides HashMap-like semantics backed by a versioned journal that /// assigns a monotonically increasing version number to each write operation. This enables: -/// - Point-in-time recovery to any historical version +/// - Audit logs with version tracking for every write /// - Automatic journal rotation when high water mark is reached /// - Optional callbacks for post-rotation operations (e.g., remote backup) /// @@ -50,13 +66,10 @@ pub type RotationCallback = Box; /// // Insert with version tracking /// let v1 = store.insert("key1".to_string(), Value::from(42))?; /// let v2 = store.insert("key2".to_string(), Value::from("hello"))?; -/// -/// // Point-in-time recovery -/// let state_at_v1 = store.as_hashmap_at_version(v1)?; /// ``` pub struct VersionedKVStore { journal: MemMappedVersionedKVJournal, - cached_map: AHashMap, + cached_map: AHashMap, base_path: PathBuf, buffer_size: usize, high_water_mark_ratio: Option, @@ -99,7 +112,7 @@ impl VersionedKVStore { MemMappedVersionedKVJournal::new(path, buffer_size, 1, high_water_mark_ratio)? }; - let cached_map = journal.as_hashmap()?; + let cached_map = journal.as_hashmap_with_timestamps()?; Ok(Self { journal, @@ -136,7 +149,7 @@ impl VersionedKVStore { let base_path = path.to_path_buf(); let journal = MemMappedVersionedKVJournal::from_file(path, buffer_size, high_water_mark_ratio)?; - let cached_map = journal.as_hashmap()?; + let cached_map = journal.as_hashmap_with_timestamps()?; Ok(Self { journal, @@ -162,6 +175,14 @@ impl VersionedKVStore { /// This operation is O(1) as it reads from the in-memory cache. #[must_use] pub fn get(&self, key: &str) -> Option<&Value> { + self.cached_map.get(key).map(|tv| &tv.value) + } + + /// Get a value with its timestamp by key. + /// + /// This operation is O(1) as it reads from the in-memory cache. + #[must_use] + pub fn get_with_timestamp(&self, key: &str) -> Option<&TimestampedValue> { self.cached_map.get(key) } @@ -174,12 +195,18 @@ impl VersionedKVStore { pub fn insert(&mut self, key: String, value: Value) -> anyhow::Result { let version = if matches!(value, Value::Null) { // Inserting null is equivalent to deletion - let version = self.journal.delete_versioned(&key)?; + let (version, _timestamp) = self.journal.delete_versioned(&key)?; self.cached_map.remove(&key); version } else { - let version = self.journal.set_versioned(&key, &value)?; - self.cached_map.insert(key, value); + let (version, timestamp) = self.journal.set_versioned(&key, &value)?; + self.cached_map.insert( + key, + TimestampedValue { + value, + timestamp, + }, + ); version }; @@ -202,7 +229,7 @@ impl VersionedKVStore { return Ok(None); } - let version = self.journal.delete_versioned(key)?; + let (version, _timestamp) = self.journal.delete_versioned(key)?; self.cached_map.remove(key); // Check if rotation is needed @@ -237,25 +264,28 @@ impl VersionedKVStore { self.len() == 0 } - /// Get a reference to the current hash map. + /// Get a reference to the current hash map with timestamps. /// /// This operation is O(1) as it reads from the in-memory cache. 
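+  /// Timestamps record when each key was last written, in nanoseconds since the UNIX epoch.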
#[must_use] - pub fn as_hashmap(&self) -> &AHashMap { + pub fn as_hashmap_with_timestamps(&self) -> &AHashMap { &self.cached_map } - /// Reconstruct the hashmap at a specific version by replaying entries up to that version. + /// Get a reference to the current hash map (values only, without timestamps). /// - /// This allows point-in-time recovery to any historical version in the current journal. + /// Note: This method creates a temporary hashmap. For better performance, + /// consider using `get()` for individual lookups or `as_hashmap_with_timestamps()` + /// if you need the full map with timestamps. /// - /// # Errors - /// Returns an error if the journal cannot be decoded or the version is out of range. - pub fn as_hashmap_at_version( - &self, - target_version: u64, - ) -> anyhow::Result> { - self.journal.as_hashmap_at_version(target_version) + /// This operation is O(n) where n is the number of keys. + #[must_use] + pub fn as_hashmap(&self) -> AHashMap { + self + .cached_map + .iter() + .map(|(k, tv)| (k.clone(), tv.value.clone())) + .collect() } /// Get the current version number. @@ -293,6 +323,7 @@ impl VersionedKVStore { /// Manually trigger journal rotation. /// /// This will create a new journal with the current state compacted and archive the old journal. + /// The archived journal will be compressed using zlib to reduce storage size. /// Rotation typically happens automatically when the high water mark is reached, but this /// method allows manual control when needed. /// @@ -301,7 +332,7 @@ impl VersionedKVStore { pub fn rotate_journal(&mut self) -> anyhow::Result<()> { let rotation_version = self.journal.current_version(); - // Generate archived journal path with rotation version + // Generate archived journal path with rotation version (compressed) let archived_path = self.generate_archived_path(rotation_version); // Create new journal with rotated state @@ -310,14 +341,21 @@ impl VersionedKVStore { // Replace old journal with new one let old_journal = std::mem::replace(&mut self.journal, new_journal); - // Move old journal to archived location + // Move old journal to temporary location drop(old_journal); // Release mmap before moving file - std::fs::rename(&self.base_path, &archived_path)?; + let temp_uncompressed = self.base_path.with_extension("jrn.old"); + std::fs::rename(&self.base_path, &temp_uncompressed)?; // Rename new journal to base path let temp_path = self.base_path.with_extension("jrn.tmp"); std::fs::rename(&temp_path, &self.base_path)?; + // Compress the archived journal + compress_archived_journal(&temp_uncompressed, &archived_path)?; + + // Remove uncompressed version + std::fs::remove_file(&temp_uncompressed)?; + // Invoke rotation callback if set if let Some(ref mut callback) = self.rotation_callback { callback(&archived_path, &self.base_path, rotation_version); @@ -327,10 +365,11 @@ impl VersionedKVStore { } /// Generate the archived journal path for a given rotation version. + /// Archived journals use the .zz extension to indicate zlib compression. 
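+  /// For example, rotating `my_store.jrn` at version 1000 yields `my_store.jrn.v1000.zz`.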
fn generate_archived_path(&self, rotation_version: u64) -> PathBuf { let mut path = self.base_path.clone(); if let Some(file_name) = path.file_name() { - let new_name = format!("{}.v{}", file_name.to_string_lossy(), rotation_version); + let new_name = format!("{}.v{}.zz", file_name.to_string_lossy(), rotation_version); path.set_file_name(new_name); } path diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_recovery.rs new file mode 100644 index 00000000..30cc806e --- /dev/null +++ b/bd-resilient-kv/src/versioned_recovery.rs @@ -0,0 +1,321 @@ +// shared-core - bitdrift's common client/server libraries +// Copyright Bitdrift, Inc. All rights reserved. +// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + +use crate::kv_journal::TimestampedValue; +use ahash::AHashMap; +use bd_bonjson::Value; +use bd_bonjson::decoder::from_slice; +use flate2::read::ZlibDecoder; +use std::io::Read; + +/// A utility for recovering state at arbitrary versions from raw journal data. +/// +/// This utility operates on raw byte slices from versioned journals and can reconstruct +/// the key-value state at any historical version by replaying journal entries. +/// +/// Supports both compressed (zlib) and uncompressed journals. Compressed journals are +/// automatically detected and decompressed transparently. +/// +/// # Usage +/// +/// ```ignore +/// use bd_resilient_kv::VersionedRecovery; +/// +/// // Load journal data as byte slices (may be compressed or uncompressed) +/// let archived_journal = std::fs::read("store.jrn.v30000.zz")?; // Compressed +/// let active_journal = std::fs::read("store.jrn")?; // Uncompressed +/// +/// // Create recovery utility with both journals +/// let recovery = VersionedRecovery::new(vec![&archived_journal, &active_journal])?; +/// +/// // Recover state at specific version +/// let state_at_25000 = recovery.recover_at_version(25000)?; +/// ``` +#[derive(Debug)] +pub struct VersionedRecovery { + journals: Vec, +} + +#[derive(Debug)] +struct JournalInfo { + data: Vec, + base_version: u64, + max_version: u64, +} + +impl VersionedRecovery { + /// Create a new recovery utility from a list of journal byte slices. + /// + /// The journals should be provided in chronological order (oldest to newest). + /// Each journal must be a valid versioned journal (VERSION 2 format). + /// Journals may be compressed with zlib or uncompressed - decompression is automatic. + /// + /// # Errors + /// + /// Returns an error if any journal is invalid or cannot be parsed. + pub fn new(journals: Vec<&[u8]>) -> anyhow::Result { + let mut journal_infos = Vec::new(); + + for data in journals { + // Detect and decompress if needed + let decompressed = decompress_if_needed(data)?; + let (base_version, max_version) = extract_version_range(&decompressed)?; + journal_infos.push(JournalInfo { + data: decompressed, + base_version, + max_version, + }); + } + + Ok(Self { + journals: journal_infos, + }) + } + + /// Recover the key-value state at a specific version. + /// + /// This method replays all journal entries from all provided journals up to and including + /// the target version, reconstructing the exact state at that point in time. 
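+  /// Journals whose `base_version` is already greater than the target are not consulted.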
+ /// + /// # Arguments + /// + /// * `target_version` - The version to recover state at + /// + /// # Returns + /// + /// A hashmap containing all key-value pairs with their timestamps as they existed at the + /// target version. + /// + /// # Errors + /// + /// Returns an error if: + /// - The target version is not found in any journal + /// - Journal data is corrupted or invalid + pub fn recover_at_version( + &self, + target_version: u64, + ) -> anyhow::Result> { + let mut map = AHashMap::new(); + + // Find all journals that might contain entries up to target version + for journal in &self.journals { + // Skip journals that start after our target + if journal.base_version > target_version { + break; + } + + // Replay entries from this journal + replay_journal_to_version(&journal.data, target_version, &mut map)?; + + // If this journal contains the target version, we're done + if journal.max_version >= target_version { + break; + } + } + + Ok(map) + } + + /// Get the range of versions available in the recovery utility. + /// + /// Returns (`min_version`, `max_version`) tuple representing the earliest and latest + /// versions that can be recovered. + #[must_use] + pub fn version_range(&self) -> Option<(u64, u64)> { + if self.journals.is_empty() { + return None; + } + + let min = self.journals.first().map(|j| j.base_version)?; + let max = self.journals.last().map(|j| j.max_version)?; + Some((min, max)) + } + + /// Get the current state (at the latest version). + /// + /// # Errors + /// + /// Returns an error if journal data is corrupted or invalid. + pub fn recover_current(&self) -> anyhow::Result> { + let mut map = AHashMap::new(); + + for journal in &self.journals { + replay_journal_to_version(&journal.data, u64::MAX, &mut map)?; + } + + Ok(map) + } +} + +/// Decompress journal data if it's zlib-compressed, otherwise return as-is. +/// +/// Detection: Try to read the header. If it's a valid journal header (format version at offset 0), +/// it's uncompressed. Otherwise, attempt zlib decompression. +fn decompress_if_needed(data: &[u8]) -> anyhow::Result> { + const HEADER_SIZE: usize = 16; + + // Check if data looks like a valid uncompressed journal + if data.len() >= HEADER_SIZE { + // Read format version (first 8 bytes as u64 little-endian) + let version_bytes: [u8; 8] = data[0 .. 8] + .try_into() + .map_err(|_| anyhow::anyhow!("Failed to read version bytes"))?; + let format_version = u64::from_le_bytes(version_bytes); + + // If format version is 1 or 2, it's likely uncompressed + if format_version == 1 || format_version == 2 { + return Ok(data.to_vec()); + } + } + + // Try to decompress as zlib + let mut decoder = ZlibDecoder::new(data); + let mut decompressed = Vec::new(); + decoder.read_to_end(&mut decompressed)?; + + Ok(decompressed) +} + +/// Extract the base version and maximum version from a journal. 
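+///
+/// The base version comes from the metadata object (the first array entry); the maximum is
+/// found by scanning the `v` field of every subsequent entry.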
+fn extract_version_range(buffer: &[u8]) -> anyhow::Result<(u64, u64)> { + let array = read_bonjson_payload(buffer)?; + + // Extract base_version from metadata (default to 1 if not found) + let base_version = if let Value::Array(entries) = &array + && let Some(Value::Object(obj)) = entries.first() + { + if let Some(Value::Unsigned(base)) = obj.get("base_version") { + *base + } else if let Some(Value::Signed(base)) = obj.get("base_version") { + #[allow(clippy::cast_sign_loss)] + (*base as u64) + } else { + 1 // Default to 1 for compatibility + } + } else { + anyhow::bail!("Failed to extract metadata from journal"); + }; + + // Find the maximum version by scanning all entries + let mut max_version = base_version; + if let Value::Array(entries) = array { + for (index, entry) in entries.iter().enumerate() { + if index == 0 { + continue; // Skip metadata + } + + if let Value::Object(obj) = entry { + if let Some(Value::Unsigned(v)) = obj.get("v") { + max_version = max_version.max(*v); + } else if let Some(Value::Signed(v)) = obj.get("v") { + #[allow(clippy::cast_sign_loss)] + { + max_version = max_version.max(*v as u64); + } + } + } + } + } + + Ok((base_version, max_version)) +} + +/// Replay journal entries up to a target version. +fn replay_journal_to_version( + buffer: &[u8], + target_version: u64, + map: &mut AHashMap, +) -> anyhow::Result<()> { + let array = read_bonjson_payload(buffer)?; + + if let Value::Array(entries) = array { + for (index, entry) in entries.iter().enumerate() { + // Skip metadata (first entry) + if index == 0 { + continue; + } + + if let Value::Object(obj) = entry { + // Check version + let entry_version = if let Some(Value::Unsigned(v)) = obj.get("v") { + *v + } else if let Some(Value::Signed(v)) = obj.get("v") { + #[allow(clippy::cast_sign_loss)] + (*v as u64) + } else { + continue; // Skip entries without version + }; + + // Only apply entries up to target version + if entry_version > target_version { + break; + } + + // Extract timestamp + let timestamp = if let Some(Value::Unsigned(t)) = obj.get("t") { + *t + } else if let Some(Value::Signed(t)) = obj.get("t") { + #[allow(clippy::cast_sign_loss)] + (*t as u64) + } else { + 0 // Default to 0 if not found (shouldn't happen in v2 format) + }; + + // Extract key and operation + if let Some(Value::String(key)) = obj.get("k") + && let Some(operation) = obj.get("o") + { + if operation.is_null() { + map.remove(key); + } else { + map.insert( + key.clone(), + TimestampedValue { + value: operation.clone(), + timestamp, + }, + ); + } + } + } + } + } + + Ok(()) +} + +/// Read the bonjson payload from a journal buffer. +fn read_bonjson_payload(buffer: &[u8]) -> anyhow::Result { + const HEADER_SIZE: usize = 16; + const ARRAY_BEGIN: usize = 16; + + if buffer.len() < HEADER_SIZE { + anyhow::bail!("Buffer too small: {}", buffer.len()); + } + + // Read position from header + let position_bytes: [u8; 8] = buffer[8 .. 16] + .try_into() + .map_err(|_| anyhow::anyhow!("Failed to read position"))?; + #[allow(clippy::cast_possible_truncation)] + let position = u64::from_le_bytes(position_bytes) as usize; + + if position > buffer.len() { + anyhow::bail!( + "Invalid position: {position}, buffer size: {}", + buffer.len() + ); + } + + let slice_to_decode = &buffer[ARRAY_BEGIN .. position]; + + match from_slice(slice_to_decode) { + Ok((_, decoded)) => Ok(decoded), + Err(bd_bonjson::decoder::DecodeError::Partial { partial_value, .. 
}) => Ok(partial_value), + Err(e) => anyhow::bail!("Failed to decode buffer: {e:?}"), + } +} From bca695fdef3a0171673ddbd250f1f9466c148754 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Tue, 4 Nov 2025 19:32:52 -0800 Subject: [PATCH 03/66] bd-resilient-kv: add snapshot cleanup utility for archived journals Add SnapshotCleanup utility to manage disk space by removing old archived journal snapshots based on version thresholds or retention counts. Supports two cleanup strategies: version-based (keep snapshots >= min version) and count-based (keep N most recent). Includes comprehensive test coverage with 15 tests covering various scenarios including empty directories, edge cases, and multi-journal isolation. --- bd-resilient-kv/README.md | 78 ++++ bd-resilient-kv/src/lib.rs | 2 + bd-resilient-kv/src/snapshot_cleanup.rs | 252 +++++++++++++ bd-resilient-kv/src/tests/mod.rs | 1 + .../src/tests/snapshot_cleanup_test.rs | 332 ++++++++++++++++++ .../src/tests/versioned_kv_store_test.rs | 17 +- .../src/tests/versioned_recovery_test.rs | 54 ++- bd-resilient-kv/src/versioned_kv_store.rs | 12 +- 8 files changed, 720 insertions(+), 28 deletions(-) create mode 100644 bd-resilient-kv/src/snapshot_cleanup.rs create mode 100644 bd-resilient-kv/src/tests/snapshot_cleanup_test.rs diff --git a/bd-resilient-kv/README.md b/bd-resilient-kv/README.md index caa9ca87..33475df1 100644 --- a/bd-resilient-kv/README.md +++ b/bd-resilient-kv/README.md @@ -570,6 +570,84 @@ fn main() -> anyhow::Result<()> { - Only archived journals are compressed during rotation - No configuration needed - compression is automatic +### Snapshot Cleanup Management + +**SnapshotCleanup** provides utilities for managing disk space by cleaning up old archived journals: + +```rust +use bd_resilient_kv::SnapshotCleanup; + +fn main() -> anyhow::Result<()> { + // Create cleanup utility for your journal + let cleanup = SnapshotCleanup::new("my_store.jrn")?; + + // List all archived snapshots + let snapshots = cleanup.list_snapshots()?; + for snapshot in &snapshots { + println!("Version: {}, Size: {} bytes, Path: {:?}", + snapshot.version, snapshot.size_bytes, snapshot.path); + } + + // Strategy 1: Remove snapshots older than a specific version + // (e.g., your system determined you need to keep data back to version 5000) + let removed = cleanup.cleanup_before_version(5000)?; + println!("Removed {} old snapshots", removed.len()); + + // Strategy 2: Keep only the N most recent snapshots + let removed = cleanup.cleanup_keep_recent(10)?; + println!("Removed {} snapshots, kept 10 most recent", removed.len()); + + // Check disk usage + let total_size = cleanup.total_snapshot_size()?; + println!("Total snapshot size: {} bytes", total_size); + + // Get version range + if let Some(oldest) = cleanup.oldest_snapshot_version()? { + if let Some(newest) = cleanup.newest_snapshot_version()? 
{ + println!("Snapshots range from version {} to {}", oldest, newest); + } + } + + Ok(()) +} +``` + +**Key Features**: +- **Version-based cleanup**: Remove snapshots before a specific version +- **Count-based cleanup**: Keep only N most recent snapshots +- **Safe operations**: Only removes compressed archives (`.zz` files), never active journals +- **Disk space monitoring**: Query total size and version ranges +- **Per-journal isolation**: Each cleanup instance only manages its own journal's snapshots + +**Integration with VersionedKVStore**: +```rust +use bd_resilient_kv::{VersionedKVStore, SnapshotCleanup}; +use bd_bonjson::Value; + +fn main() -> anyhow::Result<()> { + // Your application logic determines minimum required version + let min_version_from_external_system = get_minimum_required_version(); + + // Create store + let mut store = VersionedKVStore::new("my_store.jrn", 1024 * 1024, None)?; + + // Perform operations... + store.insert("key".to_string(), Value::from(42))?; + + // Periodically clean up old snapshots + let cleanup = SnapshotCleanup::new("my_store.jrn")?; + cleanup.cleanup_before_version(min_version_from_external_system)?; + + Ok(()) +} + +fn get_minimum_required_version() -> u64 { + // Your external system (e.g., backup service, replication manager) + // tells you how far back you need to maintain history + 5000 +} +``` + ### Custom Buffer Sizes Choose buffer sizes based on your use case: diff --git a/bd-resilient-kv/src/lib.rs b/bd-resilient-kv/src/lib.rs index 2b4c1732..a2195527 100644 --- a/bd-resilient-kv/src/lib.rs +++ b/bd-resilient-kv/src/lib.rs @@ -19,6 +19,7 @@ mod tests; pub mod kv_journal; pub mod kv_store; +pub mod snapshot_cleanup; pub mod versioned_kv_store; pub mod versioned_recovery; @@ -31,5 +32,6 @@ pub use kv_journal::{ VersionedKVJournal, }; pub use kv_store::KVStore; +pub use snapshot_cleanup::{SnapshotCleanup, SnapshotInfo}; pub use versioned_kv_store::{RotationCallback, VersionedKVStore}; pub use versioned_recovery::VersionedRecovery; diff --git a/bd-resilient-kv/src/snapshot_cleanup.rs b/bd-resilient-kv/src/snapshot_cleanup.rs new file mode 100644 index 00000000..f65b198c --- /dev/null +++ b/bd-resilient-kv/src/snapshot_cleanup.rs @@ -0,0 +1,252 @@ +// shared-core - bitdrift's common client/server libraries +// Copyright Bitdrift, Inc. All rights reserved. +// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + +use std::fs; +use std::path::{Path, PathBuf}; + +/// Information about an archived journal snapshot. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SnapshotInfo { + /// Full path to the snapshot file + pub path: PathBuf, + /// Version number extracted from the snapshot filename + pub version: u64, + /// File size in bytes + pub size_bytes: u64, +} + +/// A utility for managing cleanup of archived journal snapshots. +/// +/// `SnapshotCleanup` provides functionality to discover and remove old archived journals +/// based on version thresholds. This is useful for managing disk space when your system +/// determines how far back in history you need to maintain snapshots. 
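+/// Only compressed `.zz` archives are considered; the active journal is never removed.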
+/// +/// Archived journals follow the naming pattern: `{base_name}.v{version}.zz` +/// For example: `my_store.jrn.v1000.zz`, `my_store.jrn.v2000.zz` +/// +/// # Example +/// ```ignore +/// use bd_resilient_kv::SnapshotCleanup; +/// +/// // Create cleanup utility for a journal +/// let cleanup = SnapshotCleanup::new("my_store.jrn")?; +/// +/// // List all archived snapshots +/// let snapshots = cleanup.list_snapshots()?; +/// for snapshot in &snapshots { +/// println!("Version: {}, Size: {} bytes", snapshot.version, snapshot.size_bytes); +/// } +/// +/// // Remove snapshots older than version 5000 +/// let removed = cleanup.cleanup_before_version(5000)?; +/// println!("Removed {} snapshots", removed.len()); +/// ``` +pub struct SnapshotCleanup { + directory: PathBuf, + base_filename: String, +} + +impl SnapshotCleanup { + /// Create a new `SnapshotCleanup` utility for the given journal path. + /// + /// The journal path should be the same path used to create the `VersionedKVStore`. + /// For example, if you created your store with `"my_store.jrn"`, pass the same path here. + /// + /// # Arguments + /// * `journal_path` - Path to the journal file (e.g., "`my_store.jrn`") + /// + /// # Errors + /// Returns an error if the path is invalid or cannot be canonicalized. + pub fn new>(journal_path: P) -> anyhow::Result { + let path = journal_path.as_ref(); + + let directory = path + .parent() + .map_or_else(|| PathBuf::from("."), std::path::Path::to_path_buf); + + let base_filename = path + .file_name() + .ok_or_else(|| anyhow::anyhow!("Invalid journal path: no filename"))? + .to_string_lossy() + .to_string(); + + Ok(Self { + directory, + base_filename, + }) + } + + /// List all archived snapshots for this journal. + /// + /// Returns a vector of `SnapshotInfo` containing details about each archived journal, + /// sorted by version number in ascending order. + /// + /// # Errors + /// Returns an error if the directory cannot be read or if file metadata cannot be accessed. + pub fn list_snapshots(&self) -> anyhow::Result> { + let mut snapshots = Vec::new(); + + // Read directory entries + if !self.directory.exists() { + return Ok(snapshots); + } + + let entries = fs::read_dir(&self.directory)?; + + for entry in entries { + let entry = entry?; + let path = entry.path(); + + // Check if this is an archived snapshot for our journal + if let Some(version) = self.extract_version_from_path(&path) { + let metadata = entry.metadata()?; + snapshots.push(SnapshotInfo { + path: path.clone(), + version, + size_bytes: metadata.len(), + }); + } + } + + // Sort by version number + snapshots.sort_by_key(|s| s.version); + + Ok(snapshots) + } + + /// Remove all archived snapshots with versions strictly less than the specified version. + /// + /// This keeps snapshots at or after the minimum version, removing only older ones. + /// + /// # Arguments + /// * `min_version` - Minimum version to keep (exclusive). Snapshots with versions less than this + /// will be removed. + /// + /// # Returns + /// Returns a vector of `SnapshotInfo` for the snapshots that were removed. + /// + /// # Errors + /// Returns an error if any snapshot cannot be removed. If an error occurs while removing + /// a snapshot, the operation stops and returns the error. Some snapshots may have been + /// removed before the error occurred. 
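+  /// Re-running the cleanup after a failure is safe; snapshots that were already removed are
+  /// simply no longer listed.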
+ /// + /// # Example + /// ```ignore + /// // Keep snapshots at version 5000 and later, remove older ones + /// let removed = cleanup.cleanup_before_version(5000)?; + /// ``` + pub fn cleanup_before_version(&self, min_version: u64) -> anyhow::Result> { + let snapshots = self.list_snapshots()?; + let mut removed = Vec::new(); + + for snapshot in snapshots { + if snapshot.version < min_version { + fs::remove_file(&snapshot.path)?; + removed.push(snapshot); + } + } + + Ok(removed) + } + + /// Remove all archived snapshots except the most recent N versions. + /// + /// This keeps the N newest snapshots and removes all older ones. + /// + /// # Arguments + /// * `keep_count` - Number of most recent snapshots to keep + /// + /// # Returns + /// Returns a vector of `SnapshotInfo` for the snapshots that were removed. + /// + /// # Errors + /// Returns an error if any snapshot cannot be removed. + /// + /// # Example + /// ```ignore + /// // Keep only the 5 most recent snapshots + /// let removed = cleanup.cleanup_keep_recent(5)?; + /// ``` + pub fn cleanup_keep_recent(&self, keep_count: usize) -> anyhow::Result> { + let mut snapshots = self.list_snapshots()?; + + if snapshots.len() <= keep_count { + return Ok(Vec::new()); + } + + // Sort by version descending to get most recent first + snapshots.sort_by_key(|s| std::cmp::Reverse(s.version)); + + // Remove all except the most recent keep_count + let mut removed = Vec::new(); + for snapshot in snapshots.into_iter().skip(keep_count) { + fs::remove_file(&snapshot.path)?; + removed.push(snapshot); + } + + // Sort removed list by version ascending for consistency + removed.sort_by_key(|s| s.version); + + Ok(removed) + } + + /// Calculate the total disk space used by all archived snapshots. + /// + /// # Errors + /// Returns an error if snapshots cannot be listed. + pub fn total_snapshot_size(&self) -> anyhow::Result { + let snapshots = self.list_snapshots()?; + Ok(snapshots.iter().map(|s| s.size_bytes).sum()) + } + + /// Get the oldest snapshot version. + /// + /// Returns `None` if there are no archived snapshots. + /// + /// # Errors + /// Returns an error if snapshots cannot be listed. + pub fn oldest_snapshot_version(&self) -> anyhow::Result> { + let snapshots = self.list_snapshots()?; + Ok(snapshots.first().map(|s| s.version)) + } + + /// Get the newest snapshot version. + /// + /// Returns `None` if there are no archived snapshots. + /// + /// # Errors + /// Returns an error if snapshots cannot be listed. + pub fn newest_snapshot_version(&self) -> anyhow::Result> { + let snapshots = self.list_snapshots()?; + Ok(snapshots.last().map(|s| s.version)) + } + + /// Extract version number from an archived journal path. + /// + /// Returns `Some(version)` if the path matches the pattern `{base_name}.v{version}.zz`, + /// otherwise returns `None`. 
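+  /// For example, `my_store.jrn.v1000.zz` yields `Some(1000)`, while the active journal
+  /// `my_store.jrn` or an uncompressed `my_store.jrn.v1000` yields `None`.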
+ fn extract_version_from_path(&self, path: &Path) -> Option { + let filename = path.file_name()?.to_string_lossy(); + + // Check if filename starts with our base filename + if !filename.starts_with(&self.base_filename) { + return None; + } + + // Pattern: {base_filename}.v{version}.zz + let suffix = filename.strip_prefix(&self.base_filename)?; + + // Should start with ".v" + let version_part = suffix.strip_prefix(".v")?; + + // Should end with ".zz" + let version_str = version_part.strip_suffix(".zz")?; + + // Parse version number + version_str.parse::().ok() + } +} diff --git a/bd-resilient-kv/src/tests/mod.rs b/bd-resilient-kv/src/tests/mod.rs index 7d838890..3972eea9 100644 --- a/bd-resilient-kv/src/tests/mod.rs +++ b/bd-resilient-kv/src/tests/mod.rs @@ -33,5 +33,6 @@ pub mod error_handling_test; pub mod kv_store_test; pub mod kv_test; pub mod memmapped_test; +pub mod snapshot_cleanup_test; pub mod versioned_kv_store_test; pub mod versioned_recovery_test; diff --git a/bd-resilient-kv/src/tests/snapshot_cleanup_test.rs b/bd-resilient-kv/src/tests/snapshot_cleanup_test.rs new file mode 100644 index 00000000..66833ca3 --- /dev/null +++ b/bd-resilient-kv/src/tests/snapshot_cleanup_test.rs @@ -0,0 +1,332 @@ +// shared-core - bitdrift's common client/server libraries +// Copyright Bitdrift, Inc. All rights reserved. +// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + +use crate::snapshot_cleanup::{SnapshotCleanup, SnapshotInfo}; +use std::fs; +use tempfile::TempDir; + +fn create_mock_snapshot(dir: &TempDir, base_name: &str, version: u64, size: usize) -> SnapshotInfo { + let filename = format!("{}.v{}.zz", base_name, version); + let path = dir.path().join(&filename); + + // Create file with specified size + let data = vec![0u8; size]; + fs::write(&path, data).unwrap(); + + SnapshotInfo { + path, + version, + size_bytes: size as u64, + } +} + +#[test] +fn list_snapshots_empty_directory() { + let temp_dir = TempDir::new().unwrap(); + let journal_path = temp_dir.path().join("test.jrn"); + + let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); + let snapshots = cleanup.list_snapshots().unwrap(); + + assert!(snapshots.is_empty()); +} + +#[test] +fn list_snapshots_with_multiple_versions() { + let temp_dir = TempDir::new().unwrap(); + let journal_path = temp_dir.path().join("test.jrn"); + + // Create snapshots with different versions + create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); + create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); + create_mock_snapshot(&temp_dir, "test.jrn", 1500, 150); + create_mock_snapshot(&temp_dir, "test.jrn", 3000, 300); + + let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); + let snapshots = cleanup.list_snapshots().unwrap(); + + assert_eq!(snapshots.len(), 4); + // Should be sorted by version + assert_eq!(snapshots[0].version, 1000); + assert_eq!(snapshots[1].version, 1500); + assert_eq!(snapshots[2].version, 2000); + assert_eq!(snapshots[3].version, 3000); + + // Verify sizes + assert_eq!(snapshots[0].size_bytes, 100); + assert_eq!(snapshots[1].size_bytes, 150); + assert_eq!(snapshots[2].size_bytes, 200); + assert_eq!(snapshots[3].size_bytes, 300); +} + +#[test] +fn list_snapshots_ignores_other_files() { + let temp_dir = TempDir::new().unwrap(); + let journal_path = temp_dir.path().join("test.jrn"); + + // Create valid snapshots + create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); 
+ create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); + + // Create files that should be ignored + fs::write(temp_dir.path().join("test.jrn"), b"active journal").unwrap(); + fs::write(temp_dir.path().join("other.jrn.v1000.zz"), b"other journal").unwrap(); + fs::write(temp_dir.path().join("test.jrn.v1000"), b"uncompressed").unwrap(); + fs::write(temp_dir.path().join("test.jrn.backup"), b"backup").unwrap(); + + let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); + let snapshots = cleanup.list_snapshots().unwrap(); + + assert_eq!(snapshots.len(), 2); + assert_eq!(snapshots[0].version, 1000); + assert_eq!(snapshots[1].version, 2000); +} + +#[test] +fn cleanup_before_version_removes_old_snapshots() { + let temp_dir = TempDir::new().unwrap(); + let journal_path = temp_dir.path().join("test.jrn"); + + create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); + create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); + create_mock_snapshot(&temp_dir, "test.jrn", 3000, 300); + create_mock_snapshot(&temp_dir, "test.jrn", 4000, 400); + + let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); + + // Remove snapshots before version 3000 (keep 3000 and 4000) + let removed = cleanup.cleanup_before_version(3000).unwrap(); + + assert_eq!(removed.len(), 2); + assert_eq!(removed[0].version, 1000); + assert_eq!(removed[1].version, 2000); + + // Verify remaining snapshots + let remaining = cleanup.list_snapshots().unwrap(); + assert_eq!(remaining.len(), 2); + assert_eq!(remaining[0].version, 3000); + assert_eq!(remaining[1].version, 4000); + + // Verify files are actually deleted + assert!(!removed[0].path.exists()); + assert!(!removed[1].path.exists()); + assert!(remaining[0].path.exists()); + assert!(remaining[1].path.exists()); +} + +#[test] +fn cleanup_before_version_keeps_all_if_min_version_too_low() { + let temp_dir = TempDir::new().unwrap(); + let journal_path = temp_dir.path().join("test.jrn"); + + create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); + create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); + + let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); + + // Min version is lower than all snapshots + let removed = cleanup.cleanup_before_version(500).unwrap(); + + assert!(removed.is_empty()); + + let remaining = cleanup.list_snapshots().unwrap(); + assert_eq!(remaining.len(), 2); +} + +#[test] +fn cleanup_before_version_removes_all_if_min_version_too_high() { + let temp_dir = TempDir::new().unwrap(); + let journal_path = temp_dir.path().join("test.jrn"); + + create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); + create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); + + let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); + + // Min version is higher than all snapshots + let removed = cleanup.cleanup_before_version(5000).unwrap(); + + assert_eq!(removed.len(), 2); + + let remaining = cleanup.list_snapshots().unwrap(); + assert!(remaining.is_empty()); +} + +#[test] +fn cleanup_keep_recent_removes_old_snapshots() { + let temp_dir = TempDir::new().unwrap(); + let journal_path = temp_dir.path().join("test.jrn"); + + create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); + create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); + create_mock_snapshot(&temp_dir, "test.jrn", 3000, 300); + create_mock_snapshot(&temp_dir, "test.jrn", 4000, 400); + create_mock_snapshot(&temp_dir, "test.jrn", 5000, 500); + + let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); + + // Keep only the 2 most recent snapshots + let removed = cleanup.cleanup_keep_recent(2).unwrap(); 
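+  // With five snapshots and keep_count = 2, the three oldest (1000, 2000, 3000) are removed.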
+ + assert_eq!(removed.len(), 3); + assert_eq!(removed[0].version, 1000); + assert_eq!(removed[1].version, 2000); + assert_eq!(removed[2].version, 3000); + + // Verify remaining snapshots + let remaining = cleanup.list_snapshots().unwrap(); + assert_eq!(remaining.len(), 2); + assert_eq!(remaining[0].version, 4000); + assert_eq!(remaining[1].version, 5000); +} + +#[test] +fn cleanup_keep_recent_keeps_all_if_count_too_high() { + let temp_dir = TempDir::new().unwrap(); + let journal_path = temp_dir.path().join("test.jrn"); + + create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); + create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); + + let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); + + // Keep count is higher than total snapshots + let removed = cleanup.cleanup_keep_recent(5).unwrap(); + + assert!(removed.is_empty()); + + let remaining = cleanup.list_snapshots().unwrap(); + assert_eq!(remaining.len(), 2); +} + +#[test] +fn cleanup_keep_recent_with_zero_removes_all() { + let temp_dir = TempDir::new().unwrap(); + let journal_path = temp_dir.path().join("test.jrn"); + + create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); + create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); + + let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); + + let removed = cleanup.cleanup_keep_recent(0).unwrap(); + + assert_eq!(removed.len(), 2); + + let remaining = cleanup.list_snapshots().unwrap(); + assert!(remaining.is_empty()); +} + +#[test] +fn total_snapshot_size_calculates_correctly() { + let temp_dir = TempDir::new().unwrap(); + let journal_path = temp_dir.path().join("test.jrn"); + + create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); + create_mock_snapshot(&temp_dir, "test.jrn", 2000, 250); + create_mock_snapshot(&temp_dir, "test.jrn", 3000, 150); + + let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); + let total_size = cleanup.total_snapshot_size().unwrap(); + + assert_eq!(total_size, 500); // 100 + 250 + 150 +} + +#[test] +fn total_snapshot_size_returns_zero_for_empty() { + let temp_dir = TempDir::new().unwrap(); + let journal_path = temp_dir.path().join("test.jrn"); + + let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); + let total_size = cleanup.total_snapshot_size().unwrap(); + + assert_eq!(total_size, 0); +} + +#[test] +fn oldest_and_newest_snapshot_versions() { + let temp_dir = TempDir::new().unwrap(); + let journal_path = temp_dir.path().join("test.jrn"); + + create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); + create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); + create_mock_snapshot(&temp_dir, "test.jrn", 3000, 300); + + let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); + + assert_eq!(cleanup.oldest_snapshot_version().unwrap(), Some(1000)); + assert_eq!(cleanup.newest_snapshot_version().unwrap(), Some(3000)); +} + +#[test] +fn oldest_and_newest_return_none_for_empty() { + let temp_dir = TempDir::new().unwrap(); + let journal_path = temp_dir.path().join("test.jrn"); + + let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); + + assert_eq!(cleanup.oldest_snapshot_version().unwrap(), None); + assert_eq!(cleanup.newest_snapshot_version().unwrap(), None); +} + +#[test] +fn works_with_subdirectory_paths() { + let temp_dir = TempDir::new().unwrap(); + let subdir = temp_dir.path().join("data"); + fs::create_dir(&subdir).unwrap(); + + let journal_path = subdir.join("store.jrn"); + + // Create snapshots in subdirectory + let filename1 = format!("store.jrn.v{}.zz", 1000); + let path1 = subdir.join(&filename1); + fs::write(&path1, 
vec![0u8; 100]).unwrap(); + + let filename2 = format!("store.jrn.v{}.zz", 2000); + let path2 = subdir.join(&filename2); + fs::write(&path2, vec![0u8; 200]).unwrap(); + + let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); + let snapshots = cleanup.list_snapshots().unwrap(); + + assert_eq!(snapshots.len(), 2); + assert_eq!(snapshots[0].version, 1000); + assert_eq!(snapshots[1].version, 2000); +} + +#[test] +fn cleanup_with_different_base_names() { + let temp_dir = TempDir::new().unwrap(); + + // Create snapshots for different journals + create_mock_snapshot(&temp_dir, "journal_a.jrn", 1000, 100); + create_mock_snapshot(&temp_dir, "journal_a.jrn", 2000, 200); + create_mock_snapshot(&temp_dir, "journal_b.jrn", 1000, 100); + create_mock_snapshot(&temp_dir, "journal_b.jrn", 2000, 200); + + // Cleanup for journal_a should only see journal_a snapshots + let cleanup_a = SnapshotCleanup::new(temp_dir.path().join("journal_a.jrn")).unwrap(); + let snapshots_a = cleanup_a.list_snapshots().unwrap(); + + assert_eq!(snapshots_a.len(), 2); + assert!( + snapshots_a + .iter() + .all(|s| s.path.to_string_lossy().contains("journal_a")) + ); + + // Cleanup for journal_b should only see journal_b snapshots + let cleanup_b = SnapshotCleanup::new(temp_dir.path().join("journal_b.jrn")).unwrap(); + let snapshots_b = cleanup_b.list_snapshots().unwrap(); + + assert_eq!(snapshots_b.len(), 2); + assert!( + snapshots_b + .iter() + .all(|s| s.path.to_string_lossy().contains("journal_b")) + ); +} diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index ad6e0950..3774e99e 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -375,7 +375,10 @@ fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { ); // Verify ordering is still correct - assert!(ts2_after > ts1_after, "Timestamp ordering should be preserved"); + assert!( + ts2_after > ts1_after, + "Timestamp ordering should be preserved" + ); Ok(()) } @@ -398,7 +401,7 @@ fn test_compression_during_rotation() -> anyhow::Result<()> { // Get current version before rotation (this is what will be used in the archive name) let rotation_version = store.current_version(); - + // Trigger rotation store.rotate_journal()?; @@ -445,7 +448,10 @@ fn test_compression_ratio() -> anyhow::Result<()> { // Insert highly compressible data let compressible_data = "A".repeat(500); for i in 0 .. 
10 { - store.insert(format!("key{}", i), Value::String(compressible_data.clone()))?; + store.insert( + format!("key{}", i), + Value::String(compressible_data.clone()), + )?; } let uncompressed_size = std::fs::metadata(&file_path)?.len(); @@ -491,9 +497,7 @@ fn test_multiple_rotations_with_compression() -> anyhow::Result<()> { // Verify all compressed archives exist for version in rotation_versions { - let archived_path = temp_dir - .path() - .join(format!("test.jrn.v{}.zz", version)); + let archived_path = temp_dir.path().join(format!("test.jrn.v{}.zz", version)); assert!( archived_path.exists(), "Compressed archive for version {} should exist", @@ -541,4 +545,3 @@ fn test_rotation_callback_receives_compressed_path() -> anyhow::Result<()> { Ok(()) } - diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index e2c48866..c92963c9 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -253,13 +253,22 @@ fn test_recovery_with_overwrites() -> anyhow::Result<()> { // Each version should show the value at that time let state_v1 = recovery.recover_at_version(v1)?; - assert_eq!(state_v1.get("key").map(|tv| &tv.value), Some(&Value::Signed(1))); + assert_eq!( + state_v1.get("key").map(|tv| &tv.value), + Some(&Value::Signed(1)) + ); let state_v2 = recovery.recover_at_version(v2)?; - assert_eq!(state_v2.get("key").map(|tv| &tv.value), Some(&Value::Signed(2))); + assert_eq!( + state_v2.get("key").map(|tv| &tv.value), + Some(&Value::Signed(2)) + ); let state_v3 = recovery.recover_at_version(v3)?; - assert_eq!(state_v3.get("key").map(|tv| &tv.value), Some(&Value::Signed(3))); + assert_eq!( + state_v3.get("key").map(|tv| &tv.value), + Some(&Value::Signed(3)) + ); Ok(()) } @@ -286,9 +295,18 @@ fn test_recovery_various_value_types() -> anyhow::Result<()> { state.get("string").map(|tv| &tv.value), Some(&Value::String("hello".to_string())) ); - assert_eq!(state.get("number").map(|tv| &tv.value), Some(&Value::Signed(42))); - assert_eq!(state.get("float").map(|tv| &tv.value), Some(&Value::Float(3.14))); - assert_eq!(state.get("bool").map(|tv| &tv.value), Some(&Value::Bool(true))); + assert_eq!( + state.get("number").map(|tv| &tv.value), + Some(&Value::Signed(42)) + ); + assert_eq!( + state.get("float").map(|tv| &tv.value), + Some(&Value::Float(3.14)) + ); + assert_eq!( + state.get("bool").map(|tv| &tv.value), + Some(&Value::Bool(true)) + ); Ok(()) } @@ -306,7 +324,7 @@ fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { // Get the current version before rotation (this will be used in the archive name) let archive_version = store.current_version(); - + // Rotate to create compressed archive store.rotate_journal()?; @@ -315,7 +333,9 @@ fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { store.sync()?; // Find the compressed archive (using the version at the time of rotation) - let archived_path = temp_dir.path().join(format!("test.jrn.v{}.zz", archive_version)); + let archived_path = temp_dir + .path() + .join(format!("test.jrn.v{}.zz", archive_version)); assert!(archived_path.exists(), "Compressed archive should exist"); // Read both journals @@ -375,8 +395,12 @@ fn test_recovery_from_multiple_compressed_archives() -> anyhow::Result<()> { store.sync()?; // Collect all journal data (2 compressed + 1 active) - let archive1_path = temp_dir.path().join(format!("test.jrn.v{}.zz", archive1_version)); - let archive2_path = 
temp_dir.path().join(format!("test.jrn.v{}.zz", archive2_version)); + let archive1_path = temp_dir + .path() + .join(format!("test.jrn.v{}.zz", archive1_version)); + let archive2_path = temp_dir + .path() + .join(format!("test.jrn.v{}.zz", archive2_version)); let archive1_data = std::fs::read(&archive1_path)?; let archive2_data = std::fs::read(&archive2_path)?; @@ -417,7 +441,9 @@ fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> { store.rotate_journal()?; // Get compressed archive - let compressed_archive_path = temp_dir.path().join(format!("test.jrn.v{}.zz", archive_version)); + let compressed_archive_path = temp_dir + .path() + .join(format!("test.jrn.v{}.zz", archive_version)); let compressed_data = std::fs::read(&compressed_archive_path)?; // Create uncompressed journal data manually @@ -455,12 +481,14 @@ fn test_recovery_decompression_transparent() -> anyhow::Result<()> { // Get archive version before rotation let archive_version = store.current_version(); - + // Rotate to compress store.rotate_journal()?; // Read compressed archive - let compressed_path = temp_dir.path().join(format!("test.jrn.v{}.zz", archive_version)); + let compressed_path = temp_dir + .path() + .join(format!("test.jrn.v{}.zz", archive_version)); let compressed_data = std::fs::read(&compressed_path)?; // Verify it's actually compressed (smaller) diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index 3163d556..bcfc4e93 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -8,8 +8,8 @@ use crate::kv_journal::{MemMappedVersionedKVJournal, TimestampedValue, VersionedKVJournal}; use ahash::AHashMap; use bd_bonjson::Value; -use flate2::write::ZlibEncoder; use flate2::Compression; +use flate2::write::ZlibEncoder; use std::io::Write; use std::path::{Path, PathBuf}; @@ -200,13 +200,9 @@ impl VersionedKVStore { version } else { let (version, timestamp) = self.journal.set_versioned(&key, &value)?; - self.cached_map.insert( - key, - TimestampedValue { - value, - timestamp, - }, - ); + self + .cached_map + .insert(key, TimestampedValue { value, timestamp }); version }; From 1c6498241d1481764c9b751b335a6ae62f1bfc39 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 06:31:20 -0800 Subject: [PATCH 04/66] make async --- Cargo.lock | 1 + bd-resilient-kv/Cargo.toml | 1 + bd-resilient-kv/src/snapshot_cleanup.rs | 37 ++-- .../src/tests/snapshot_cleanup_test.rs | 108 +++++----- .../src/tests/versioned_kv_store_test.rs | 191 +++++++++++------- .../src/tests/versioned_recovery_test.rs | 160 +++++++++------ bd-resilient-kv/src/versioned_kv_store.rs | 30 +-- bd-resilient-kv/src/versioned_recovery.rs | 23 +++ 8 files changed, 334 insertions(+), 217 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c6278fd9..5acb1e4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1111,6 +1111,7 @@ dependencies = [ "flate2", "memmap2", "tempfile", + "tokio", ] [[package]] diff --git a/bd-resilient-kv/Cargo.toml b/bd-resilient-kv/Cargo.toml index cad895f6..5b52539c 100644 --- a/bd-resilient-kv/Cargo.toml +++ b/bd-resilient-kv/Cargo.toml @@ -20,3 +20,4 @@ bd-workspace-hack.workspace = true bytes.workspace = true flate2.workspace = true memmap2.workspace = true +tokio.workspace = true diff --git a/bd-resilient-kv/src/snapshot_cleanup.rs b/bd-resilient-kv/src/snapshot_cleanup.rs index f65b198c..4b3668af 100644 --- a/bd-resilient-kv/src/snapshot_cleanup.rs +++ b/bd-resilient-kv/src/snapshot_cleanup.rs @@ -5,7 +5,6 @@ 
// LICENSE file or at: // https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt -use std::fs; use std::path::{Path, PathBuf}; /// Information about an archived journal snapshot. @@ -87,7 +86,7 @@ impl SnapshotCleanup { /// /// # Errors /// Returns an error if the directory cannot be read or if file metadata cannot be accessed. - pub fn list_snapshots(&self) -> anyhow::Result> { + pub async fn list_snapshots(&self) -> anyhow::Result> { let mut snapshots = Vec::new(); // Read directory entries @@ -95,15 +94,14 @@ impl SnapshotCleanup { return Ok(snapshots); } - let entries = fs::read_dir(&self.directory)?; + let mut entries = tokio::fs::read_dir(&self.directory).await?; - for entry in entries { - let entry = entry?; + while let Some(entry) = entries.next_entry().await? { let path = entry.path(); // Check if this is an archived snapshot for our journal if let Some(version) = self.extract_version_from_path(&path) { - let metadata = entry.metadata()?; + let metadata = entry.metadata().await?; snapshots.push(SnapshotInfo { path: path.clone(), version, @@ -139,13 +137,16 @@ impl SnapshotCleanup { /// // Keep snapshots at version 5000 and later, remove older ones /// let removed = cleanup.cleanup_before_version(5000)?; /// ``` - pub fn cleanup_before_version(&self, min_version: u64) -> anyhow::Result> { - let snapshots = self.list_snapshots()?; + pub async fn cleanup_before_version( + &self, + min_version: u64, + ) -> anyhow::Result> { + let snapshots = self.list_snapshots().await?; let mut removed = Vec::new(); for snapshot in snapshots { if snapshot.version < min_version { - fs::remove_file(&snapshot.path)?; + tokio::fs::remove_file(&snapshot.path).await?; removed.push(snapshot); } } @@ -171,8 +172,8 @@ impl SnapshotCleanup { /// // Keep only the 5 most recent snapshots /// let removed = cleanup.cleanup_keep_recent(5)?; /// ``` - pub fn cleanup_keep_recent(&self, keep_count: usize) -> anyhow::Result> { - let mut snapshots = self.list_snapshots()?; + pub async fn cleanup_keep_recent(&self, keep_count: usize) -> anyhow::Result> { + let mut snapshots = self.list_snapshots().await?; if snapshots.len() <= keep_count { return Ok(Vec::new()); @@ -184,7 +185,7 @@ impl SnapshotCleanup { // Remove all except the most recent keep_count let mut removed = Vec::new(); for snapshot in snapshots.into_iter().skip(keep_count) { - fs::remove_file(&snapshot.path)?; + tokio::fs::remove_file(&snapshot.path).await?; removed.push(snapshot); } @@ -198,8 +199,8 @@ impl SnapshotCleanup { /// /// # Errors /// Returns an error if snapshots cannot be listed. - pub fn total_snapshot_size(&self) -> anyhow::Result { - let snapshots = self.list_snapshots()?; + pub async fn total_snapshot_size(&self) -> anyhow::Result { + let snapshots = self.list_snapshots().await?; Ok(snapshots.iter().map(|s| s.size_bytes).sum()) } @@ -209,8 +210,8 @@ impl SnapshotCleanup { /// /// # Errors /// Returns an error if snapshots cannot be listed. - pub fn oldest_snapshot_version(&self) -> anyhow::Result> { - let snapshots = self.list_snapshots()?; + pub async fn oldest_snapshot_version(&self) -> anyhow::Result> { + let snapshots = self.list_snapshots().await?; Ok(snapshots.first().map(|s| s.version)) } @@ -220,8 +221,8 @@ impl SnapshotCleanup { /// /// # Errors /// Returns an error if snapshots cannot be listed. 
- pub fn newest_snapshot_version(&self) -> anyhow::Result> { - let snapshots = self.list_snapshots()?; + pub async fn newest_snapshot_version(&self) -> anyhow::Result> { + let snapshots = self.list_snapshots().await?; Ok(snapshots.last().map(|s| s.version)) } diff --git a/bd-resilient-kv/src/tests/snapshot_cleanup_test.rs b/bd-resilient-kv/src/tests/snapshot_cleanup_test.rs index 66833ca3..2af2bff6 100644 --- a/bd-resilient-kv/src/tests/snapshot_cleanup_test.rs +++ b/bd-resilient-kv/src/tests/snapshot_cleanup_test.rs @@ -24,19 +24,19 @@ fn create_mock_snapshot(dir: &TempDir, base_name: &str, version: u64, size: usiz } } -#[test] -fn list_snapshots_empty_directory() { +#[tokio::test] +async fn list_snapshots_empty_directory() { let temp_dir = TempDir::new().unwrap(); let journal_path = temp_dir.path().join("test.jrn"); let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - let snapshots = cleanup.list_snapshots().unwrap(); + let snapshots = cleanup.list_snapshots().await.unwrap(); assert!(snapshots.is_empty()); } -#[test] -fn list_snapshots_with_multiple_versions() { +#[tokio::test] +async fn list_snapshots_with_multiple_versions() { let temp_dir = TempDir::new().unwrap(); let journal_path = temp_dir.path().join("test.jrn"); @@ -47,7 +47,7 @@ fn list_snapshots_with_multiple_versions() { create_mock_snapshot(&temp_dir, "test.jrn", 3000, 300); let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - let snapshots = cleanup.list_snapshots().unwrap(); + let snapshots = cleanup.list_snapshots().await.unwrap(); assert_eq!(snapshots.len(), 4); // Should be sorted by version @@ -63,8 +63,8 @@ fn list_snapshots_with_multiple_versions() { assert_eq!(snapshots[3].size_bytes, 300); } -#[test] -fn list_snapshots_ignores_other_files() { +#[tokio::test] +async fn list_snapshots_ignores_other_files() { let temp_dir = TempDir::new().unwrap(); let journal_path = temp_dir.path().join("test.jrn"); @@ -79,15 +79,15 @@ fn list_snapshots_ignores_other_files() { fs::write(temp_dir.path().join("test.jrn.backup"), b"backup").unwrap(); let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - let snapshots = cleanup.list_snapshots().unwrap(); + let snapshots = cleanup.list_snapshots().await.unwrap(); assert_eq!(snapshots.len(), 2); assert_eq!(snapshots[0].version, 1000); assert_eq!(snapshots[1].version, 2000); } -#[test] -fn cleanup_before_version_removes_old_snapshots() { +#[tokio::test] +async fn cleanup_before_version_removes_old_snapshots() { let temp_dir = TempDir::new().unwrap(); let journal_path = temp_dir.path().join("test.jrn"); @@ -99,14 +99,14 @@ fn cleanup_before_version_removes_old_snapshots() { let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); // Remove snapshots before version 3000 (keep 3000 and 4000) - let removed = cleanup.cleanup_before_version(3000).unwrap(); + let removed = cleanup.cleanup_before_version(3000).await.unwrap(); assert_eq!(removed.len(), 2); assert_eq!(removed[0].version, 1000); assert_eq!(removed[1].version, 2000); // Verify remaining snapshots - let remaining = cleanup.list_snapshots().unwrap(); + let remaining = cleanup.list_snapshots().await.unwrap(); assert_eq!(remaining.len(), 2); assert_eq!(remaining[0].version, 3000); assert_eq!(remaining[1].version, 4000); @@ -118,8 +118,8 @@ fn cleanup_before_version_removes_old_snapshots() { assert!(remaining[1].path.exists()); } -#[test] -fn cleanup_before_version_keeps_all_if_min_version_too_low() { +#[tokio::test] +async fn cleanup_before_version_keeps_all_if_min_version_too_low() { let temp_dir = 
TempDir::new().unwrap(); let journal_path = temp_dir.path().join("test.jrn"); @@ -129,16 +129,16 @@ fn cleanup_before_version_keeps_all_if_min_version_too_low() { let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); // Min version is lower than all snapshots - let removed = cleanup.cleanup_before_version(500).unwrap(); + let removed = cleanup.cleanup_before_version(500).await.unwrap(); assert!(removed.is_empty()); - let remaining = cleanup.list_snapshots().unwrap(); + let remaining = cleanup.list_snapshots().await.unwrap(); assert_eq!(remaining.len(), 2); } -#[test] -fn cleanup_before_version_removes_all_if_min_version_too_high() { +#[tokio::test] +async fn cleanup_before_version_removes_all_if_min_version_too_high() { let temp_dir = TempDir::new().unwrap(); let journal_path = temp_dir.path().join("test.jrn"); @@ -148,16 +148,16 @@ fn cleanup_before_version_removes_all_if_min_version_too_high() { let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); // Min version is higher than all snapshots - let removed = cleanup.cleanup_before_version(5000).unwrap(); + let removed = cleanup.cleanup_before_version(5000).await.unwrap(); assert_eq!(removed.len(), 2); - let remaining = cleanup.list_snapshots().unwrap(); + let remaining = cleanup.list_snapshots().await.unwrap(); assert!(remaining.is_empty()); } -#[test] -fn cleanup_keep_recent_removes_old_snapshots() { +#[tokio::test] +async fn cleanup_keep_recent_removes_old_snapshots() { let temp_dir = TempDir::new().unwrap(); let journal_path = temp_dir.path().join("test.jrn"); @@ -170,7 +170,7 @@ fn cleanup_keep_recent_removes_old_snapshots() { let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); // Keep only the 2 most recent snapshots - let removed = cleanup.cleanup_keep_recent(2).unwrap(); + let removed = cleanup.cleanup_keep_recent(2).await.unwrap(); assert_eq!(removed.len(), 3); assert_eq!(removed[0].version, 1000); @@ -178,14 +178,14 @@ fn cleanup_keep_recent_removes_old_snapshots() { assert_eq!(removed[2].version, 3000); // Verify remaining snapshots - let remaining = cleanup.list_snapshots().unwrap(); + let remaining = cleanup.list_snapshots().await.unwrap(); assert_eq!(remaining.len(), 2); assert_eq!(remaining[0].version, 4000); assert_eq!(remaining[1].version, 5000); } -#[test] -fn cleanup_keep_recent_keeps_all_if_count_too_high() { +#[tokio::test] +async fn cleanup_keep_recent_keeps_all_if_count_too_high() { let temp_dir = TempDir::new().unwrap(); let journal_path = temp_dir.path().join("test.jrn"); @@ -195,16 +195,16 @@ fn cleanup_keep_recent_keeps_all_if_count_too_high() { let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); // Keep count is higher than total snapshots - let removed = cleanup.cleanup_keep_recent(5).unwrap(); + let removed = cleanup.cleanup_keep_recent(5).await.unwrap(); assert!(removed.is_empty()); - let remaining = cleanup.list_snapshots().unwrap(); + let remaining = cleanup.list_snapshots().await.unwrap(); assert_eq!(remaining.len(), 2); } -#[test] -fn cleanup_keep_recent_with_zero_removes_all() { +#[tokio::test] +async fn cleanup_keep_recent_with_zero_removes_all() { let temp_dir = TempDir::new().unwrap(); let journal_path = temp_dir.path().join("test.jrn"); @@ -213,16 +213,16 @@ fn cleanup_keep_recent_with_zero_removes_all() { let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - let removed = cleanup.cleanup_keep_recent(0).unwrap(); + let removed = cleanup.cleanup_keep_recent(0).await.unwrap(); assert_eq!(removed.len(), 2); - let remaining = cleanup.list_snapshots().unwrap(); + let 
remaining = cleanup.list_snapshots().await.unwrap(); assert!(remaining.is_empty()); } -#[test] -fn total_snapshot_size_calculates_correctly() { +#[tokio::test] +async fn total_snapshot_size_calculates_correctly() { let temp_dir = TempDir::new().unwrap(); let journal_path = temp_dir.path().join("test.jrn"); @@ -231,24 +231,24 @@ fn total_snapshot_size_calculates_correctly() { create_mock_snapshot(&temp_dir, "test.jrn", 3000, 150); let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - let total_size = cleanup.total_snapshot_size().unwrap(); + let total_size = cleanup.total_snapshot_size().await.unwrap(); assert_eq!(total_size, 500); // 100 + 250 + 150 } -#[test] -fn total_snapshot_size_returns_zero_for_empty() { +#[tokio::test] +async fn total_snapshot_size_returns_zero_for_empty() { let temp_dir = TempDir::new().unwrap(); let journal_path = temp_dir.path().join("test.jrn"); let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - let total_size = cleanup.total_snapshot_size().unwrap(); + let total_size = cleanup.total_snapshot_size().await.unwrap(); assert_eq!(total_size, 0); } -#[test] -fn oldest_and_newest_snapshot_versions() { +#[tokio::test] +async fn oldest_and_newest_snapshot_versions() { let temp_dir = TempDir::new().unwrap(); let journal_path = temp_dir.path().join("test.jrn"); @@ -258,23 +258,23 @@ fn oldest_and_newest_snapshot_versions() { let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - assert_eq!(cleanup.oldest_snapshot_version().unwrap(), Some(1000)); - assert_eq!(cleanup.newest_snapshot_version().unwrap(), Some(3000)); + assert_eq!(cleanup.oldest_snapshot_version().await.unwrap(), Some(1000)); + assert_eq!(cleanup.newest_snapshot_version().await.unwrap(), Some(3000)); } -#[test] -fn oldest_and_newest_return_none_for_empty() { +#[tokio::test] +async fn oldest_and_newest_return_none_for_empty() { let temp_dir = TempDir::new().unwrap(); let journal_path = temp_dir.path().join("test.jrn"); let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - assert_eq!(cleanup.oldest_snapshot_version().unwrap(), None); - assert_eq!(cleanup.newest_snapshot_version().unwrap(), None); + assert_eq!(cleanup.oldest_snapshot_version().await.unwrap(), None); + assert_eq!(cleanup.newest_snapshot_version().await.unwrap(), None); } -#[test] -fn works_with_subdirectory_paths() { +#[tokio::test] +async fn works_with_subdirectory_paths() { let temp_dir = TempDir::new().unwrap(); let subdir = temp_dir.path().join("data"); fs::create_dir(&subdir).unwrap(); @@ -291,15 +291,15 @@ fn works_with_subdirectory_paths() { fs::write(&path2, vec![0u8; 200]).unwrap(); let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - let snapshots = cleanup.list_snapshots().unwrap(); + let snapshots = cleanup.list_snapshots().await.unwrap(); assert_eq!(snapshots.len(), 2); assert_eq!(snapshots[0].version, 1000); assert_eq!(snapshots[1].version, 2000); } -#[test] -fn cleanup_with_different_base_names() { +#[tokio::test] +async fn cleanup_with_different_base_names() { let temp_dir = TempDir::new().unwrap(); // Create snapshots for different journals @@ -310,7 +310,7 @@ fn cleanup_with_different_base_names() { // Cleanup for journal_a should only see journal_a snapshots let cleanup_a = SnapshotCleanup::new(temp_dir.path().join("journal_a.jrn")).unwrap(); - let snapshots_a = cleanup_a.list_snapshots().unwrap(); + let snapshots_a = cleanup_a.list_snapshots().await.unwrap(); assert_eq!(snapshots_a.len(), 2); assert!( @@ -321,7 +321,7 @@ fn cleanup_with_different_base_names() { // Cleanup for 
journal_b should only see journal_b snapshots let cleanup_b = SnapshotCleanup::new(temp_dir.path().join("journal_b.jrn")).unwrap(); - let snapshots_b = cleanup_b.list_snapshots().unwrap(); + let snapshots_b = cleanup_b.list_snapshots().await.unwrap(); assert_eq!(snapshots_b.len(), 2); assert!( diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index 3774e99e..571ee185 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -26,22 +26,26 @@ fn test_versioned_store_new() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_versioned_store_basic_operations() -> anyhow::Result<()> { +#[tokio::test] +async fn test_versioned_store_basic_operations() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); let mut store = VersionedKVStore::new(&file_path, 4096, None)?; // Test insert with version tracking - let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + let v1 = store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; assert_eq!(v1, 2); // First write is version 2 (base is 1) let retrieved = store.get("key1"); assert_eq!(retrieved, Some(&Value::String("value1".to_string()))); // Test overwrite - let v2 = store.insert("key1".to_string(), Value::String("value2".to_string()))?; + let v2 = store + .insert("key1".to_string(), Value::String("value2".to_string())) + .await?; assert_eq!(v2, 3); // Second write is version 3 assert!(v2 > v1); @@ -62,22 +66,26 @@ fn test_versioned_store_basic_operations() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_versioned_store_remove() -> anyhow::Result<()> { +#[tokio::test] +async fn test_versioned_store_remove() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); let mut store = VersionedKVStore::new(&file_path, 4096, None)?; // Insert some values - let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; - let v2 = store.insert("key2".to_string(), Value::String("value2".to_string()))?; + let v1 = store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + let v2 = store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; assert_eq!(store.len(), 2); assert!(v2 > v1); // Remove a key - let v3 = store.remove("key1")?; + let v3 = store.remove("key1").await?; assert!(v3.is_some()); assert!(v3.unwrap() > v2); @@ -86,14 +94,14 @@ fn test_versioned_store_remove() -> anyhow::Result<()> { assert!(store.contains_key("key2")); // Remove non-existent key - let removed = store.remove("nonexistent")?; + let removed = store.remove("nonexistent").await?; assert!(removed.is_none()); Ok(()) } -#[test] -fn test_persistence_and_reload() -> anyhow::Result<()> { +#[tokio::test] +async fn test_persistence_and_reload() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); @@ -102,8 +110,10 @@ fn test_persistence_and_reload() -> anyhow::Result<()> { // Create store and write some data { let mut store = VersionedKVStore::new(&file_path, 4096, None)?; - let _v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; - v2 = store.insert("key2".to_string(), Value::Signed(42))?; + let _v1 = store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + v2 = store.insert("key2".to_string(), Value::Signed(42)).await?; 
store.sync()?; } @@ -124,27 +134,29 @@ fn test_persistence_and_reload() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_null_value_is_deletion() -> anyhow::Result<()> { +#[tokio::test] +async fn test_null_value_is_deletion() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); let mut store = VersionedKVStore::new(&file_path, 4096, None)?; // Insert a value - store.insert("key1".to_string(), Value::String("value1".to_string()))?; + store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; assert!(store.contains_key("key1")); // Insert null to delete - store.insert("key1".to_string(), Value::Null)?; + store.insert("key1".to_string(), Value::Null).await?; assert!(!store.contains_key("key1")); assert_eq!(store.len(), 0); Ok(()) } -#[test] -fn test_rotation_callback() -> anyhow::Result<()> { +#[tokio::test] +async fn test_rotation_callback() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); @@ -165,7 +177,7 @@ fn test_rotation_callback() -> anyhow::Result<()> { for i in 0 .. 100 { let key = format!("key{}", i); let value = Value::String(format!("value_{}_with_some_extra_padding", i)); - last_version = store.insert(key, value)?; + last_version = store.insert(key, value).await?; // Rotation happens automatically inside insert when high water mark is triggered let data = callback_data.lock().unwrap(); @@ -186,20 +198,24 @@ fn test_rotation_callback() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_manual_rotation() -> anyhow::Result<()> { +#[tokio::test] +async fn test_manual_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); let mut store = VersionedKVStore::new(&file_path, 4096, None)?; // Insert some data - let _v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; - let v2 = store.insert("key2".to_string(), Value::String("value2".to_string()))?; + let _v1 = store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + let v2 = store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; // Manually trigger rotation let rotation_version = store.current_version(); - store.rotate_journal()?; + store.rotate_journal().await?; // Verify archived file exists (compressed) let archived_path = temp_dir @@ -208,7 +224,9 @@ fn test_manual_rotation() -> anyhow::Result<()> { assert!(archived_path.exists()); // Verify active journal still works - let v3 = store.insert("key3".to_string(), Value::String("value3".to_string()))?; + let v3 = store + .insert("key3".to_string(), Value::String("value3".to_string())) + .await?; assert!(v3 > v2); assert_eq!(store.len(), 3); @@ -232,24 +250,28 @@ fn test_manual_rotation() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_rotation_preserves_state() -> anyhow::Result<()> { +#[tokio::test] +async fn test_rotation_preserves_state() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); let mut store = VersionedKVStore::new(&file_path, 4096, None)?; // Create complex state - store.insert("key1".to_string(), Value::String("value1".to_string()))?; - store.insert("key2".to_string(), Value::Signed(42))?; - store.insert("key3".to_string(), Value::Bool(true))?; - store.insert("key4".to_string(), Value::Float(3.14159))?; + store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + store.insert("key2".to_string(), 
Value::Signed(42)).await?; + store.insert("key3".to_string(), Value::Bool(true)).await?; + store + .insert("key4".to_string(), Value::Float(3.14159)) + .await?; let pre_rotation_state = store.as_hashmap(); let pre_rotation_version = store.current_version(); // Rotate - store.rotate_journal()?; + store.rotate_journal().await?; // Verify state is preserved exactly let post_rotation_state = store.as_hashmap(); @@ -257,15 +279,17 @@ fn test_rotation_preserves_state() -> anyhow::Result<()> { assert_eq!(store.len(), 4); // Verify we can continue writing - let v_new = store.insert("key5".to_string(), Value::String("value5".to_string()))?; + let v_new = store + .insert("key5".to_string(), Value::String("value5".to_string())) + .await?; assert!(v_new > pre_rotation_version); assert_eq!(store.len(), 5); Ok(()) } -#[test] -fn test_empty_store_operations() -> anyhow::Result<()> { +#[tokio::test] +async fn test_empty_store_operations() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); @@ -274,15 +298,15 @@ fn test_empty_store_operations() -> anyhow::Result<()> { // Operations on empty store assert_eq!(store.get("nonexistent"), None); assert!(!store.contains_key("nonexistent")); - assert_eq!(store.remove("nonexistent")?, None); + assert_eq!(store.remove("nonexistent").await?, None); assert!(store.is_empty()); assert_eq!(store.len(), 0); Ok(()) } -#[test] -fn test_version_monotonicity() -> anyhow::Result<()> { +#[tokio::test] +async fn test_version_monotonicity() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); @@ -293,15 +317,20 @@ fn test_version_monotonicity() -> anyhow::Result<()> { // Perform various operations and ensure version always increases for i in 0 .. 20 { let op_version = if i % 3 == 0 { - store.insert(format!("key{}", i), Value::Signed(i as i64))? + store + .insert(format!("key{}", i), Value::Signed(i as i64)) + .await? } else if i % 3 == 1 { - store.insert( - format!("key{}", i / 3), - Value::String(format!("updated{}", i)), - )? + store + .insert( + format!("key{}", i / 3), + Value::String(format!("updated{}", i)), + ) + .await? } else { store - .remove(&format!("key{}", i / 3))? + .remove(&format!("key{}", i / 3)) + .await? 
.unwrap_or(last_version) }; @@ -317,8 +346,8 @@ fn test_version_monotonicity() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { +#[tokio::test] +async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); @@ -326,7 +355,9 @@ fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { let mut store = VersionedKVStore::new(&file_path, 2048, Some(0.5))?; // Insert some keys and capture their timestamps - store.insert("key1".to_string(), Value::String("value1".to_string()))?; + store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; #[allow(clippy::unwrap_used)] let ts1 = store .get_with_timestamp("key1") @@ -336,7 +367,9 @@ fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { // Small sleep to ensure different timestamps std::thread::sleep(std::time::Duration::from_millis(10)); - store.insert("key2".to_string(), Value::String("value2".to_string()))?; + store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; #[allow(clippy::unwrap_used)] let ts2 = store .get_with_timestamp("key2") @@ -349,7 +382,7 @@ fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { // Write enough data to trigger rotation for i in 0 .. 50 { - store.insert(format!("fill{i}"), Value::Signed(i))?; + store.insert(format!("fill{i}"), Value::Signed(i)).await?; } // Verify that after rotation, the original timestamps are preserved @@ -383,8 +416,8 @@ fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_compression_during_rotation() -> anyhow::Result<()> { +#[tokio::test] +async fn test_compression_during_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); @@ -392,9 +425,15 @@ fn test_compression_during_rotation() -> anyhow::Result<()> { // Insert some data let data = "x".repeat(1000); // Large value to make compression effective - store.insert("key1".to_string(), Value::String(data.clone()))?; - store.insert("key2".to_string(), Value::String(data.clone()))?; - store.insert("key3".to_string(), Value::String(data))?; + store + .insert("key1".to_string(), Value::String(data.clone())) + .await?; + store + .insert("key2".to_string(), Value::String(data.clone())) + .await?; + store + .insert("key3".to_string(), Value::String(data)) + .await?; // Get size of uncompressed journal before rotation let uncompressed_size = std::fs::metadata(&file_path)?.len(); @@ -403,7 +442,7 @@ fn test_compression_during_rotation() -> anyhow::Result<()> { let rotation_version = store.current_version(); // Trigger rotation - store.rotate_journal()?; + store.rotate_journal().await?; // Verify compressed archive exists let archived_path = temp_dir @@ -432,14 +471,16 @@ fn test_compression_during_rotation() -> anyhow::Result<()> { ); // Verify active journal still works - store.insert("key4".to_string(), Value::String("value4".to_string()))?; + store + .insert("key4".to_string(), Value::String("value4".to_string())) + .await?; assert_eq!(store.len(), 4); Ok(()) } -#[test] -fn test_compression_ratio() -> anyhow::Result<()> { +#[tokio::test] +async fn test_compression_ratio() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); @@ -448,16 +489,18 @@ fn test_compression_ratio() -> anyhow::Result<()> { // Insert highly 
compressible data let compressible_data = "A".repeat(500); for i in 0 .. 10 { - store.insert( - format!("key{}", i), - Value::String(compressible_data.clone()), - )?; + store + .insert( + format!("key{}", i), + Value::String(compressible_data.clone()), + ) + .await?; } let uncompressed_size = std::fs::metadata(&file_path)?.len(); let rotation_version = store.current_version(); - store.rotate_journal()?; + store.rotate_journal().await?; let archived_path = temp_dir .path() @@ -477,8 +520,8 @@ fn test_compression_ratio() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_multiple_rotations_with_compression() -> anyhow::Result<()> { +#[tokio::test] +async fn test_multiple_rotations_with_compression() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); @@ -490,9 +533,9 @@ fn test_multiple_rotations_with_compression() -> anyhow::Result<()> { for i in 0 .. 3 { let key = format!("key{}", i); let value = Value::String(format!("value{}", i)); - let version = store.insert(key, value)?; + let version = store.insert(key, value).await?; rotation_versions.push(version); - store.rotate_journal()?; + store.rotate_journal().await?; } // Verify all compressed archives exist @@ -508,8 +551,8 @@ fn test_multiple_rotations_with_compression() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_rotation_callback_receives_compressed_path() -> anyhow::Result<()> { +#[tokio::test] +async fn test_rotation_callback_receives_compressed_path() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); @@ -523,8 +566,10 @@ fn test_rotation_callback_receives_compressed_path() -> anyhow::Result<()> { *data = Some(old_path.to_path_buf()); })); - store.insert("key1".to_string(), Value::String("value1".to_string()))?; - store.rotate_journal()?; + store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + store.rotate_journal().await?; // Verify callback received compressed path let data = callback_data.lock().unwrap(); diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index c92963c9..87554afc 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -10,16 +10,22 @@ use crate::versioned_recovery::VersionedRecovery; use bd_bonjson::Value; use tempfile::TempDir; -#[test] -fn test_recovery_single_journal() -> anyhow::Result<()> { +#[tokio::test] +async fn test_recovery_single_journal() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); // Create a store and write some versioned data let mut store = VersionedKVStore::new(&file_path, 4096, None)?; - let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; - let v2 = store.insert("key2".to_string(), Value::String("value2".to_string()))?; - let v3 = store.insert("key1".to_string(), Value::String("updated1".to_string()))?; + let v1 = store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + let v2 = store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; + let v3 = store + .insert("key1".to_string(), Value::String("updated1".to_string())) + .await?; store.sync()?; // Read the journal data @@ -75,16 +81,20 @@ fn test_recovery_single_journal() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_recovery_with_deletions() -> anyhow::Result<()> { +#[tokio::test] +async fn 
test_recovery_with_deletions() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); // Create a store with deletions let mut store = VersionedKVStore::new(&file_path, 4096, None)?; - let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; - let v2 = store.insert("key2".to_string(), Value::String("value2".to_string()))?; - let v3_opt = store.remove("key1")?; + let v1 = store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + let v2 = store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; + let v3_opt = store.remove("key1").await?; assert!(v3_opt.is_some()); #[allow(clippy::unwrap_used)] let v3 = v3_opt.unwrap(); @@ -112,8 +122,8 @@ fn test_recovery_with_deletions() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { +#[tokio::test] +async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); @@ -121,21 +131,27 @@ fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { let mut store = VersionedKVStore::new(&file_path, 2048, None)?; // Write data that will trigger rotation - let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; - store.insert("key2".to_string(), Value::String("value2".to_string()))?; + let v1 = store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; // Write more data to trigger rotation for i in 0 .. 20 { - store.insert(format!("key{i}"), Value::Signed(i))?; + store.insert(format!("key{i}"), Value::Signed(i)).await?; } let v_middle = store.current_version(); // Write more after rotation - let v_final = store.insert( - "final".to_string(), - Value::String("final_value".to_string()), - )?; + let v_final = store + .insert( + "final".to_string(), + Value::String("final_value".to_string()), + ) + .await?; store.sync()?; // Read all journal files @@ -213,15 +229,21 @@ fn test_recovery_empty_journal() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_recovery_version_range() -> anyhow::Result<()> { +#[tokio::test] +async fn test_recovery_version_range() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); let mut store = VersionedKVStore::new(&file_path, 4096, None)?; - store.insert("key1".to_string(), Value::String("value1".to_string()))?; - store.insert("key2".to_string(), Value::String("value2".to_string()))?; - let v3 = store.insert("key3".to_string(), Value::String("value3".to_string()))?; + store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; + let v3 = store + .insert("key3".to_string(), Value::String("value3".to_string())) + .await?; store.sync()?; let journal_data = std::fs::read(&file_path)?; @@ -237,15 +259,15 @@ fn test_recovery_version_range() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_recovery_with_overwrites() -> anyhow::Result<()> { +#[tokio::test] +async fn test_recovery_with_overwrites() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); let mut store = VersionedKVStore::new(&file_path, 4096, None)?; - let v1 = store.insert("key".to_string(), Value::Signed(1))?; - let v2 = 
store.insert("key".to_string(), Value::Signed(2))?; - let v3 = store.insert("key".to_string(), Value::Signed(3))?; + let v1 = store.insert("key".to_string(), Value::Signed(1)).await?; + let v2 = store.insert("key".to_string(), Value::Signed(2)).await?; + let v3 = store.insert("key".to_string(), Value::Signed(3)).await?; store.sync()?; let journal_data = std::fs::read(&file_path)?; @@ -273,16 +295,22 @@ fn test_recovery_with_overwrites() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_recovery_various_value_types() -> anyhow::Result<()> { +#[tokio::test] +async fn test_recovery_various_value_types() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); let mut store = VersionedKVStore::new(&file_path, 4096, None)?; - store.insert("string".to_string(), Value::String("hello".to_string()))?; - store.insert("number".to_string(), Value::Signed(42))?; - store.insert("float".to_string(), Value::Float(3.14))?; - store.insert("bool".to_string(), Value::Bool(true))?; + store + .insert("string".to_string(), Value::String("hello".to_string())) + .await?; + store + .insert("number".to_string(), Value::Signed(42)) + .await?; + store + .insert("float".to_string(), Value::Float(3.14)) + .await?; + store.insert("bool".to_string(), Value::Bool(true)).await?; let v_final = store.current_version(); store.sync()?; @@ -311,25 +339,31 @@ fn test_recovery_various_value_types() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { +#[tokio::test] +async fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); // Create a store and write some data let mut store = VersionedKVStore::new(&file_path, 4096, None)?; - let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; - let v2 = store.insert("key2".to_string(), Value::String("value2".to_string()))?; + let v1 = store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + let v2 = store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; store.sync()?; // Get the current version before rotation (this will be used in the archive name) let archive_version = store.current_version(); // Rotate to create compressed archive - store.rotate_journal()?; + store.rotate_journal().await?; // Add more data to active journal - let v3 = store.insert("key3".to_string(), Value::String("value3".to_string()))?; + let v3 = store + .insert("key3".to_string(), Value::String("value3".to_string())) + .await?; store.sync()?; // Find the compressed archive (using the version at the time of rotation) @@ -376,22 +410,28 @@ fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_recovery_from_multiple_compressed_archives() -> anyhow::Result<()> { +#[tokio::test] +async fn test_recovery_from_multiple_compressed_archives() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); // Create a store and perform multiple rotations let mut store = VersionedKVStore::new(&file_path, 4096, None)?; - let v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + let v1 = store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; let archive1_version = store.current_version(); - store.rotate_journal()?; + store.rotate_journal().await?; - let v2 = store.insert("key2".to_string(), 
Value::String("value2".to_string()))?; + let v2 = store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; let archive2_version = store.current_version(); - store.rotate_journal()?; + store.rotate_journal().await?; - let v3 = store.insert("key3".to_string(), Value::String("value3".to_string()))?; + let v3 = store + .insert("key3".to_string(), Value::String("value3".to_string())) + .await?; store.sync()?; // Collect all journal data (2 compressed + 1 active) @@ -428,17 +468,19 @@ fn test_recovery_from_multiple_compressed_archives() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> { +#[tokio::test] +async fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); // Create initial store and archive (will be compressed) let mut store = VersionedKVStore::new(&file_path, 4096, None)?; - let _v1 = store.insert("key1".to_string(), Value::String("value1".to_string()))?; + let _v1 = store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; store.sync()?; let archive_version = store.current_version(); - store.rotate_journal()?; + store.rotate_journal().await?; // Get compressed archive let compressed_archive_path = temp_dir @@ -448,7 +490,9 @@ fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> { // Create uncompressed journal data manually let mut uncompressed_store = VersionedKVStore::new(&file_path, 4096, None)?; - let v2 = uncompressed_store.insert("key2".to_string(), Value::String("value2".to_string()))?; + let v2 = uncompressed_store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; uncompressed_store.sync()?; let uncompressed_data = std::fs::read(&file_path)?; @@ -463,15 +507,17 @@ fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_recovery_decompression_transparent() -> anyhow::Result<()> { +#[tokio::test] +async fn test_recovery_decompression_transparent() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let file_path = temp_dir.path().join("test.jrn"); // Create store with compressible data let mut store = VersionedKVStore::new(&file_path, 4096, None)?; let compressible = "A".repeat(500); - let v1 = store.insert("data".to_string(), Value::String(compressible.clone()))?; + let v1 = store + .insert("data".to_string(), Value::String(compressible.clone())) + .await?; store.sync()?; // Create uncompressed recovery baseline @@ -483,7 +529,7 @@ fn test_recovery_decompression_transparent() -> anyhow::Result<()> { let archive_version = store.current_version(); // Rotate to compress - store.rotate_journal()?; + store.rotate_journal().await?; // Read compressed archive let compressed_path = temp_dir diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index bcfc4e93..1896c12b 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -25,14 +25,14 @@ use std::path::{Path, PathBuf}; pub type RotationCallback = Box; /// Compress an archived journal using zlib. 
-fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result<()> { - let journal_bytes = std::fs::read(source)?; +async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result<()> { + let journal_bytes = tokio::fs::read(source).await?; let mut encoder = ZlibEncoder::new(Vec::new(), Compression::new(3)); encoder.write_all(&journal_bytes)?; let compressed = encoder.finish()?; - std::fs::write(dest, compressed)?; + tokio::fs::write(dest, compressed).await?; Ok(()) } @@ -192,7 +192,7 @@ impl VersionedKVStore { /// /// # Errors /// Returns an error if the value cannot be written to the journal. - pub fn insert(&mut self, key: String, value: Value) -> anyhow::Result { + pub async fn insert(&mut self, key: String, value: Value) -> anyhow::Result { let version = if matches!(value, Value::Null) { // Inserting null is equivalent to deletion let (version, _timestamp) = self.journal.delete_versioned(&key)?; @@ -208,7 +208,7 @@ impl VersionedKVStore { // Check if rotation is needed if self.journal.is_high_water_mark_triggered() { - self.rotate_journal()?; + self.rotate_journal().await?; } Ok(version) @@ -220,7 +220,7 @@ impl VersionedKVStore { /// /// # Errors /// Returns an error if the deletion cannot be written to the journal. - pub fn remove(&mut self, key: &str) -> anyhow::Result> { + pub async fn remove(&mut self, key: &str) -> anyhow::Result> { if !self.cached_map.contains_key(key) { return Ok(None); } @@ -230,7 +230,7 @@ impl VersionedKVStore { // Check if rotation is needed if self.journal.is_high_water_mark_triggered() { - self.rotate_journal()?; + self.rotate_journal().await?; } Ok(Some(version)) @@ -325,14 +325,14 @@ impl VersionedKVStore { /// /// # Errors /// Returns an error if rotation fails. - pub fn rotate_journal(&mut self) -> anyhow::Result<()> { + pub async fn rotate_journal(&mut self) -> anyhow::Result<()> { let rotation_version = self.journal.current_version(); // Generate archived journal path with rotation version (compressed) let archived_path = self.generate_archived_path(rotation_version); // Create new journal with rotated state - let new_journal = self.create_rotated_journal(rotation_version)?; + let new_journal = self.create_rotated_journal(rotation_version).await?; // Replace old journal with new one let old_journal = std::mem::replace(&mut self.journal, new_journal); @@ -340,17 +340,17 @@ impl VersionedKVStore { // Move old journal to temporary location drop(old_journal); // Release mmap before moving file let temp_uncompressed = self.base_path.with_extension("jrn.old"); - std::fs::rename(&self.base_path, &temp_uncompressed)?; + tokio::fs::rename(&self.base_path, &temp_uncompressed).await?; // Rename new journal to base path let temp_path = self.base_path.with_extension("jrn.tmp"); - std::fs::rename(&temp_path, &self.base_path)?; + tokio::fs::rename(&temp_path, &self.base_path).await?; // Compress the archived journal - compress_archived_journal(&temp_uncompressed, &archived_path)?; + compress_archived_journal(&temp_uncompressed, &archived_path).await?; // Remove uncompressed version - std::fs::remove_file(&temp_uncompressed)?; + tokio::fs::remove_file(&temp_uncompressed).await?; // Invoke rotation callback if set if let Some(ref mut callback) = self.rotation_callback { @@ -372,7 +372,7 @@ impl VersionedKVStore { } /// Create a new rotated journal with compacted state. 
- fn create_rotated_journal( + async fn create_rotated_journal( &self, rotation_version: u64, ) -> anyhow::Result { @@ -391,7 +391,7 @@ impl VersionedKVStore { )?; // Write buffer to temporary file - std::fs::write(&temp_path, &buffer)?; + tokio::fs::write(&temp_path, &buffer).await?; // Open as memory-mapped journal MemMappedVersionedKVJournal::from_file(&temp_path, self.buffer_size, self.high_water_mark_ratio) diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_recovery.rs index 30cc806e..d99ea041 100644 --- a/bd-resilient-kv/src/versioned_recovery.rs +++ b/bd-resilient-kv/src/versioned_recovery.rs @@ -11,6 +11,7 @@ use bd_bonjson::Value; use bd_bonjson::decoder::from_slice; use flate2::read::ZlibDecoder; use std::io::Read; +use std::path::Path; /// A utility for recovering state at arbitrary versions from raw journal data. /// @@ -76,6 +77,28 @@ impl VersionedRecovery { }) } + /// Create a new recovery utility from journal file paths. + /// + /// This is an async convenience method that reads journal files from disk. + /// The journals should be provided in chronological order (oldest to newest). + /// + /// # Errors + /// + /// Returns an error if any file cannot be read or if any journal is invalid. + pub async fn from_files(journal_paths: Vec<&Path>) -> anyhow::Result { + let mut journal_data = Vec::new(); + + for path in journal_paths { + let data = tokio::fs::read(path).await?; + journal_data.push(data); + } + + // Convert Vec> to Vec<&[u8]> + let journal_slices: Vec<&[u8]> = journal_data.iter().map(Vec::as_slice).collect(); + + Self::new(journal_slices) + } + /// Recover the key-value state at a specific version. /// /// This method replays all journal entries from all provided journals up to and including From edee87cfcdf0ff253fde10df6f7dce20c062c45a Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 07:59:32 -0800 Subject: [PATCH 05/66] better compression, allows, + --- AGENTS.md | 5 + Cargo.lock | 1 + Cargo.toml | 1 + bd-resilient-kv/Cargo.toml | 1 + bd-resilient-kv/src/kv_journal/memmapped.rs | 15 +- .../src/kv_journal/memmapped_versioned.rs | 8 +- bd-resilient-kv/src/tests/kv_store_test.rs | 32 +- .../src/tests/versioned_kv_store_test.rs | 73 +++-- .../src/tests/versioned_recovery_test.rs | 279 ++++++++++++++---- bd-resilient-kv/src/versioned_kv_store.rs | 108 ++++--- bd-resilient-kv/src/versioned_recovery.rs | 33 ++- 11 files changed, 356 insertions(+), 200 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 8bbfe00a..d03ecfee 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -30,6 +30,11 @@ ``` 4. Tests in the same file as the implementation code should be avoided 5. Test names should *not* start with `test_`, as this is redundant +6. Use module-level clippy allow blocks instead of per-test allows: + ```rust + #![allow(clippy::unwrap_used)] + ``` + This should be placed at the top of the test file, after the license header and before imports. 
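For illustration, this is roughly what a test file header looks like under that convention, mirroring how the versioned store tests in this patch are laid out. The license comment is abbreviated and the specific test is a placeholder, not code taken from the patch:

```rust
// (license header comes first, as in the other test files)

#![allow(clippy::unwrap_used)]

use crate::VersionedKVStore;
use tempfile::TempDir;

#[test]
fn store_starts_empty() {
    // Individual tests can use .unwrap() freely without per-test allow attributes.
    let temp_dir = TempDir::new().unwrap();
    let store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None).unwrap();
    assert!(store.is_empty());
}
```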
## Code Quality Checks - After generating or modifying code, always run clippy to check for static lint violations: diff --git a/Cargo.lock b/Cargo.lock index 5acb1e4f..d4414c47 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1104,6 +1104,7 @@ version = "1.0.0" dependencies = [ "ahash", "anyhow", + "async-compression", "bd-bonjson", "bd-client-common", "bd-workspace-hack", diff --git a/Cargo.toml b/Cargo.toml index de8de5e7..ed36fca4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,6 +62,7 @@ arbitrary = { version = "1.4.2", features = ["derive"] } arc-swap = "1.7.1" assert_matches = "1.5.0" assert_no_alloc = "1.1.2" +async-compression = { version = "0.4.20", features = ["tokio", "zlib"] } async-trait = "0.1.89" axum = { version = "0.8.6", features = ["http2", "macros"] } axum-server = { version = "0.7.2", features = ["tls-rustls-no-provider"] } diff --git a/bd-resilient-kv/Cargo.toml b/bd-resilient-kv/Cargo.toml index 5b52539c..0aa6644c 100644 --- a/bd-resilient-kv/Cargo.toml +++ b/bd-resilient-kv/Cargo.toml @@ -14,6 +14,7 @@ tempfile.workspace = true [dependencies] ahash.workspace = true anyhow.workspace = true +async-compression.workspace = true bd-bonjson = { path = "../bd-bonjson" } bd-client-common = { path = "../bd-client-common" } bd-workspace-hack.workspace = true diff --git a/bd-resilient-kv/src/kv_journal/memmapped.rs b/bd-resilient-kv/src/kv_journal/memmapped.rs index 6af76376..4c7cff07 100644 --- a/bd-resilient-kv/src/kv_journal/memmapped.rs +++ b/bd-resilient-kv/src/kv_journal/memmapped.rs @@ -118,19 +118,6 @@ impl MemMappedKVJournal { Ok(Self { mmap, in_memory_kv }) } - /// Synchronize changes to disk. - /// - /// This forces any changes in the memory-mapped region to be written to the underlying file. - /// Note that changes are typically synced automatically by the OS, but this provides - /// explicit control when needed. - /// - /// # Errors - /// Returns an error if the sync operation fails. - pub fn sync(&self) -> anyhow::Result<()> { - self.mmap.flush()?; - Ok(()) - } - /// Get the size of the underlying file in bytes. #[must_use] pub fn file_size(&self) -> usize { @@ -245,6 +232,6 @@ impl KVJournal for MemMappedKVJournal { /// # Errors /// Returns an error if the sync operation fails. fn sync(&self) -> anyhow::Result<()> { - self.sync() + self.mmap.flush().map_err(Into::into) } } diff --git a/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs b/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs index f6ddc09a..dad77e2c 100644 --- a/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs +++ b/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs @@ -205,15 +205,17 @@ impl MemMappedVersionedKVJournal { /// Synchronize changes to disk. /// - /// This forces any changes in the memory-mapped region to be written to the underlying file. + /// This method explicitly flushes any pending changes to the underlying file. /// Note that changes are typically synced automatically by the OS, but this provides /// explicit control when needed. /// + /// This is a blocking operation that performs synchronous I/O (`msync()` system call). + /// In async contexts, consider wrapping this call with `tokio::task::spawn_blocking`. + /// /// # Errors /// Returns an error if the sync operation fails. pub fn sync(&self) -> anyhow::Result<()> { - self.mmap.flush()?; - Ok(()) + self.mmap.flush().map_err(Into::into) } /// Get the size of the underlying file in bytes. 
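Since `sync()` remains a blocking, `msync()`-backed call even after this async migration, a caller on a tokio runtime can follow the doc comment's advice and push it onto the blocking pool. A minimal sketch, assuming the crate re-exports `VersionedKVStore` at its root, that the store's `sync()` takes `&self` as the journal's does, and that the store type is `Send + 'static`; the helper name is hypothetical and not part of this patch:

```rust
use bd_resilient_kv::VersionedKVStore;

/// Flush the store on tokio's blocking thread pool so the async runtime is
/// not stalled by the underlying `msync()` call, then hand the store back.
async fn sync_off_runtime(store: VersionedKVStore) -> anyhow::Result<VersionedKVStore> {
    // `spawn_blocking(...).await` yields Result<_, JoinError>; `?` converts a
    // JoinError into anyhow::Error, leaving the closure's anyhow::Result as the
    // function's return value.
    tokio::task::spawn_blocking(move || -> anyhow::Result<VersionedKVStore> {
        store.sync()?;
        Ok(store)
    })
    .await?
}
```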
diff --git a/bd-resilient-kv/src/tests/kv_store_test.rs b/bd-resilient-kv/src/tests/kv_store_test.rs index bad5bcb0..3b01d3a8 100644 --- a/bd-resilient-kv/src/tests/kv_store_test.rs +++ b/bd-resilient-kv/src/tests/kv_store_test.rs @@ -286,13 +286,13 @@ fn test_kv_store_persistence() -> anyhow::Result<()> { } #[test] -fn test_kv_store_file_resizing() -> anyhow::Result<()> { +fn test_kv_store_constructor_cache_coherency_with_file_resize() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let base_path = temp_dir.path().join("test_store"); - // Create store with small size + // Create store with small buffer and add data { - let mut store = KVStore::new(&base_path, 1024, None)?; + let mut store = KVStore::new(&base_path, 512, None)?; store.insert("key1".to_string(), Value::String("value1".to_string()))?; store.sync()?; } @@ -552,32 +552,6 @@ fn test_kv_store_constructor_cache_coherency_with_existing_data() -> anyhow::Res Ok(()) } -#[test] -fn test_kv_store_constructor_cache_coherency_with_file_resize() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - let base_path = temp_dir.path().join("test_store"); - - // Create store with small buffer and add data - { - let mut store = KVStore::new(&base_path, 512, None)?; - store.insert("key1".to_string(), Value::String("value1".to_string()))?; - store.sync()?; - } - - // Re-open with larger buffer - cache should be coherent with existing data - let store = KVStore::new(&base_path, 4096, None)?; - - // Verify cache is coherent after file resize - assert_eq!(store.len(), 1); - assert!(store.contains_key("key1")); - assert_eq!( - store.get("key1"), - Some(&Value::String("value1".to_string())) - ); - - Ok(()) -} - #[test] fn test_kv_store_constructor_cache_coherency_with_corrupted_data() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index 571ee185..f040da91 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -5,6 +5,8 @@ // LICENSE file or at: // https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt +#![allow(clippy::unwrap_used)] + use crate::VersionedKVStore; use bd_bonjson::Value; use std::sync::{Arc, Mutex}; @@ -13,9 +15,8 @@ use tempfile::TempDir; #[test] fn test_versioned_store_new() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let store = VersionedKVStore::new(&file_path, 4096, None)?; + let store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Should start empty assert!(store.is_empty()); @@ -29,9 +30,8 @@ fn test_versioned_store_new() -> anyhow::Result<()> { #[tokio::test] async fn test_versioned_store_basic_operations() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Test insert with version tracking let v1 = store @@ -69,9 +69,9 @@ async fn test_versioned_store_basic_operations() -> anyhow::Result<()> { #[tokio::test] async fn test_versioned_store_remove() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // 
Insert some values let v1 = store @@ -103,13 +103,13 @@ async fn test_versioned_store_remove() -> anyhow::Result<()> { #[tokio::test] async fn test_persistence_and_reload() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); + let v2; // Create store and write some data { - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; let _v1 = store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; @@ -119,7 +119,7 @@ async fn test_persistence_and_reload() -> anyhow::Result<()> { // Reopen and verify data persisted { - let store = VersionedKVStore::open_existing(&file_path, 4096, None)?; + let store = VersionedKVStore::open_existing(temp_dir.path(), "test", 4096, None)?; assert_eq!(store.len(), 2); assert_eq!( store.get("key1"), @@ -137,9 +137,9 @@ async fn test_persistence_and_reload() -> anyhow::Result<()> { #[tokio::test] async fn test_null_value_is_deletion() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Insert a value store @@ -158,10 +158,10 @@ async fn test_null_value_is_deletion() -> anyhow::Result<()> { #[tokio::test] async fn test_rotation_callback() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); + // Use a small buffer and low high water mark to trigger rotation easily - let mut store = VersionedKVStore::new(&file_path, 1024, Some(0.3))?; + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 1024, Some(0.3))?; // Set up callback to track rotation events let callback_data = Arc::new(Mutex::new(Vec::new())); @@ -192,7 +192,7 @@ async fn test_rotation_callback() -> anyhow::Result<()> { let (old_path, new_path, rotation_version) = &data[0]; assert!(old_path.to_string_lossy().contains(".v")); - assert_eq!(new_path, &file_path); + assert_eq!(new_path, &temp_dir.path().join("test.jrn")); assert!(*rotation_version <= last_version); Ok(()) @@ -201,9 +201,9 @@ async fn test_rotation_callback() -> anyhow::Result<()> { #[tokio::test] async fn test_manual_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Insert some data let _v1 = store @@ -253,9 +253,9 @@ async fn test_manual_rotation() -> anyhow::Result<()> { #[tokio::test] async fn test_rotation_preserves_state() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Create complex state store @@ -291,9 +291,9 @@ async fn test_rotation_preserves_state() -> anyhow::Result<()> { #[tokio::test] async fn test_empty_store_operations() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Operations on empty store assert_eq!(store.get("nonexistent"), None); @@ -308,9 +308,9 @@ async fn 
test_empty_store_operations() -> anyhow::Result<()> { #[tokio::test] async fn test_version_monotonicity() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; let mut last_version = store.current_version(); @@ -349,16 +349,15 @@ async fn test_version_monotonicity() -> anyhow::Result<()> { #[tokio::test] async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); + // Create store with small buffer to trigger rotation easily - let mut store = VersionedKVStore::new(&file_path, 2048, Some(0.5))?; + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 2048, Some(0.5))?; // Insert some keys and capture their timestamps store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; - #[allow(clippy::unwrap_used)] let ts1 = store .get_with_timestamp("key1") .map(|tv| tv.timestamp) @@ -370,7 +369,6 @@ async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { store .insert("key2".to_string(), Value::String("value2".to_string())) .await?; - #[allow(clippy::unwrap_used)] let ts2 = store .get_with_timestamp("key2") .map(|tv| tv.timestamp) @@ -386,13 +384,11 @@ async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { } // Verify that after rotation, the original timestamps are preserved - #[allow(clippy::unwrap_used)] let ts1_after = store .get_with_timestamp("key1") .map(|tv| tv.timestamp) .unwrap(); - #[allow(clippy::unwrap_used)] let ts2_after = store .get_with_timestamp("key2") .map(|tv| tv.timestamp) @@ -419,9 +415,9 @@ async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { #[tokio::test] async fn test_compression_during_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Insert some data let data = "x".repeat(1000); // Large value to make compression effective @@ -436,7 +432,7 @@ async fn test_compression_during_rotation() -> anyhow::Result<()> { .await?; // Get size of uncompressed journal before rotation - let uncompressed_size = std::fs::metadata(&file_path)?.len(); + let uncompressed_size = std::fs::metadata(temp_dir.path().join("test.jrn"))?.len(); // Get current version before rotation (this is what will be used in the archive name) let rotation_version = store.current_version(); @@ -482,9 +478,9 @@ async fn test_compression_during_rotation() -> anyhow::Result<()> { #[tokio::test] async fn test_compression_ratio() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let mut store = VersionedKVStore::new(&file_path, 8192, None)?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 8192, None)?; // Insert highly compressible data let compressible_data = "A".repeat(500); @@ -497,7 +493,7 @@ async fn test_compression_ratio() -> anyhow::Result<()> { .await?; } - let uncompressed_size = std::fs::metadata(&file_path)?.len(); + let uncompressed_size = std::fs::metadata(temp_dir.path().join("test.jrn"))?.len(); let rotation_version = store.current_version(); store.rotate_journal().await?; @@ -523,9 +519,9 @@ async fn 
test_compression_ratio() -> anyhow::Result<()> { #[tokio::test] async fn test_multiple_rotations_with_compression() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; let mut rotation_versions = Vec::new(); @@ -554,9 +550,9 @@ async fn test_multiple_rotations_with_compression() -> anyhow::Result<()> { #[tokio::test] async fn test_rotation_callback_receives_compressed_path() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; let callback_data = Arc::new(Mutex::new(None)); let callback_data_clone = Arc::clone(&callback_data); @@ -573,7 +569,6 @@ async fn test_rotation_callback_receives_compressed_path() -> anyhow::Result<()> // Verify callback received compressed path let data = callback_data.lock().unwrap(); - #[allow(clippy::unwrap_used)] let archived_path = data.as_ref().unwrap(); assert!( diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index 87554afc..08a7a0ff 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -5,6 +5,8 @@ // LICENSE file or at: // https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt +#![allow(clippy::unwrap_used)] + use crate::VersionedKVStore; use crate::versioned_recovery::VersionedRecovery; use bd_bonjson::Value; @@ -13,10 +15,10 @@ use tempfile::TempDir; #[tokio::test] async fn test_recovery_single_journal() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); + // Create a store and write some versioned data - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; let v1 = store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; @@ -29,7 +31,7 @@ async fn test_recovery_single_journal() -> anyhow::Result<()> { store.sync()?; // Read the journal data - let journal_data = std::fs::read(&file_path)?; + let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; // Create recovery utility let recovery = VersionedRecovery::new(vec![&journal_data])?; @@ -37,7 +39,6 @@ async fn test_recovery_single_journal() -> anyhow::Result<()> { // Verify version range let version_range = recovery.version_range(); assert!(version_range.is_some()); - #[allow(clippy::unwrap_used)] let (min, max) = version_range.unwrap(); assert_eq!(min, 1); assert_eq!(max, v3); @@ -81,54 +82,209 @@ async fn test_recovery_single_journal() -> anyhow::Result<()> { Ok(()) } +// Tests for compression detection heuristic + #[tokio::test] -async fn test_recovery_with_deletions() -> anyhow::Result<()> { +async fn test_detection_uncompressed_format_v1() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - // Create a store with deletions - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; - let v1 = store + + // Create a v1 format journal (though VersionedKVStore creates v2, we test the detection) + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + store 
.insert("key1".to_string(), Value::String("value1".to_string())) .await?; - let v2 = store - .insert("key2".to_string(), Value::String("value2".to_string())) - .await?; - let v3_opt = store.remove("key1").await?; - assert!(v3_opt.is_some()); - #[allow(clippy::unwrap_used)] - let v3 = v3_opt.unwrap(); store.sync()?; - // Read the journal data - let journal_data = std::fs::read(&file_path)?; + let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + + // Verify first 8 bytes contain format version 2 + let version_bytes: [u8; 8] = journal_data[0 .. 8].try_into()?; + let version = u64::from_le_bytes(version_bytes); + assert_eq!(version, 2, "VersionedKVStore should create v2 format"); + + // Should successfully detect as uncompressed let recovery = VersionedRecovery::new(vec![&journal_data])?; + let state = recovery.recover_current()?; + assert_eq!(state.len(), 1); - // At v1: key1 exists - let state_v1 = recovery.recover_at_version(v1)?; - assert_eq!(state_v1.len(), 1); - assert!(state_v1.contains_key("key1")); + Ok(()) +} - // At v2: both keys exist - let state_v2 = recovery.recover_at_version(v2)?; - assert_eq!(state_v2.len(), 2); +#[tokio::test] +async fn test_detection_compressed_journal() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; - // At v3: only key2 exists (key1 deleted) - let state_v3 = recovery.recover_at_version(v3)?; - assert_eq!(state_v3.len(), 1); - assert!(!state_v3.contains_key("key1")); - assert!(state_v3.contains_key("key2")); + + // Create and rotate to get compressed archive + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + let archive_version = store.current_version(); + store.rotate_journal().await?; + + let archived_path = temp_dir + .path() + .join(format!("test.jrn.v{}.zz", archive_version)); + let compressed_data = std::fs::read(&archived_path)?; + + // Verify it starts with zlib magic bytes (0x78) + assert_eq!( + compressed_data[0], 0x78, + "Compressed data should start with zlib magic byte" + ); + + // Should successfully detect and decompress + let recovery = VersionedRecovery::new(vec![&compressed_data])?; + let state = recovery.recover_current()?; + assert_eq!(state.len(), 1); + assert_eq!( + state.get("key1").map(|tv| &tv.value), + Some(&Value::String("value1".to_string())) + ); + + Ok(()) +} + +#[test] +fn test_detection_invalid_format_version() { + // Create data with invalid format version (e.g., 999) + let mut invalid_data = vec![0u8; 32]; + let version_bytes = 999u64.to_le_bytes(); + invalid_data[0 .. 
8].copy_from_slice(&version_bytes); + + // Should fail with clear error message about invalid version + let result = VersionedRecovery::new(vec![&invalid_data]); + assert!(result.is_err()); + let err_msg = result.unwrap_err().to_string(); + assert!( + err_msg.contains("Invalid journal format version"), + "Expected error about invalid version, got: {err_msg}" + ); +} + +#[test] +fn test_detection_data_too_small() { + // Data smaller than header size (16 bytes) + let small_data = vec![0u8; 8]; + + let result = VersionedRecovery::new(vec![&small_data]); + assert!(result.is_err()); + let err_msg = result.unwrap_err().to_string(); + assert!( + err_msg.contains("Data too small"), + "Expected error about data too small, got: {err_msg}" + ); +} + +#[test] +fn test_detection_empty_data() { + let empty_data = vec![]; + + let result = VersionedRecovery::new(vec![&empty_data]); + assert!(result.is_err()); + let err_msg = result.unwrap_err().to_string(); + assert!( + err_msg.contains("Data too small"), + "Expected error about data too small, got: {err_msg}" + ); +} + +#[test] +fn test_detection_corrupted_zlib_header() { + // Create data that looks like zlib (starts with 0x78) but is invalid + let mut fake_zlib = vec![0x78, 0x9C]; // Valid zlib magic bytes + fake_zlib.extend_from_slice(&[0xFF; 100]); // But garbage data + + let result = VersionedRecovery::new(vec![&fake_zlib]); + assert!(result.is_err()); + // Should fail during decompression +} + +#[test] +fn test_detection_random_garbage() { + // Random data that doesn't match any valid format + let garbage = vec![0xAB, 0xCD, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x90]; + + let result = VersionedRecovery::new(vec![&garbage]); + assert!(result.is_err()); + let err_msg = result.unwrap_err().to_string(); + // Should try to decompress it and fail + assert!(err_msg.contains("Data too small") || err_msg.contains("corrupt")); +} + +#[tokio::test] +async fn test_detection_mixed_valid_and_invalid() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + + + // Create valid journal + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + store.sync()?; + + let valid_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + + // Create invalid data + let mut invalid_data = vec![0u8; 32]; + let version_bytes = 999u64.to_le_bytes(); + invalid_data[0 .. 8].copy_from_slice(&version_bytes); + + // Should fail if any journal is invalid + let result = VersionedRecovery::new(vec![&valid_data, &invalid_data]); + assert!(result.is_err()); Ok(()) } +#[test] +fn test_detection_all_zlib_compression_levels() { + use flate2::Compression; + use flate2::write::ZlibEncoder; + use std::io::Write; + + // Create some uncompressed journal-like data + let mut uncompressed = vec![0u8; 64]; + // Version 2 + uncompressed[0 .. 8].copy_from_slice(&2u64.to_le_bytes()); + // Position at end + uncompressed[8 .. 16].copy_from_slice(&64u64.to_le_bytes()); + // Some data + uncompressed[16 .. 
32].copy_from_slice(b"[{\"base_version\""); + + // Test different compression levels + for level in [ + Compression::none(), + Compression::fast(), + Compression::default(), + Compression::best(), + ] { + let mut encoder = ZlibEncoder::new(Vec::new(), level); + encoder.write_all(&uncompressed).unwrap(); + let compressed = encoder.finish().unwrap(); + + // Verify it starts with 0x78 + assert_eq!(compressed[0], 0x78); + + // Should be able to detect and decompress + let result = VersionedRecovery::new(vec![&compressed]); + // May succeed or fail depending on whether the data is valid bonjson, + // but should at least attempt decompression without panicking + let _ = result; + } +} + + #[tokio::test] async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); + // Create a store with larger buffer to avoid BufferFull errors during test - let mut store = VersionedKVStore::new(&file_path, 2048, None)?; + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 2048, None)?; // Write data that will trigger rotation let v1 = store @@ -175,7 +331,7 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { } // Read active journal - all_journals.push(std::fs::read(&file_path)?); + all_journals.push(std::fs::read(temp_dir.path().join("test.jrn"))?); // Create recovery utility with all journals let journal_refs: Vec<&[u8]> = all_journals.iter().map(std::vec::Vec::as_slice).collect(); @@ -203,22 +359,21 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_recovery_empty_journal() -> anyhow::Result<()> { +#[tokio::test] +async fn test_recovery_empty_journal() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); + // Create an empty store - let store = VersionedKVStore::new(&file_path, 4096, None)?; + let store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; store.sync()?; - let journal_data = std::fs::read(&file_path)?; + let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; let recovery = VersionedRecovery::new(vec![&journal_data])?; // Should have version range starting at 1 let version_range = recovery.version_range(); assert!(version_range.is_some()); - #[allow(clippy::unwrap_used)] let (min, _max) = version_range.unwrap(); assert_eq!(min, 1); @@ -232,9 +387,9 @@ fn test_recovery_empty_journal() -> anyhow::Result<()> { #[tokio::test] async fn test_recovery_version_range() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; @@ -246,12 +401,11 @@ async fn test_recovery_version_range() -> anyhow::Result<()> { .await?; store.sync()?; - let journal_data = std::fs::read(&file_path)?; + let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; let recovery = VersionedRecovery::new(vec![&journal_data])?; let version_range = recovery.version_range(); assert!(version_range.is_some()); - #[allow(clippy::unwrap_used)] let (min, max) = version_range.unwrap(); assert_eq!(min, 1); // base_version defaults to 1 for new stores assert_eq!(max, v3); @@ -262,15 +416,15 @@ async fn test_recovery_version_range() -> anyhow::Result<()> { #[tokio::test] async 
fn test_recovery_with_overwrites() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; let v1 = store.insert("key".to_string(), Value::Signed(1)).await?; let v2 = store.insert("key".to_string(), Value::Signed(2)).await?; let v3 = store.insert("key".to_string(), Value::Signed(3)).await?; store.sync()?; - let journal_data = std::fs::read(&file_path)?; + let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; let recovery = VersionedRecovery::new(vec![&journal_data])?; // Each version should show the value at that time @@ -298,9 +452,9 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { #[tokio::test] async fn test_recovery_various_value_types() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; store .insert("string".to_string(), Value::String("hello".to_string())) .await?; @@ -314,7 +468,7 @@ async fn test_recovery_various_value_types() -> anyhow::Result<()> { let v_final = store.current_version(); store.sync()?; - let journal_data = std::fs::read(&file_path)?; + let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; let recovery = VersionedRecovery::new(vec![&journal_data])?; let state = recovery.recover_at_version(v_final)?; @@ -342,10 +496,10 @@ async fn test_recovery_various_value_types() -> anyhow::Result<()> { #[tokio::test] async fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); + // Create a store and write some data - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; let v1 = store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; @@ -374,7 +528,7 @@ async fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { // Read both journals let compressed_data = std::fs::read(&archived_path)?; - let active_data = std::fs::read(&file_path)?; + let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; // Create recovery from both journals (compressed first, then active) let recovery = VersionedRecovery::new(vec![&compressed_data, &active_data])?; @@ -382,7 +536,6 @@ async fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { // Verify version range spans both journals let version_range = recovery.version_range(); assert!(version_range.is_some()); - #[allow(clippy::unwrap_used)] let (min, max) = version_range.unwrap(); assert_eq!(min, 1); assert_eq!(max, v3); @@ -413,10 +566,10 @@ async fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { #[tokio::test] async fn test_recovery_from_multiple_compressed_archives() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); + // Create a store and perform multiple rotations - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; let v1 = store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; @@ -444,7 +597,7 @@ async fn test_recovery_from_multiple_compressed_archives() -> 
anyhow::Result<()> let archive1_data = std::fs::read(&archive1_path)?; let archive2_data = std::fs::read(&archive2_path)?; - let active_data = std::fs::read(&file_path)?; + let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; // Create recovery from all journals let recovery = VersionedRecovery::new(vec![&archive1_data, &archive2_data, &active_data])?; @@ -471,10 +624,10 @@ async fn test_recovery_from_multiple_compressed_archives() -> anyhow::Result<()> #[tokio::test] async fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); + // Create initial store and archive (will be compressed) - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; let _v1 = store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; @@ -489,12 +642,12 @@ async fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> let compressed_data = std::fs::read(&compressed_archive_path)?; // Create uncompressed journal data manually - let mut uncompressed_store = VersionedKVStore::new(&file_path, 4096, None)?; + let mut uncompressed_store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; let v2 = uncompressed_store .insert("key2".to_string(), Value::String("value2".to_string())) .await?; uncompressed_store.sync()?; - let uncompressed_data = std::fs::read(&file_path)?; + let uncompressed_data = std::fs::read(temp_dir.path().join("test.jrn"))?; // Recovery should handle both compressed and uncompressed let recovery = VersionedRecovery::new(vec![&compressed_data, &uncompressed_data])?; @@ -510,10 +663,10 @@ async fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> #[tokio::test] async fn test_recovery_decompression_transparent() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let file_path = temp_dir.path().join("test.jrn"); + // Create store with compressible data - let mut store = VersionedKVStore::new(&file_path, 4096, None)?; + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; let compressible = "A".repeat(500); let v1 = store .insert("data".to_string(), Value::String(compressible.clone())) @@ -521,7 +674,7 @@ async fn test_recovery_decompression_transparent() -> anyhow::Result<()> { store.sync()?; // Create uncompressed recovery baseline - let uncompressed_data = std::fs::read(&file_path)?; + let uncompressed_data = std::fs::read(temp_dir.path().join("test.jrn"))?; let recovery_uncompressed = VersionedRecovery::new(vec![&uncompressed_data])?; let state_uncompressed = recovery_uncompressed.recover_at_version(v1)?; diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index 1896c12b..2c5bfd35 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -7,11 +7,10 @@ use crate::kv_journal::{MemMappedVersionedKVJournal, TimestampedValue, VersionedKVJournal}; use ahash::AHashMap; +use async_compression::tokio::write::ZlibEncoder; use bd_bonjson::Value; -use flate2::Compression; -use flate2::write::ZlibEncoder; -use std::io::Write; use std::path::{Path, PathBuf}; +use tokio::io::AsyncWriteExt; /// Callback invoked when journal rotation occurs. /// @@ -24,15 +23,25 @@ use std::path::{Path, PathBuf}; /// storage, perform cleanup, or other post-rotation operations. 
pub type RotationCallback = Box; -/// Compress an archived journal using zlib. +/// Compress an archived journal using zlib with streaming I/O. +/// +/// This function uses async I/O to stream data directly from the source file +/// through a zlib encoder to the destination file, without loading the entire +/// journal into memory. async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result<()> { - let journal_bytes = tokio::fs::read(source).await?; + // Open source and destination files + let source_file = tokio::fs::File::open(source).await?; + let dest_file = tokio::fs::File::create(dest).await?; + + // Create zlib encoder that writes directly to the destination file + let mut encoder = ZlibEncoder::new(dest_file); - let mut encoder = ZlibEncoder::new(Vec::new(), Compression::new(3)); - encoder.write_all(&journal_bytes)?; - let compressed = encoder.finish()?; + // Copy data from source through encoder to destination + let mut source_reader = tokio::io::BufReader::new(source_file); + tokio::io::copy(&mut source_reader, &mut encoder).await?; - tokio::fs::write(dest, compressed).await?; + // Flush and finalize compression + encoder.shutdown().await?; Ok(()) } @@ -61,7 +70,7 @@ async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result /// use bd_resilient_kv::VersionedKVStore; /// use bd_bonjson::Value; /// -/// let mut store = VersionedKVStore::new("mystore.jrn", 1024 * 1024, None)?; +/// let mut store = VersionedKVStore::new("/path/to/dir", "mystore", 1024 * 1024, None)?; /// /// // Insert with version tracking /// let v1 = store.insert("key1".to_string(), Value::from(42))?; @@ -70,46 +79,49 @@ async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result pub struct VersionedKVStore { journal: MemMappedVersionedKVJournal, cached_map: AHashMap, - base_path: PathBuf, + dir_path: PathBuf, + journal_name: String, buffer_size: usize, high_water_mark_ratio: Option, rotation_callback: Option, } impl VersionedKVStore { - /// Create a new `VersionedKVStore` with the specified path and buffer size. + /// Create a new `VersionedKVStore` with the specified directory, name, and buffer size. /// + /// The journal file will be named `.jrn` within the specified directory. /// If the file already exists, it will be loaded with its existing contents. /// If the specified size is larger than an existing file, it will be resized while preserving /// data. If the specified size is smaller and the existing data doesn't fit, a fresh journal /// will be created. /// /// # Arguments - /// * `file_path` - Path for the journal file + /// * `dir_path` - Directory path where the journal will be stored + /// * `name` - Base name for the journal (e.g., "store" will create "store.jrn") /// * `buffer_size` - Size in bytes for the journal buffer /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 /// /// # Errors /// Returns an error if the journal file cannot be created/opened or if initialization fails. 
pub fn new>( - file_path: P, + dir_path: P, + name: &str, buffer_size: usize, high_water_mark_ratio: Option, ) -> anyhow::Result { - let path = file_path.as_ref(); - let base_path = path.to_path_buf(); + let dir = dir_path.as_ref(); + let journal_path = dir.join(format!("{name}.jrn")); - let journal = if path.exists() { + let journal = if journal_path.exists() { // Try to open existing journal - MemMappedVersionedKVJournal::from_file(path, buffer_size, high_water_mark_ratio).or_else( - |_| { + MemMappedVersionedKVJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio) + .or_else(|_| { // Data is corrupt or unreadable, create fresh with base version 1 - MemMappedVersionedKVJournal::new(path, buffer_size, 1, high_water_mark_ratio) - }, - )? + MemMappedVersionedKVJournal::new(&journal_path, buffer_size, 1, high_water_mark_ratio) + })? } else { // Create new journal with base version 1 - MemMappedVersionedKVJournal::new(path, buffer_size, 1, high_water_mark_ratio)? + MemMappedVersionedKVJournal::new(&journal_path, buffer_size, 1, high_water_mark_ratio)? }; let cached_map = journal.as_hashmap_with_timestamps()?; @@ -117,7 +129,8 @@ impl VersionedKVStore { Ok(Self { journal, cached_map, - base_path, + dir_path: dir.to_path_buf(), + journal_name: name.to_string(), buffer_size, high_water_mark_ratio, rotation_callback: None, @@ -130,7 +143,8 @@ impl VersionedKVStore { /// missing. /// /// # Arguments - /// * `file_path` - Path to the existing journal file + /// * `dir_path` - Directory path where the journal is stored + /// * `name` - Base name of the journal (e.g., "store" for "store.jrn") /// * `buffer_size` - Size in bytes for the journal buffer /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 /// @@ -141,26 +155,34 @@ impl VersionedKVStore { /// - The journal file contains invalid data /// - Initialization fails pub fn open_existing>( - file_path: P, + dir_path: P, + name: &str, buffer_size: usize, high_water_mark_ratio: Option, ) -> anyhow::Result { - let path = file_path.as_ref(); - let base_path = path.to_path_buf(); + let dir = dir_path.as_ref(); + let journal_path = dir.join(format!("{name}.jrn")); - let journal = MemMappedVersionedKVJournal::from_file(path, buffer_size, high_water_mark_ratio)?; + let journal = + MemMappedVersionedKVJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio)?; let cached_map = journal.as_hashmap_with_timestamps()?; Ok(Self { journal, cached_map, - base_path, + dir_path: dir.to_path_buf(), + journal_name: name.to_string(), buffer_size, high_water_mark_ratio, rotation_callback: None, }) } + /// Get the path to the active journal file. + fn journal_path(&self) -> PathBuf { + self.dir_path.join(format!("{}.jrn", self.journal_name)) + } + /// Set a callback to be invoked when journal rotation occurs. /// /// The callback receives the path to the archived journal file, the new active journal file, @@ -298,6 +320,9 @@ impl VersionedKVStore { /// Synchronize changes to disk. /// + /// This is a blocking operation that performs synchronous I/O. In async contexts, + /// consider wrapping this call with `tokio::task::spawn_blocking`. + /// /// # Errors /// Returns an error if the sync operation fails. 
pub fn sync(&self) -> anyhow::Result<()> { @@ -339,12 +364,13 @@ impl VersionedKVStore { // Move old journal to temporary location drop(old_journal); // Release mmap before moving file - let temp_uncompressed = self.base_path.with_extension("jrn.old"); - tokio::fs::rename(&self.base_path, &temp_uncompressed).await?; + let journal_path = self.journal_path(); + let temp_uncompressed = self.dir_path.join(format!("{}.jrn.old", self.journal_name)); + tokio::fs::rename(&journal_path, &temp_uncompressed).await?; // Rename new journal to base path - let temp_path = self.base_path.with_extension("jrn.tmp"); - tokio::fs::rename(&temp_path, &self.base_path).await?; + let temp_path = self.dir_path.join(format!("{}.jrn.tmp", self.journal_name)); + tokio::fs::rename(&temp_path, &journal_path).await?; // Compress the archived journal compress_archived_journal(&temp_uncompressed, &archived_path).await?; @@ -354,7 +380,7 @@ impl VersionedKVStore { // Invoke rotation callback if set if let Some(ref mut callback) = self.rotation_callback { - callback(&archived_path, &self.base_path, rotation_version); + callback(&archived_path, &journal_path, rotation_version); } Ok(()) @@ -363,12 +389,10 @@ impl VersionedKVStore { /// Generate the archived journal path for a given rotation version. /// Archived journals use the .zz extension to indicate zlib compression. fn generate_archived_path(&self, rotation_version: u64) -> PathBuf { - let mut path = self.base_path.clone(); - if let Some(file_name) = path.file_name() { - let new_name = format!("{}.v{}.zz", file_name.to_string_lossy(), rotation_version); - path.set_file_name(new_name); - } - path + self.dir_path.join(format!( + "{}.jrn.v{}.zz", + self.journal_name, rotation_version + )) } /// Create a new rotated journal with compacted state. @@ -377,7 +401,7 @@ impl VersionedKVStore { rotation_version: u64, ) -> anyhow::Result { // Create temporary journal file - let temp_path = self.base_path.with_extension("jrn.tmp"); + let temp_path = self.dir_path.join(format!("{}.jrn.tmp", self.journal_name)); // Create in-memory buffer for new journal let mut buffer = vec![0u8; self.buffer_size]; diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_recovery.rs index d99ea041..1c6ffa7b 100644 --- a/bd-resilient-kv/src/versioned_recovery.rs +++ b/bd-resilient-kv/src/versioned_recovery.rs @@ -176,12 +176,28 @@ impl VersionedRecovery { /// Decompress journal data if it's zlib-compressed, otherwise return as-is. /// -/// Detection: Try to read the header. If it's a valid journal header (format version at offset 0), -/// it's uncompressed. Otherwise, attempt zlib decompression. +/// Detection: Checks for zlib magic bytes first (RFC 1950). If not present, validates +/// as uncompressed journal by checking format version. fn decompress_if_needed(data: &[u8]) -> anyhow::Result> { const HEADER_SIZE: usize = 16; - // Check if data looks like a valid uncompressed journal + // Check for zlib magic bytes first (RFC 1950) + // Zlib compressed data starts with 0x78 followed by a second byte where: + // - 0x01 (no/low compression) + // - 0x5E (also valid) + // - 0x9C (default compression) + // - 0xDA (best compression) + // The second byte's lower 5 bits are the window size, and bit 5 is the FDICT flag. + // We check that bit 5 (0x20) is not set for typical zlib streams without preset dictionary. 
+ if data.len() >= 2 && data[0] == 0x78 && (data[1] & 0x20) == 0 { + // Looks like zlib compressed data + let mut decoder = ZlibDecoder::new(data); + let mut decompressed = Vec::new(); + decoder.read_to_end(&mut decompressed)?; + return Ok(decompressed); + } + + // Otherwise, treat as uncompressed and validate it's a proper journal if data.len() >= HEADER_SIZE { // Read format version (first 8 bytes as u64 little-endian) let version_bytes: [u8; 8] = data[0 .. 8] @@ -189,18 +205,15 @@ fn decompress_if_needed(data: &[u8]) -> anyhow::Result> { .map_err(|_| anyhow::anyhow!("Failed to read version bytes"))?; let format_version = u64::from_le_bytes(version_bytes); - // If format version is 1 or 2, it's likely uncompressed + // Check for known format versions if format_version == 1 || format_version == 2 { return Ok(data.to_vec()); } - } - // Try to decompress as zlib - let mut decoder = ZlibDecoder::new(data); - let mut decompressed = Vec::new(); - decoder.read_to_end(&mut decompressed)?; + anyhow::bail!("Invalid journal format version: {format_version}"); + } - Ok(decompressed) + anyhow::bail!("Data too small to be valid journal (size: {})", data.len()) } /// Extract the base version and maximum version from a journal. From 5e83fa725d10c3b39daf47bac02c4edcf5bf1b4a Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 08:01:16 -0800 Subject: [PATCH 06/66] revert --- bd-resilient-kv/src/tests/kv_store_test.rs | 32 ++++++++++++++++++++-- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/bd-resilient-kv/src/tests/kv_store_test.rs b/bd-resilient-kv/src/tests/kv_store_test.rs index 3b01d3a8..bad5bcb0 100644 --- a/bd-resilient-kv/src/tests/kv_store_test.rs +++ b/bd-resilient-kv/src/tests/kv_store_test.rs @@ -286,13 +286,13 @@ fn test_kv_store_persistence() -> anyhow::Result<()> { } #[test] -fn test_kv_store_constructor_cache_coherency_with_file_resize() -> anyhow::Result<()> { +fn test_kv_store_file_resizing() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let base_path = temp_dir.path().join("test_store"); - // Create store with small buffer and add data + // Create store with small size { - let mut store = KVStore::new(&base_path, 512, None)?; + let mut store = KVStore::new(&base_path, 1024, None)?; store.insert("key1".to_string(), Value::String("value1".to_string()))?; store.sync()?; } @@ -552,6 +552,32 @@ fn test_kv_store_constructor_cache_coherency_with_existing_data() -> anyhow::Res Ok(()) } +#[test] +fn test_kv_store_constructor_cache_coherency_with_file_resize() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + let base_path = temp_dir.path().join("test_store"); + + // Create store with small buffer and add data + { + let mut store = KVStore::new(&base_path, 512, None)?; + store.insert("key1".to_string(), Value::String("value1".to_string()))?; + store.sync()?; + } + + // Re-open with larger buffer - cache should be coherent with existing data + let store = KVStore::new(&base_path, 4096, None)?; + + // Verify cache is coherent after file resize + assert_eq!(store.len(), 1); + assert!(store.contains_key("key1")); + assert_eq!( + store.get("key1"), + Some(&Value::String("value1".to_string())) + ); + + Ok(()) +} + #[test] fn test_kv_store_constructor_cache_coherency_with_corrupted_data() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; From a2a7dcc8a276be7f0c6faa6a2ecffb2c285d97b3 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 08:12:21 -0800 Subject: [PATCH 07/66] add u64 helper --- bd-resilient-kv/src/kv_journal/versioned.rs 
| 85 +++++++++------------ bd-resilient-kv/src/versioned_recovery.rs | 59 +++++++------- 2 files changed, 63 insertions(+), 81 deletions(-) diff --git a/bd-resilient-kv/src/kv_journal/versioned.rs b/bd-resilient-kv/src/kv_journal/versioned.rs index 89a7a4fd..8809048a 100644 --- a/bd-resilient-kv/src/kv_journal/versioned.rs +++ b/bd-resilient-kv/src/kv_journal/versioned.rs @@ -65,6 +65,25 @@ const METADATA_OFFSET: usize = 17; // Minimum buffer size for a valid journal const MIN_BUFFER_SIZE: usize = HEADER_SIZE + 4; +/// Helper function to read a u64 field from a BONJSON object. +/// +/// BONJSON's decoder automatically converts unsigned values that fit in i64 to signed values +/// during decoding (see bd-bonjson/src/decoder.rs:227-234). This means that even though we +/// write `Value::Unsigned(version)`, the decoder returns `Value::Signed(version as i64)`. +/// +/// TODO(snowp): Consider changing BONJSON's decoder to preserve the original unsigned type +/// to avoid this normalization behavior and eliminate the need for this helper. +fn read_u64_field(obj: &AHashMap, key: &str) -> Option { + match obj.get(key) { + Some(Value::Unsigned(v)) => Some(*v), + Some(Value::Signed(v)) if *v >= 0 => { + #[allow(clippy::cast_sign_loss)] + Some(*v as u64) + }, + _ => None, + } +} + /// Get current timestamp in nanoseconds since UNIX epoch. fn current_timestamp() -> anyhow::Result { SystemTime::now() @@ -152,23 +171,10 @@ fn extract_metadata_from_buffer(buffer: &[u8]) -> anyhow::Result<(u64, u64)> { if let Value::Array(entries) = array && let Some(Value::Object(obj)) = entries.first() { - let timestamp = if let Some(Value::Unsigned(ts)) = obj.get("initialized") { - *ts - } else if let Some(Value::Signed(ts)) = obj.get("initialized") { - #[allow(clippy::cast_sign_loss)] - (*ts as u64) - } else { - anyhow::bail!("No initialized timestamp found in metadata"); - }; - - let base_version = if let Some(Value::Unsigned(bv)) = obj.get("base_version") { - *bv - } else if let Some(Value::Signed(bv)) = obj.get("base_version") { - #[allow(clippy::cast_sign_loss)] - (*bv as u64) - } else { - 0 // Default to 0 if not found (for compatibility) - }; + let timestamp = read_u64_field(obj, "initialized") + .ok_or_else(|| anyhow::anyhow!("No initialized timestamp found in metadata"))?; + + let base_version = read_u64_field(obj, "base_version").unwrap_or(0); return Ok((timestamp, base_version)); } @@ -283,31 +289,24 @@ impl<'a> VersionedKVJournal<'a> { }) } - /// Find the highest version number in the journal by scanning all entries. + /// Find the highest version number in the journal. + /// + /// Since versions are monotonically increasing, this simply returns the version + /// from the last entry in the journal. 
fn find_highest_version(buffer: &[u8]) -> anyhow::Result> { let array = read_bonjson_payload(buffer)?; - let mut max_version: Option = None; if let Value::Array(entries) = array { - for (index, entry) in entries.iter().enumerate() { - // Skip metadata (first entry) - if index == 0 { - continue; - } - - if let Value::Object(obj) = entry { - if let Some(Value::Unsigned(v)) = obj.get("v") { - max_version = Some(max_version.map_or(*v, |current| current.max(*v))); - } else if let Some(Value::Signed(v)) = obj.get("v") { - #[allow(clippy::cast_sign_loss)] - let version = *v as u64; - max_version = Some(max_version.map_or(version, |current| current.max(version))); - } - } + // Skip metadata (index 0) and get the last actual entry + // Since versions are monotonically increasing, the last entry has the highest version + if entries.len() > 1 + && let Some(Value::Object(obj)) = entries.last() + { + return Ok(read_u64_field(obj, "v")); } } - Ok(max_version) + Ok(None) } /// Get the current version number. @@ -456,14 +455,7 @@ impl<'a> VersionedKVJournal<'a> { && let Some(operation) = obj.get("o") { // Extract timestamp (default to 0 if not found) - let timestamp = if let Some(Value::Unsigned(t)) = obj.get("t") { - *t - } else if let Some(Value::Signed(t)) = obj.get("t") { - #[allow(clippy::cast_sign_loss)] - (*t as u64) - } else { - 0 - }; + let timestamp = read_u64_field(obj, "t").unwrap_or(0); if operation.is_null() { map.remove(key); @@ -501,12 +493,7 @@ impl<'a> VersionedKVJournal<'a> { if let Value::Object(obj) = entry { // Check version - let entry_version = if let Some(Value::Unsigned(v)) = obj.get("v") { - *v - } else if let Some(Value::Signed(v)) = obj.get("v") { - #[allow(clippy::cast_sign_loss)] - (*v as u64) - } else { + let Some(entry_version) = read_u64_field(obj, "v") else { continue; // Skip entries without version }; diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_recovery.rs index 1c6ffa7b..bc53cc8b 100644 --- a/bd-resilient-kv/src/versioned_recovery.rs +++ b/bd-resilient-kv/src/versioned_recovery.rs @@ -13,6 +13,25 @@ use flate2::read::ZlibDecoder; use std::io::Read; use std::path::Path; +/// Helper function to read a u64 field from a BONJSON object. +/// +/// BONJSON's decoder automatically converts unsigned values that fit in i64 to signed values +/// during decoding (see bd-bonjson/src/decoder.rs:227-234). This means that even though we +/// write `Value::Unsigned(version)`, the decoder returns `Value::Signed(version as i64)`. +/// +/// TODO(snowp): Consider changing BONJSON's decoder to preserve the original unsigned type +/// to avoid this normalization behavior and eliminate the need for this helper. +fn read_u64_field(obj: &AHashMap, key: &str) -> Option { + match obj.get(key) { + Some(Value::Unsigned(v)) => Some(*v), + Some(Value::Signed(v)) if *v >= 0 => { + #[allow(clippy::cast_sign_loss)] + Some(*v as u64) + }, + _ => None, + } +} + /// A utility for recovering state at arbitrary versions from raw journal data. 
/// /// This utility operates on raw byte slices from versioned journals and can reconstruct @@ -224,14 +243,7 @@ fn extract_version_range(buffer: &[u8]) -> anyhow::Result<(u64, u64)> { let base_version = if let Value::Array(entries) = &array && let Some(Value::Object(obj)) = entries.first() { - if let Some(Value::Unsigned(base)) = obj.get("base_version") { - *base - } else if let Some(Value::Signed(base)) = obj.get("base_version") { - #[allow(clippy::cast_sign_loss)] - (*base as u64) - } else { - 1 // Default to 1 for compatibility - } + read_u64_field(obj, "base_version").unwrap_or(1) } else { anyhow::bail!("Failed to extract metadata from journal"); }; @@ -244,15 +256,10 @@ fn extract_version_range(buffer: &[u8]) -> anyhow::Result<(u64, u64)> { continue; // Skip metadata } - if let Value::Object(obj) = entry { - if let Some(Value::Unsigned(v)) = obj.get("v") { - max_version = max_version.max(*v); - } else if let Some(Value::Signed(v)) = obj.get("v") { - #[allow(clippy::cast_sign_loss)] - { - max_version = max_version.max(*v as u64); - } - } + if let Value::Object(obj) = entry + && let Some(v) = read_u64_field(obj, "v") + { + max_version = max_version.max(v); } } } @@ -277,12 +284,7 @@ fn replay_journal_to_version( if let Value::Object(obj) = entry { // Check version - let entry_version = if let Some(Value::Unsigned(v)) = obj.get("v") { - *v - } else if let Some(Value::Signed(v)) = obj.get("v") { - #[allow(clippy::cast_sign_loss)] - (*v as u64) - } else { + let Some(entry_version) = read_u64_field(obj, "v") else { continue; // Skip entries without version }; @@ -291,15 +293,8 @@ fn replay_journal_to_version( break; } - // Extract timestamp - let timestamp = if let Some(Value::Unsigned(t)) = obj.get("t") { - *t - } else if let Some(Value::Signed(t)) = obj.get("t") { - #[allow(clippy::cast_sign_loss)] - (*t as u64) - } else { - 0 // Default to 0 if not found (shouldn't happen in v2 format) - }; + // Extract timestamp (default to 0 if not found) + let timestamp = read_u64_field(obj, "t").unwrap_or(0); // Extract key and operation if let Some(Value::String(key)) = obj.get("k") From a22768fd3ea7be0e95363d9952074712dd314901 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 08:30:20 -0800 Subject: [PATCH 08/66] only read from latest journal --- .../src/kv_journal/memmapped_versioned.rs | 17 --- bd-resilient-kv/src/kv_journal/versioned.rs | 94 +++++----------- .../src/tests/versioned_recovery_test.rs | 104 +++++++++++++++--- bd-resilient-kv/src/versioned_recovery.rs | 10 +- 4 files changed, 127 insertions(+), 98 deletions(-) diff --git a/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs b/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs index dad77e2c..f9a38b9c 100644 --- a/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs +++ b/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs @@ -170,12 +170,6 @@ impl MemMappedVersionedKVJournal { self.versioned_kv.buffer_usage_ratio() } - /// Get the time when the journal was initialized (nanoseconds since UNIX epoch). - #[must_use] - pub fn get_init_time(&self) -> u64 { - self.versioned_kv.get_init_time() - } - /// Reconstruct the hashmap by replaying all journal entries. /// /// # Errors @@ -192,17 +186,6 @@ impl MemMappedVersionedKVJournal { self.versioned_kv.as_hashmap_with_timestamps() } - /// Reconstruct the hashmap at a specific version by replaying entries up to that version. - /// - /// # Errors - /// Returns an error if the buffer cannot be decoded. 
- pub fn as_hashmap_at_version( - &self, - target_version: u64, - ) -> anyhow::Result> { - self.versioned_kv.as_hashmap_at_version(target_version) - } - /// Synchronize changes to disk. /// /// This method explicitly flushes any pending changes to the underlying file. diff --git a/bd-resilient-kv/src/kv_journal/versioned.rs b/bd-resilient-kv/src/kv_journal/versioned.rs index 8809048a..6677342f 100644 --- a/bd-resilient-kv/src/kv_journal/versioned.rs +++ b/bd-resilient-kv/src/kv_journal/versioned.rs @@ -12,7 +12,6 @@ use bd_bonjson::encoder::encode_into_buf; use bd_bonjson::serialize_primitives::serialize_array_begin; use bd_client_common::error::InvariantError; use bytes::BufMut; -use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{SystemTime, UNIX_EPOCH}; /// Represents a value with its associated timestamp. @@ -38,7 +37,7 @@ pub struct VersionedKVJournal<'a> { high_water_mark: usize, high_water_mark_triggered: bool, initialized_at_unix_time_ns: u64, - current_version: AtomicU64, + current_version: u64, base_version: u64, // First version in this journal } @@ -76,7 +75,8 @@ const MIN_BUFFER_SIZE: usize = HEADER_SIZE + 4; fn read_u64_field(obj: &AHashMap, key: &str) -> Option { match obj.get(key) { Some(Value::Unsigned(v)) => Some(*v), - Some(Value::Signed(v)) if *v >= 0 => { + Some(Value::Signed(v)) if *v >= 0 => + { #[allow(clippy::cast_sign_loss)] Some(*v as u64) }, @@ -249,7 +249,7 @@ impl<'a> VersionedKVJournal<'a> { high_water_mark, high_water_mark_triggered: false, initialized_at_unix_time_ns: timestamp, - current_version: AtomicU64::new(base_version), + current_version: base_version, base_version, }) } @@ -284,7 +284,7 @@ impl<'a> VersionedKVJournal<'a> { high_water_mark, high_water_mark_triggered: position >= high_water_mark, initialized_at_unix_time_ns: init_timestamp, - current_version: AtomicU64::new(current_version), + current_version, base_version, }) } @@ -312,7 +312,7 @@ impl<'a> VersionedKVJournal<'a> { /// Get the current version number. #[must_use] pub fn current_version(&self) -> u64 { - self.current_version.load(Ordering::SeqCst) + self.current_version } /// Get the base version (first version in this journal). @@ -349,11 +349,15 @@ impl<'a> VersionedKVJournal<'a> { // Create entry object: {"v": version, "t": timestamp, "k": key, "o": value} let timestamp = current_timestamp()?; - let mut entry = AHashMap::new(); - entry.insert("v".to_string(), Value::Unsigned(version)); - entry.insert("t".to_string(), Value::Unsigned(timestamp)); - entry.insert("k".to_string(), Value::String(key.to_string())); - entry.insert("o".to_string(), value.clone()); + + // TODO(snowp): It would be nice to be able to pass impl AsRef for key or Cow to avoid + // allocating small strings repeatedly. + let entry = AHashMap::from([ + ("v".to_string(), Value::Unsigned(version)), + ("t".to_string(), Value::Unsigned(timestamp)), + ("k".to_string(), Value::String(key.to_string())), + ("o".to_string(), value.clone()), + ]); encode_into_buf(&mut cursor, &Value::Object(entry)) .map_err(|e| anyhow::anyhow!("Failed to encode versioned entry: {e:?}"))?; @@ -366,7 +370,8 @@ impl<'a> VersionedKVJournal<'a> { /// Set a key-value pair with automatic version increment. /// Returns a tuple of (version, timestamp). 
pub fn set_versioned(&mut self, key: &str, value: &Value) -> anyhow::Result<(u64, u64)> { - let version = self.current_version.fetch_add(1, Ordering::SeqCst) + 1; + self.current_version += 1; + let version = self.current_version; let timestamp = self.write_versioned_entry(version, key, value)?; Ok((version, timestamp)) } @@ -374,7 +379,8 @@ impl<'a> VersionedKVJournal<'a> { /// Delete a key with automatic version increment. /// Returns a tuple of (version, timestamp). pub fn delete_versioned(&mut self, key: &str) -> anyhow::Result<(u64, u64)> { - let version = self.current_version.fetch_add(1, Ordering::SeqCst) + 1; + self.current_version += 1; + let version = self.current_version; let timestamp = self.write_versioned_entry(version, key, &Value::Null)?; Ok((version, timestamp)) } @@ -476,49 +482,6 @@ impl<'a> VersionedKVJournal<'a> { Ok(map) } - /// Reconstruct the hashmap at a specific version by replaying entries up to that version. - pub fn as_hashmap_at_version( - &self, - target_version: u64, - ) -> anyhow::Result> { - let array = read_bonjson_payload(self.buffer)?; - let mut map = AHashMap::new(); - - if let Value::Array(entries) = array { - for (index, entry) in entries.iter().enumerate() { - // Skip metadata (first entry) - if index == 0 { - continue; - } - - if let Value::Object(obj) = entry { - // Check version - let Some(entry_version) = read_u64_field(obj, "v") else { - continue; // Skip entries without version - }; - - // Only apply entries up to target version - if entry_version > target_version { - break; - } - - // Extract key and operation - if let Some(Value::String(key)) = obj.get("k") - && let Some(operation) = obj.get("o") - { - if operation.is_null() { - map.remove(key); - } else { - map.insert(key.clone(), operation.clone()); - } - } - } - } - } - - Ok(map) - } - /// Get a copy of the buffer for testing purposes #[cfg(test)] #[must_use] @@ -558,14 +521,17 @@ impl<'a> VersionedKVJournal<'a> { let mut cursor = &mut journal.buffer[journal.position ..]; // Create entry object: {"v": version, "t": timestamp, "k": key, "o": value} - let mut entry = AHashMap::new(); - entry.insert("v".to_string(), Value::Unsigned(snapshot_version)); - entry.insert( - "t".to_string(), - Value::Unsigned(timestamped_value.timestamp), - ); - entry.insert("k".to_string(), Value::String(key.clone())); - entry.insert("o".to_string(), timestamped_value.value.clone()); + // TODO(snowp): It would be nice to be able to pass impl AsRef for key or Cow to avoid + // allocating small strings repeatedly. 
+ let entry = AHashMap::from([ + ("v".to_string(), Value::Unsigned(snapshot_version)), + ( + "t".to_string(), + Value::Unsigned(timestamped_value.timestamp), + ), + ("k".to_string(), Value::String(key.clone())), + ("o".to_string(), timestamped_value.value.clone()), + ]); encode_into_buf(&mut cursor, &Value::Object(entry)) .map_err(|e| anyhow::anyhow!("Failed to encode state entry: {e:?}"))?; diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index 08a7a0ff..9ebfd2f3 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -82,35 +82,111 @@ async fn test_recovery_single_journal() -> anyhow::Result<()> { Ok(()) } -// Tests for compression detection heuristic - #[tokio::test] -async fn test_detection_uncompressed_format_v1() -> anyhow::Result<()> { +async fn test_recover_current_only_needs_last_journal() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - - // Create a v1 format journal (though VersionedKVStore creates v2, we test the detection) + // Create a store with multiple rotations to build up history let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + + // Add initial data store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; + store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; + + // First rotation + let _archive1_version = store.current_version(); + store.rotate_journal().await?; + + // Update key1 and add key3 + store + .insert("key1".to_string(), Value::String("updated1".to_string())) + .await?; + store + .insert("key3".to_string(), Value::String("value3".to_string())) + .await?; + + // Second rotation + let _archive2_version = store.current_version(); + store.rotate_journal().await?; + + // Add more data and delete key2 + store + .insert("key4".to_string(), Value::String("value4".to_string())) + .await?; + store.remove("key2").await?; + store.sync()?; - let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + // Get expected current state using the store's hashmap + let expected_state = store.as_hashmap(); - // Verify first 8 bytes contain format version 2 - let version_bytes: [u8; 8] = journal_data[0 .. 8].try_into()?; - let version = u64::from_le_bytes(version_bytes); - assert_eq!(version, 2, "VersionedKVStore should create v2 format"); + // Read ALL journals + let mut all_journals = Vec::new(); + let mut archived_paths = std::fs::read_dir(temp_dir.path())? 
+ .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.file_name()?.to_str()?.starts_with("test.jrn.v") { + Some(path) + } else { + None + } + }) + .collect::>(); - // Should successfully detect as uncompressed - let recovery = VersionedRecovery::new(vec![&journal_data])?; - let state = recovery.recover_current()?; - assert_eq!(state.len(), 1); + // Sort to ensure chronological order + archived_paths.sort(); + + for archived_path in &archived_paths { + all_journals.push(std::fs::read(archived_path)?); + } + + // Read active journal (the last one) + let active_journal = std::fs::read(temp_dir.path().join("test.jrn"))?; + all_journals.push(active_journal.clone()); + + // Test 1: Verify recover_current() with ALL journals gives correct state + let all_journal_refs: Vec<&[u8]> = all_journals.iter().map(Vec::as_slice).collect(); + let recovery_all = VersionedRecovery::new(all_journal_refs)?; + let state_all = recovery_all.recover_current()?; + + // Convert to comparable format (Value only, not TimestampedValue) + let state_all_values: ahash::AHashMap = + state_all.into_iter().map(|(k, tv)| (k, tv.value)).collect(); + + assert_eq!(state_all_values, expected_state); + + // Test 2: Verify recover_current() with ONLY the last journal gives the same state + // This is the optimization we want to prove works! + let recovery_last = VersionedRecovery::new(vec![&active_journal])?; + let state_last = recovery_last.recover_current()?; + + let state_last_values: ahash::AHashMap = state_last + .into_iter() + .map(|(k, tv)| (k, tv.value)) + .collect(); + + // The last journal alone should give us the same current state + assert_eq!(state_last_values, expected_state); + + // Verify the expected final state has the right keys + assert!(state_last_values.contains_key("key1")); + assert!(!state_last_values.contains_key("key2")); // deleted + assert!(state_last_values.contains_key("key3")); + assert!(state_last_values.contains_key("key4")); + assert_eq!( + state_last_values.get("key1"), + Some(&Value::String("updated1".to_string())) + ); Ok(()) } + #[tokio::test] async fn test_detection_compressed_journal() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_recovery.rs index bc53cc8b..e8864b1c 100644 --- a/bd-resilient-kv/src/versioned_recovery.rs +++ b/bd-resilient-kv/src/versioned_recovery.rs @@ -24,7 +24,8 @@ use std::path::Path; fn read_u64_field(obj: &AHashMap, key: &str) -> Option { match obj.get(key) { Some(Value::Unsigned(v)) => Some(*v), - Some(Value::Signed(v)) if *v >= 0 => { + Some(Value::Signed(v)) if *v >= 0 => + { #[allow(clippy::cast_sign_loss)] Some(*v as u64) }, @@ -185,8 +186,11 @@ impl VersionedRecovery { pub fn recover_current(&self) -> anyhow::Result> { let mut map = AHashMap::new(); - for journal in &self.journals { - replay_journal_to_version(&journal.data, u64::MAX, &mut map)?; + // Optimization: Only read the last journal since journal rotation writes + // the complete state at the snapshot version, so the last journal contains + // all current state. 
+ if let Some(last_journal) = self.journals.last() { + replay_journal_to_version(&last_journal.data, u64::MAX, &mut map)?; } Ok(map) From ebc6c062613f69bfcf2a1a5e59e7ae34af72bd27 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 08:42:27 -0800 Subject: [PATCH 09/66] docs(bd-resilient-kv): update documentation to reflect async API changes --- bd-resilient-kv/AGENTS.md | 28 ++++-- bd-resilient-kv/README.md | 145 +++++++++++++++------------- bd-resilient-kv/VERSIONED_FORMAT.md | 101 ++++++++++--------- 3 files changed, 160 insertions(+), 114 deletions(-) diff --git a/bd-resilient-kv/AGENTS.md b/bd-resilient-kv/AGENTS.md index 09e3fd80..120a7bed 100644 --- a/bd-resilient-kv/AGENTS.md +++ b/bd-resilient-kv/AGENTS.md @@ -46,7 +46,13 @@ The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJ **Key Components**: - **VersionedKVJournal**: Low-level journal that tracks version numbers for each entry - **MemMappedVersionedKVJournal**: Memory-mapped persistence layer -- **VersionedKVStore**: High-level HashMap-like API with automatic rotation +- **VersionedKVStore**: High-level HashMap-like API with automatic rotation and async write operations + +**Async API**: +- Write operations (`insert()`, `remove()`, `rotate_journal()`) are async and require a Tokio runtime +- Compression of archived journals is performed asynchronously using streaming I/O +- Read operations remain synchronous and operate on the in-memory cache +- The async API enables efficient background compression without blocking the main thread **Version Tracking**: - Every write operation (`insert`, `remove`) returns a monotonically increasing version number @@ -54,23 +60,33 @@ The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJ - Entries with `Value::Null` are treated as deletions but still versioned **Rotation Strategy**: -- Automatic rotation when journal size exceeds high water mark +- Automatic rotation when journal size exceeds high water mark (triggered during async write operations) - Current state is compacted into a new journal as versioned entries - Old journal is archived with `.v{version}.zz` suffix -- Archived journals are automatically compressed using zlib (RFC 1950, level 3) +- Archived journals are automatically compressed using zlib (RFC 1950, level 3) asynchronously - Optional callback invoked with archived path and version - Application controls upload/cleanup of archived journals **Compression**: -- All archived journals are automatically compressed during rotation +- All archived journals are automatically compressed during rotation using async I/O - Active journals remain uncompressed for write performance - Compression uses zlib format (RFC 1950) with level 3 for balanced speed/ratio +- Streaming compression avoids loading entire journals into memory - Typical compression achieves >50% size reduction for text-based data - File extension `.zz` indicates compressed archives - Recovery transparently decompresses archived journals when needed -**Note on Point-in-Time Recovery**: -The `VersionedKVJournal` trait provides `as_hashmap_at_version()` for replaying entries within a single journal. However, `VersionedKVStore` does not expose this functionality because it only works within the current journal - once rotation occurs, historical versions in archived journals cannot be accessed. For true point-in-time recovery across rotations, applications would need to implement their own mechanism to load and replay archived journal files. 
+**Point-in-Time Recovery**: +The `VersionedRecovery` utility provides point-in-time recovery capabilities for versioned journals. It works with raw journal bytes and can reconstruct state at any historical version, including across rotation boundaries. `VersionedRecovery` is designed for offline analysis, audit tooling, and server-side operations - it is separate from `VersionedKVStore` which is focused on active write operations. Applications can use `VersionedRecovery` to analyze archived journals and recover state at specific versions. The `from_files()` constructor is async for efficient file reading. + +**Recovery Optimization**: +The `recover_current()` method in `VersionedRecovery` is optimized to only read the last journal rather than replaying all journals from the beginning. This is possible because journal rotation writes the complete current state into the new journal at the snapshot version, so the last journal alone contains the full current state. For historical version recovery, `recover_at_version()` intelligently selects and replays only the necessary journals. + +**Snapshot Cleanup**: +The `SnapshotCleanup` utility provides async methods for managing archived journal snapshots: +- All cleanup operations are async and require a Tokio runtime +- `list_snapshots()`, `cleanup_before_version()`, `cleanup_keep_recent()` are all async +- Enables efficient disk space management without blocking operations ## Critical Design Insights diff --git a/bd-resilient-kv/README.md b/bd-resilient-kv/README.md index 33475df1..6bf44fa3 100644 --- a/bd-resilient-kv/README.md +++ b/bd-resilient-kv/README.md @@ -194,26 +194,28 @@ For applications that require version tracking, audit logs, or point-in-time rec use bd_resilient_kv::VersionedKVStore; use bd_bonjson::Value; -fn main() -> anyhow::Result<()> { +#[tokio::main] +async fn main() -> anyhow::Result<()> { // Create a versioned store with automatic rotation at 1MB let mut store = VersionedKVStore::new( - "versioned_store.jrn", - 1024 * 1024, // Rotate when journal reaches 1MB - None // Optional rotation callback + ".", // Directory path + "versioned_store", // Journal name + 1024 * 1024, // Rotate when journal reaches 1MB + None // Optional high water mark ratio )?; - // All write operations return version numbers - let v1 = store.insert("config".to_string(), Value::String("v1".to_string()))?; + // Write operations are async and return version numbers + let v1 = store.insert("config".to_string(), Value::String("v1".to_string())).await?; println!("Inserted at version: {}", v1); - let v2 = store.insert("config".to_string(), Value::String("v2".to_string()))?; + let v2 = store.insert("config".to_string(), Value::String("v2".to_string())).await?; println!("Updated at version: {}", v2); // Read current state (O(1) from cache) assert_eq!(store.get("config"), Some(&Value::String("v2".to_string()))); - // Removing a key also returns a version - let v3 = store.remove("config")?; + // Removing a key is also async and returns a version + let v3 = store.remove("config").await?; if let Some(version) = v3 { println!("Removed at version: {}", version); } @@ -229,20 +231,21 @@ Monitor journal rotation events for remote backup or cleanup: ```rust use bd_resilient_kv::{VersionedKVStore, RotationCallback}; use bd_bonjson::Value; -use std::sync::Arc; -fn upload_to_remote(path: &str, version: u64) { - println!("Uploading archived journal {} at version {}", path, version); +fn upload_to_remote(path: &std::path::Path, version: u64) { + println!("Uploading 
archived journal {:?} at version {}", path, version); // Upload to S3, backup server, etc. } -fn main() -> anyhow::Result<()> { - let callback: RotationCallback = Arc::new(|archived_path, version| { +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let callback: RotationCallback = Box::new(|archived_path, _new_path, version| { upload_to_remote(archived_path, version); }); let mut store = VersionedKVStore::new( - "my_store.jrn", + ".", // Directory path + "my_store", // Journal name 512 * 1024, // 512KB rotation threshold Some(callback) )?; @@ -250,12 +253,12 @@ fn main() -> anyhow::Result<()> { // When high water mark is reached during insert/remove, // the callback will be invoked with archived journal path for i in 0..10000 { - store.insert(format!("key_{}", i), Value::Integer(i as i64))?; + store.insert(format!("key_{}", i), Value::Integer(i as i64)).await?; // Automatic rotation happens when journal exceeds 512KB } - // Manual rotation is also supported - store.rotate()?; + // Manual rotation is also supported (also async) + store.rotate_journal().await?; Ok(()) } @@ -263,18 +266,20 @@ fn main() -> anyhow::Result<()> { ### Key Features of VersionedKVStore +- **Async API**: Write operations (`insert()`, `remove()`, `rotate_journal()`) are async and require a Tokio runtime - **Version Tracking**: Every `insert()` and `remove()` returns a monotonically increasing version number - **Timestamp Preservation**: Write timestamps are internally tracked and preserved during journal rotation for recovery purposes - **Automatic Rotation**: When the journal exceeds the high water mark, it automatically: - Creates a new journal with the current state as versioned entries (compaction) - Preserves original timestamps from the initial writes - Archives the old journal with `.v{version}.zz` suffix - - Compresses the archived journal using zlib (RFC 1950, level 3) + - Compresses the archived journal using zlib (RFC 1950, level 3) asynchronously - Invokes the rotation callback (if provided) for upload/cleanup - **Automatic Compression**: Archived journals are automatically compressed to save disk space - Active journals remain uncompressed for write performance - Typically achieves >50% size reduction for text-based data - Transparent decompression during recovery operations + - Compression is performed asynchronously using streaming I/O - **O(1) Reads**: In-memory cache provides constant-time access to current state - **Persistent**: Uses memory-mapped journals for crash-resilient storage @@ -320,42 +325,42 @@ pub fn clear(&mut self) -> anyhow::Result<()> ### VersionedKVStore (Version-Tracked Key-Value Store) -A higher-level store that tracks versions for every write operation and supports point-in-time recovery. +A higher-level store that tracks versions for every write operation and supports point-in-time recovery. Write operations are async and require a Tokio runtime. 
#### Constructor ```rust pub fn new>( - journal_path: P, - high_water_mark: usize, - rotation_callback: Option + dir_path: P, + name: &str, + buffer_size: usize, + high_water_mark_ratio: Option ) -> anyhow::Result ``` -- `journal_path`: Path to the journal file (e.g., "my_store.jrn") -- `high_water_mark`: Size threshold for automatic rotation (in bytes) -- `rotation_callback`: Optional callback invoked when journal is rotated - - Signature: `Arc` - - Parameters: `(archived_journal_path, version_at_rotation)` +- `dir_path`: Directory path where the journal will be stored +- `name`: Base name for the journal (e.g., "store" will create "store.jrn") +- `buffer_size`: Size in bytes for the journal buffer +- `high_water_mark_ratio`: Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 #### Core Methods ```rust -// Read operations (O(1) from cache) +// Read operations (O(1) from cache, synchronous) pub fn get(&self, key: &str) -> Option<&Value> pub fn contains_key(&self, key: &str) -> bool pub fn len(&self) -> usize pub fn is_empty(&self) -> bool pub fn as_hashmap(&self) -> HashMap -// Write operations (return version numbers) -pub fn insert(&mut self, key: String, value: Value) -> anyhow::Result -pub fn remove(&mut self, key: &str) -> anyhow::Result> +// Write operations (async, return version numbers) +pub async fn insert(&mut self, key: String, value: Value) -> anyhow::Result +pub async fn remove(&mut self, key: &str) -> anyhow::Result> -// Manual rotation -pub fn rotate(&mut self) -> anyhow::Result<()> +// Manual rotation (async) +pub async fn rotate_journal(&mut self) -> anyhow::Result<()> -// Version information +// Version information (synchronous) pub fn current_version(&self) -> u64 ``` @@ -364,9 +369,14 @@ pub fn current_version(&self) -> u64 #### Type Aliases ```rust -pub type RotationCallback = Arc; +pub type RotationCallback = Box; ``` +**Note**: The callback receives three parameters: +- `old_journal_path`: Path to the archived journal that was rotated out +- `new_journal_path`: Path to the new active journal +- `rotation_version`: The version at which rotation occurred + ## Architecture ### Storage Models @@ -450,12 +460,12 @@ Both `KVStore` and `VersionedKVStore` use the same caching approach: | Operation | Time Complexity | Notes | |--------------------|-----------------|-------------------------------------| | `get()` | O(1) | Reads from in-memory cache | -| `insert()` | O(1) amortized | Journal write + cache + version | -| `remove()` | O(1) amortized | Journal write + cache + version | +| `insert()` | O(1) amortized | Async journal write + cache + version | +| `remove()` | O(1) amortized | Async journal write + cache + version | | `contains_key()` | O(1) | Cache lookup | | `len()` | O(1) | Cache size | | `as_hashmap()` | O(n) | Creates temporary map of values | -| `rotate()` | O(n) | Serializes current state to new journal | +| `rotate_journal()` | O(n) | Async - serializes current state to new journal | | `current_version()`| O(1) | Returns version counter | ## Error Handling @@ -532,27 +542,29 @@ let store = Arc::new(Mutex::new( ### Archived Journal Compression -**VersionedKVStore** automatically compresses archived journals to save disk space: +**VersionedKVStore** automatically compresses archived journals asynchronously to save disk space: ```rust use bd_resilient_kv::VersionedKVStore; use bd_bonjson::Value; -fn main() -> anyhow::Result<()> { +#[tokio::main] +async fn main() -> anyhow::Result<()> { let mut store = VersionedKVStore::new( - "my_store.jrn", + ".", // 
Directory path + "my_store", // Journal name 512 * 1024, // 512KB rotation threshold None )?; // Write data that will trigger rotation for i in 0..10000 { - store.insert(format!("key_{}", i), Value::Integer(i as i64))?; + store.insert(format!("key_{}", i), Value::Integer(i as i64)).await?; } // After rotation, archived journals are automatically compressed: // - my_store.jrn (active, uncompressed) - // - my_store.jrn.v10000.zz (archived, compressed with zlib) + // - my_store.jrn.v10000.zz (archived, compressed with zlib asynchronously) Ok(()) } @@ -560,50 +572,51 @@ fn main() -> anyhow::Result<()> { **Compression Details**: - **Format**: zlib (RFC 1950) with compression level 3 -- **Performance**: Balanced speed/compression ratio +- **Performance**: Balanced speed/compression ratio, performed asynchronously with streaming I/O - **Transparency**: Recovery automatically detects and decompresses archived journals - **Naming**: `.zz` extension indicates compressed archives - **Typical Savings**: >50% size reduction for text-based data **Active vs Archived**: - Active journals remain **uncompressed** for maximum write performance -- Only archived journals are compressed during rotation +- Only archived journals are compressed during rotation (asynchronously) - No configuration needed - compression is automatic ### Snapshot Cleanup Management -**SnapshotCleanup** provides utilities for managing disk space by cleaning up old archived journals: +**SnapshotCleanup** provides utilities for managing disk space by cleaning up old archived journals. Its methods are async and require a Tokio runtime. ```rust use bd_resilient_kv::SnapshotCleanup; -fn main() -> anyhow::Result<()> { +#[tokio::main] +async fn main() -> anyhow::Result<()> { // Create cleanup utility for your journal let cleanup = SnapshotCleanup::new("my_store.jrn")?; - // List all archived snapshots - let snapshots = cleanup.list_snapshots()?; + // List all archived snapshots (async) + let snapshots = cleanup.list_snapshots().await?; for snapshot in &snapshots { println!("Version: {}, Size: {} bytes, Path: {:?}", snapshot.version, snapshot.size_bytes, snapshot.path); } - // Strategy 1: Remove snapshots older than a specific version + // Strategy 1: Remove snapshots older than a specific version (async) // (e.g., your system determined you need to keep data back to version 5000) - let removed = cleanup.cleanup_before_version(5000)?; + let removed = cleanup.cleanup_before_version(5000).await?; println!("Removed {} old snapshots", removed.len()); - // Strategy 2: Keep only the N most recent snapshots - let removed = cleanup.cleanup_keep_recent(10)?; + // Strategy 2: Keep only the N most recent snapshots (async) + let removed = cleanup.cleanup_keep_recent(10).await?; println!("Removed {} snapshots, kept 10 most recent", removed.len()); - // Check disk usage - let total_size = cleanup.total_snapshot_size()?; + // Check disk usage (async) + let total_size = cleanup.total_snapshot_size().await?; println!("Total snapshot size: {} bytes", total_size); - // Get version range - if let Some(oldest) = cleanup.oldest_snapshot_version()? { - if let Some(newest) = cleanup.newest_snapshot_version()? { + // Get version range (async) + if let Some(oldest) = cleanup.oldest_snapshot_version().await? { + if let Some(newest) = cleanup.newest_snapshot_version().await? 
{ println!("Snapshots range from version {} to {}", oldest, newest); } } @@ -613,6 +626,7 @@ fn main() -> anyhow::Result<()> { ``` **Key Features**: +- **Async operations**: All methods are async and require a Tokio runtime - **Version-based cleanup**: Remove snapshots before a specific version - **Count-based cleanup**: Keep only N most recent snapshots - **Safe operations**: Only removes compressed archives (`.zz` files), never active journals @@ -624,19 +638,20 @@ fn main() -> anyhow::Result<()> { use bd_resilient_kv::{VersionedKVStore, SnapshotCleanup}; use bd_bonjson::Value; -fn main() -> anyhow::Result<()> { +#[tokio::main] +async fn main() -> anyhow::Result<()> { // Your application logic determines minimum required version let min_version_from_external_system = get_minimum_required_version(); // Create store - let mut store = VersionedKVStore::new("my_store.jrn", 1024 * 1024, None)?; + let mut store = VersionedKVStore::new(".", "my_store", 1024 * 1024, None)?; - // Perform operations... - store.insert("key".to_string(), Value::from(42))?; + // Perform operations... (async) + store.insert("key".to_string(), Value::from(42)).await?; - // Periodically clean up old snapshots + // Periodically clean up old snapshots (async) let cleanup = SnapshotCleanup::new("my_store.jrn")?; - cleanup.cleanup_before_version(min_version_from_external_system)?; + cleanup.cleanup_before_version(min_version_from_external_system).await?; Ok(()) } diff --git a/bd-resilient-kv/VERSIONED_FORMAT.md b/bd-resilient-kv/VERSIONED_FORMAT.md index 38a724e2..cdc70438 100644 --- a/bd-resilient-kv/VERSIONED_FORMAT.md +++ b/bd-resilient-kv/VERSIONED_FORMAT.md @@ -84,9 +84,9 @@ When first created with base version 1: After rotation at version 30000, the new journal contains: ```json {"initialized": 1699564900000000000, "format_version": 2, "base_version": 30000} -{"v": 30000, "t": 1699564900000000000, "k": "key1", "o": "value1"} // Compacted state -{"v": 30000, "t": 1699564900000000000, "k": "key2", "o": "value2"} // Compacted state -{"v": 30000, "t": 1699564900000000000, "k": "key3", "o": "value3"} // Compacted state +{"v": 30000, "t": 1699564800123456789, "k": "key1", "o": "value1"} // Compacted state (original timestamp) +{"v": 30000, "t": 1699564850987654321, "k": "key2", "o": "value2"} // Compacted state (original timestamp) +{"v": 30000, "t": 1699564875111222333, "k": "key3", "o": "value3"} // Compacted state (original timestamp) {"v": 30001, "t": 1699564901000000000, "k": "key4", "o": "value4"} // New write {"v": 30002, "t": 1699564902000000000, "k": "key1", "o": "updated1"} // New write ... @@ -94,6 +94,7 @@ After rotation at version 30000, the new journal contains: Key observations: - All compacted state entries have the same version (30000) +- **Timestamps are preserved**: Each compacted entry retains its original write timestamp (not the rotation time) - These are regular journal entries, not a special format - Incremental writes continue with version 30001+ - Each rotated journal is self-contained and can be read independently @@ -104,6 +105,8 @@ When high water mark is reached at version N: 1. **Create New Journal**: Initialize fresh journal file (e.g., `my_store.jrn.tmp`) 2. **Write Compacted State**: Write all current key-value pairs as versioned entries at version N + - **Timestamp Preservation**: Each entry retains its original write timestamp, not the rotation timestamp + - This preserves historical accuracy and allows proper temporal analysis of the data 3. 
**Archive Old Journal**: Rename `my_store.jrn` → `my_store.jrn.old` (temporary) 4. **Activate New Journal**: Rename `my_store.jrn.tmp` → `my_store.jrn` 5. **Compress Archive**: Compress `my_store.jrn.old` → `my_store.jrn.v{N}.zz` using zlib @@ -144,6 +147,8 @@ While `VersionedKVStore` does not support point-in-time recovery through its API The version numbers in each entry allow you to understand the exact sequence of operations and build custom tooling for analyzing historical data. +**Timestamp Accuracy**: All entries preserve their original write timestamps, even after rotation. This means you can accurately track when each write originally occurred, making the journals suitable for temporal analysis, compliance auditing, and debugging time-sensitive issues. + ### Point-in-Time Recovery with VersionedRecovery While `VersionedKVStore` is designed for active operation and does not support point-in-time recovery through its API, the `VersionedRecovery` utility provides a way to reconstruct state at arbitrary historical versions from raw journal bytes. @@ -151,8 +156,8 @@ While `VersionedKVStore` is designed for active operation and does not support p #### Overview `VersionedRecovery` is a separate utility that: -- Works with raw journal bytes (`&[u8]`), not file paths -- Does not perform any file I/O operations +- Loads journals from file paths and automatically handles decompression of `.zz` archives +- Uses async I/O for efficient file loading - Can process multiple journals for cross-rotation recovery - Designed for offline analysis, server-side tooling, and audit systems - Completely independent from `VersionedKVStore` @@ -168,8 +173,12 @@ While `VersionedKVStore` is designed for active operation and does not support p #### API Methods ```rust -// Create recovery utility from journal byte slices (oldest to newest) -let recovery = VersionedRecovery::new(vec![&archived_journal, &active_journal])?; +// Create recovery utility from journal files (oldest to newest) - async +let recovery = VersionedRecovery::from_files(vec![ + "store.jrn.v20000.zz", + "store.jrn.v30000.zz", + "store.jrn" +]).await?; // Recover state at specific version let state = recovery.recover_at_version(25000)?; @@ -187,35 +196,36 @@ if let Some((min, max)) = recovery.version_range() { ```rust use bd_resilient_kv::VersionedRecovery; -use std::fs; - -// Load archived journals from remote storage or local disk -let journal_v20000 = fs::read("store.jrn.v20000")?; -let journal_v30000 = fs::read("store.jrn.v30000")?; -let journal_active = fs::read("store.jrn")?; - -// Create recovery utility with all journals -let recovery = VersionedRecovery::new(vec![ - &journal_v20000, - &journal_v30000, - &journal_active, -])?; -// Recover state at version 25000 (in archived journal) -let state_at_25000 = recovery.recover_at_version(25000)?; - -// Recover state at version 35000 (across rotation boundary) -let state_at_35000 = recovery.recover_at_version(35000)?; - -// Process the recovered state -for (key, value) in state_at_25000 { - println!("{key} = {value:?}"); +#[tokio::main] +async fn main() -> anyhow::Result<()> { + // Create recovery utility from files (automatically decompresses .zz archives) + // Provide journal paths in chronological order (oldest to newest) + let recovery = VersionedRecovery::from_files(vec![ + "store.jrn.v20000.zz", + "store.jrn.v30000.zz", + "store.jrn", + ]).await?; + + // Recover state at version 25000 (in archived journal) + let state_at_25000 = recovery.recover_at_version(25000)?; + + // Recover state 
at version 35000 (across rotation boundary) + let state_at_35000 = recovery.recover_at_version(35000)?; + + // Process the recovered state + for (key, value) in state_at_25000 { + println!("{key} = {value:?}"); + } + + Ok(()) } ``` #### Implementation Details -- **No File I/O**: Works purely with byte slices, caller is responsible for loading data +- **Async File Loading**: Constructor uses async I/O to load journal files efficiently +- **Automatic Decompression**: Transparently decompresses `.zz` archives when loading - **Chronological Order**: Journals should be provided oldest to newest - **Efficient Replay**: Automatically skips journals outside the target version range - **Cross-Rotation**: Seamlessly handles recovery across multiple archived journals @@ -246,15 +256,20 @@ for (key, value) in state_at_25000 { use bd_resilient_kv::VersionedKVStore; use bd_bonjson::Value; -// Create or open store -let mut store = VersionedKVStore::new("mystore.jrn", 1024 * 1024, None)?; +#[tokio::main] +async fn main() -> anyhow::Result<()> { + // Create or open store (requires directory path and name) + let mut store = VersionedKVStore::new("/path/to/dir", "mystore", 1024 * 1024, None)?; -// Writes return version numbers -let v1 = store.insert("key1".to_string(), Value::from(42))?; -let v2 = store.insert("key2".to_string(), Value::from("hello"))?; + // Writes return version numbers (async operations) + let v1 = store.insert("key1".to_string(), Value::from(42)).await?; + let v2 = store.insert("key2".to_string(), Value::from("hello")).await?; -// Read current values -let value = store.get("key1")?; + // Read current values (synchronous) + let value = store.get("key1")?; + + Ok(()) +} ``` ### Rotation Callback @@ -262,9 +277,9 @@ let value = store.get("key1")?; ```rust // Set callback for rotation events store.set_rotation_callback(Box::new(|old_path, new_path, version| { - println!("Rotated at version {}", version); - println!("Archived journal: {:?}", old_path); - println!("New active journal: {:?}", new_path); + println!("Rotated at version {version}"); + println!("Archived journal (compressed): {old_path:?}"); + println!("New active journal: {new_path:?}"); // Upload old_path to remote storage... })); ``` @@ -273,11 +288,11 @@ store.set_rotation_callback(Box::new(|old_path, new_path, version| { ```rust // Automatic rotation on high water mark -let version = store.insert("key".to_string(), Value::from("value"))?; +let version = store.insert("key".to_string(), Value::from("value")).await?; // Rotation happens automatically if high water mark exceeded -// Or manually trigger rotation -store.rotate_journal()?; +// Or manually trigger rotation (async) +store.rotate_journal().await?; ``` ## Migration from VERSION 1 From 96394a2539ccb92e2354695574ef1802c70d8553 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 09:30:38 -0800 Subject: [PATCH 10/66] test(bd-resilient-kv): add test verifying rotation with many unique keys Add test that documents and verifies the impossibility of buffer overflow during rotation. The test demonstrates that rotation always succeeds because compacted state fits in the same-sized buffer used during normal operation. 
--- .../src/tests/versioned_kv_store_test.rs | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index f040da91..6b6bbd79 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -585,3 +585,46 @@ async fn test_rotation_callback_receives_compressed_path() -> anyhow::Result<()> Ok(()) } + +#[tokio::test] +async fn test_rotation_with_many_unique_keys() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + + // This test verifies that rotation works correctly even with many unique keys + // Note: The scenario where compacted state exceeds buffer size is impossible because: + // - Compacted state only includes current key-value pairs + // - If keys fit in the journal during normal operation, they will fit during rotation + // - Rotation uses the same buffer size, and compaction removes redundant updates + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 2048, Some(0.8))?; + + // Insert many unique keys + for i in 0 .. 50 { + let key = format!("key_{}", i); + let value = Value::String(format!("value_{}", i)); + match store.insert(key, value).await { + Ok(_) => {}, + Err(_) => break, // Buffer full + } + } + + let entries_before = store.len(); + assert!(entries_before > 20, "Should have written many entries"); + + // Rotation should succeed because compacted state fits in same-sized buffer + store.rotate_journal().await?; + + // Verify all data is preserved + assert_eq!(store.len(), entries_before); + + // Verify we can continue writing after rotation + store + .insert( + "new_key".to_string(), + Value::String("new_value".to_string()), + ) + .await?; + assert_eq!(store.len(), entries_before + 1); + + Ok(()) +} From 6eae295257342fc98324faa228aee3bd97fe6dc3 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 09:31:55 -0800 Subject: [PATCH 11/66] Revert "test(bd-resilient-kv): add test verifying rotation with many unique keys" This reverts commit 96394a2539ccb92e2354695574ef1802c70d8553. --- .../src/tests/versioned_kv_store_test.rs | 43 ------------------- 1 file changed, 43 deletions(-) diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index 6b6bbd79..f040da91 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -585,46 +585,3 @@ async fn test_rotation_callback_receives_compressed_path() -> anyhow::Result<()> Ok(()) } - -#[tokio::test] -async fn test_rotation_with_many_unique_keys() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - // This test verifies that rotation works correctly even with many unique keys - // Note: The scenario where compacted state exceeds buffer size is impossible because: - // - Compacted state only includes current key-value pairs - // - If keys fit in the journal during normal operation, they will fit during rotation - // - Rotation uses the same buffer size, and compaction removes redundant updates - - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 2048, Some(0.8))?; - - // Insert many unique keys - for i in 0 .. 
50 { - let key = format!("key_{}", i); - let value = Value::String(format!("value_{}", i)); - match store.insert(key, value).await { - Ok(_) => {}, - Err(_) => break, // Buffer full - } - } - - let entries_before = store.len(); - assert!(entries_before > 20, "Should have written many entries"); - - // Rotation should succeed because compacted state fits in same-sized buffer - store.rotate_journal().await?; - - // Verify all data is preserved - assert_eq!(store.len(), entries_before); - - // Verify we can continue writing after rotation - store - .insert( - "new_key".to_string(), - Value::String("new_value".to_string()), - ) - .await?; - assert_eq!(store.len(), entries_before + 1); - - Ok(()) -} From 30a18b86c90627ccaa658113c86223a937406433 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 09:34:40 -0800 Subject: [PATCH 12/66] docs(bd-resilient-kv): document rotation buffer overflow impossibility Add explicit documentation about failure modes applications must handle versus those that are architecturally impossible. Clarifies that rotation cannot fail due to buffer overflow since compacted state always fits in same-sized buffer. --- bd-resilient-kv/AGENTS.md | 48 +++++++++++++++++++++++++++++ bd-resilient-kv/VERSIONED_FORMAT.md | 18 +++++++++++ 2 files changed, 66 insertions(+) diff --git a/bd-resilient-kv/AGENTS.md b/bd-resilient-kv/AGENTS.md index 120a7bed..45aa3002 100644 --- a/bd-resilient-kv/AGENTS.md +++ b/bd-resilient-kv/AGENTS.md @@ -67,6 +67,11 @@ The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJ - Optional callback invoked with archived path and version - Application controls upload/cleanup of archived journals +**Rotation Guarantees**: +- **Impossible Failure Mode**: Rotation cannot fail due to insufficient buffer space +- **Reasoning**: Rotation creates a new journal with the same buffer size as the original. Since compaction only removes redundant updates (old versions of keys), the compacted state is always ≤ the current journal size. If data fits in the journal during normal operation, it will always fit during rotation. +- **Implication**: Applications do not need to handle "buffer overflow during rotation" errors. This is an architectural guarantee. + **Compression**: - All archived journals are automatically compressed during rotation using async I/O - Active journals remain uncompressed for write performance @@ -228,6 +233,45 @@ fn set_multiple(&mut self, entries: &[(String, Value)]) -> anyhow::Result<()> { - **Retry Logic**: `set_multiple` should retry on failure if high water mark not triggered - **Error Scenarios**: Actual errors should be propagated, not masked +## Failure Modes: What to Handle vs What's Impossible + +### Failure Modes Applications Must Handle + +1. **Buffer Full During Normal Writes** + - **When**: Writing to journal when buffer is at capacity + - **Result**: Write operations return `SerializationError::BufferFull` + - **Action**: Check `is_high_water_mark_triggered()`, trigger compaction/rotation if needed + - **Note**: Even after compaction, if unique data exceeds buffer size, writes will still fail + +2. **High Water Mark Triggered After Compaction (KVStore)** + - **When**: Compacted state still exceeds high water mark threshold + - **Result**: `is_high_water_mark_triggered()` returns true after `switch_journals()` + - **Action**: Indicates buffer size is too small for the unique data volume, not a transient issue + +3. 
**I/O Errors During Persistence** + - **When**: File operations fail (disk full, permissions, etc.) + - **Result**: I/O errors propagated from memory-mapped operations + - **Action**: Handle as standard I/O errors + +4. **Compression/Archive Errors (VersionedKVStore)** + - **When**: Rotation callback receives archived journal path but compression fails + - **Result**: Application-level error in rotation callback + - **Action**: Retry compression, handle cleanup appropriately + +### Impossible Failure Modes (Architectural Guarantees) + +1. **Buffer Overflow During Rotation (VersionedKVStore)** + - **Why Impossible**: Rotation creates new journal with same buffer size. Compaction only removes redundant updates, so compacted state ≤ current journal size. If data fits during normal operation, it always fits during rotation. + - **Implication**: No need to handle "insufficient buffer during rotation" errors + +2. **Buffer Overflow During Compaction (KVStore)** + - **Why Impossible**: Compaction via `reinit_from()` writes to inactive buffer of the same size. Same reasoning as rotation. + - **Implication**: `switch_journals()` may set high water mark flag, but won't fail due to buffer overflow + +3. **Version Number Overflow (VersionedKVStore)** + - **Why Practically Impossible**: Uses u64, would require 58+ million years at 10,000 writes/second + - **Implication**: No overflow handling needed in practice + ## Common Pitfalls ### 1. Assuming Compaction Always Works @@ -242,6 +286,10 @@ fn set_multiple(&mut self, entries: &[(String, Value)]) -> anyhow::Result<()> { **Wrong**: Assuming high water mark flag means operation failed **Right**: High water mark flag indicates resource pressure, operations may still succeed +### 4. Over-Engineering for Impossible Scenarios +**Wrong**: Adding error handling for rotation buffer overflow +**Right**: Trust architectural guarantees, focus on actual failure modes + ## Key Methods and Their Purposes ### `set_multiple(entries: &[(String, Value)])` diff --git a/bd-resilient-kv/VERSIONED_FORMAT.md b/bd-resilient-kv/VERSIONED_FORMAT.md index cdc70438..4926c430 100644 --- a/bd-resilient-kv/VERSIONED_FORMAT.md +++ b/bd-resilient-kv/VERSIONED_FORMAT.md @@ -132,6 +132,24 @@ Archived journals are automatically compressed using zlib (compression level 3) - **Benefits**: Reduced storage space and bandwidth for remote backups - **Performance**: Compression level 3 provides good balance between speed and compression ratio +### Rotation Failure Modes + +**Impossible Failure: Buffer Overflow During Rotation** + +Rotation **cannot fail** due to insufficient buffer space. This is an architectural guarantee: + +- **Why**: Rotation creates a new journal with the same buffer size as the original journal +- **Compaction Property**: The compacted state only includes current key-value pairs (removes redundant/old versions) +- **Mathematical Guarantee**: Compacted state size ≤ current journal size +- **Conclusion**: If data fits in the journal during normal operation, it will always fit during rotation + +**What Can Fail:** +- I/O errors (disk full, permissions, etc.) 
+- Compression errors in the callback phase (application-level) + +**What Cannot Fail:** +- Writing compacted state to new journal buffer (guaranteed to fit) + ## Recovery and Audit ### Current State Recovery From f14c523241e015c0f4f57e00299c08a433f4464e Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 16:33:37 -0800 Subject: [PATCH 13/66] use timestamp as version --- bd-logger/Cargo.toml | 2 +- bd-resilient-kv/Cargo.toml | 2 +- bd-resilient-kv/src/kv_journal/versioned.rs | 78 ++++++- .../src/tests/versioned_kv_store_test.rs | 189 +++++++++++++-- .../src/tests/versioned_recovery_test.rs | 216 ++++++++++++++++++ bd-resilient-kv/src/versioned_kv_store.rs | 18 +- bd-resilient-kv/src/versioned_recovery.rs | 157 ++++++++++++- 7 files changed, 625 insertions(+), 37 deletions(-) diff --git a/bd-logger/Cargo.toml b/bd-logger/Cargo.toml index f65a9371..85dd6345 100644 --- a/bd-logger/Cargo.toml +++ b/bd-logger/Cargo.toml @@ -63,7 +63,7 @@ bd-hyper-network = { path = "../bd-hyper-network" } bd-noop-network = { path = "../bd-noop-network" } bd-test-helpers = { path = "../bd-test-helpers", default-features = false } ctor.workspace = true -flate2.workspace = true +flate2 = { workspace = true, features = ["zlib"] } pretty_assertions.workspace = true tempfile.workspace = true tokio-test.workspace = true diff --git a/bd-resilient-kv/Cargo.toml b/bd-resilient-kv/Cargo.toml index 0aa6644c..c5ff14f5 100644 --- a/bd-resilient-kv/Cargo.toml +++ b/bd-resilient-kv/Cargo.toml @@ -19,6 +19,6 @@ bd-bonjson = { path = "../bd-bonjson" } bd-client-common = { path = "../bd-client-common" } bd-workspace-hack.workspace = true bytes.workspace = true -flate2.workspace = true +flate2 = { workspace = true, features = ["zlib"] } memmap2.workspace = true tokio.workspace = true diff --git a/bd-resilient-kv/src/kv_journal/versioned.rs b/bd-resilient-kv/src/kv_journal/versioned.rs index 6677342f..9d1e9229 100644 --- a/bd-resilient-kv/src/kv_journal/versioned.rs +++ b/bd-resilient-kv/src/kv_journal/versioned.rs @@ -23,11 +23,13 @@ pub struct TimestampedValue { pub timestamp: u64, } -/// Versioned implementation of a key-value journaling system that tracks write versions +/// Versioned implementation of a key-value journaling system that tracks write timestamps /// for point-in-time recovery. /// -/// Each write operation is assigned a monotonically increasing version number, enabling -/// exact state reconstruction at any historical version. +/// Each write operation is assigned a monotonically increasing timestamp (in nanoseconds +/// since UNIX epoch), enabling exact state reconstruction at any historical timestamp. +/// The monotonicity is enforced by clamping: if the system clock goes backwards, we reuse +/// the same timestamp value to maintain ordering guarantees without artificial clock skew. 
#[derive(Debug)] pub struct VersionedKVJournal<'a> { #[allow(dead_code)] @@ -39,6 +41,7 @@ pub struct VersionedKVJournal<'a> { initialized_at_unix_time_ns: u64, current_version: u64, base_version: u64, // First version in this journal + last_timestamp: u64, // Most recent timestamp written (for monotonic enforcement) } // Versioned KV files have the following structure: @@ -53,6 +56,14 @@ pub struct VersionedKVJournal<'a> { // // Metadata object: {"initialized": , "format_version": 2, "base_version": } // Journal entries: {"v": , "t": , "k": "", "o": } +// +// # Timestamp Semantics +// +// Timestamps serve as logical clocks with monotonic guarantees rather than pure wall time: +// - Each write gets a timestamp that is guaranteed to be >= previous writes +// - If system clock goes backward, timestamps are clamped to last_timestamp (reuse same value) +// - This ensures total ordering while allowing correlation with external timestamped systems +// - Version numbers (v) are maintained for backward compatibility and as secondary ordering const VERSION: u64 = 2; // The versioned format version const INVALID_VERSION: u64 = 0; // 0 will never be a valid version @@ -251,6 +262,7 @@ impl<'a> VersionedKVJournal<'a> { initialized_at_unix_time_ns: timestamp, current_version: base_version, base_version, + last_timestamp: timestamp, }) } @@ -277,6 +289,10 @@ impl<'a> VersionedKVJournal<'a> { let highest_version = Self::find_highest_version(buffer)?; let current_version = highest_version.unwrap_or(base_version); + // Find the highest timestamp in the journal + let highest_timestamp = Self::find_highest_timestamp(buffer)?; + let last_timestamp = highest_timestamp.unwrap_or(init_timestamp); + Ok(Self { format_version, position, @@ -286,6 +302,7 @@ impl<'a> VersionedKVJournal<'a> { initialized_at_unix_time_ns: init_timestamp, current_version, base_version, + last_timestamp, }) } @@ -309,6 +326,26 @@ impl<'a> VersionedKVJournal<'a> { Ok(None) } + /// Find the highest timestamp in the journal. + /// + /// Since timestamps are monotonically increasing, this simply returns the timestamp + /// from the last entry in the journal. + fn find_highest_timestamp(buffer: &[u8]) -> anyhow::Result> { + let array = read_bonjson_payload(buffer)?; + + if let Value::Array(entries) = array { + // Skip metadata (index 0) and get the last actual entry + // Since timestamps are monotonically increasing, the last entry has the highest timestamp + if entries.len() > 1 + && let Some(Value::Object(obj)) = entries.last() + { + return Ok(read_u64_field(obj, "t")); + } + } + + Ok(None) + } + /// Get the current version number. #[must_use] pub fn current_version(&self) -> u64 { @@ -321,6 +358,18 @@ impl<'a> VersionedKVJournal<'a> { self.base_version } + /// Get the next monotonically increasing timestamp. + /// + /// This ensures that even if the system clock goes backwards, timestamps remain + /// monotonically increasing by clamping to `last_timestamp` (reusing the same value). + /// This prevents artificial clock skew while maintaining ordering guarantees. 
+ fn next_monotonic_timestamp(&mut self) -> anyhow::Result { + let current = current_timestamp()?; + let monotonic = std::cmp::max(current, self.last_timestamp); + self.last_timestamp = monotonic; + Ok(monotonic) + } + fn set_position(&mut self, position: usize) { self.position = position; write_position(self.buffer, position); @@ -344,12 +393,13 @@ impl<'a> VersionedKVJournal<'a> { key: &str, value: &Value, ) -> anyhow::Result { + // Get monotonically increasing timestamp before borrowing buffer + let timestamp = self.next_monotonic_timestamp()?; + let buffer_len = self.buffer.len(); let mut cursor = &mut self.buffer[self.position ..]; // Create entry object: {"v": version, "t": timestamp, "k": key, "o": value} - let timestamp = current_timestamp()?; - // TODO(snowp): It would be nice to be able to pass impl AsRef for key or Cow to avoid // allocating small strings repeatedly. let entry = AHashMap::from([ @@ -369,6 +419,9 @@ impl<'a> VersionedKVJournal<'a> { /// Set a key-value pair with automatic version increment. /// Returns a tuple of (version, timestamp). + /// + /// The timestamp is monotonically increasing and serves as the primary ordering mechanism. + /// If the system clock goes backwards, timestamps are clamped to maintain monotonicity. pub fn set_versioned(&mut self, key: &str, value: &Value) -> anyhow::Result<(u64, u64)> { self.current_version += 1; let version = self.current_version; @@ -378,6 +431,9 @@ impl<'a> VersionedKVJournal<'a> { /// Delete a key with automatic version increment. /// Returns a tuple of (version, timestamp). + /// + /// The timestamp is monotonically increasing and serves as the primary ordering mechanism. + /// If the system clock goes backwards, timestamps are clamped to maintain monotonicity. pub fn delete_versioned(&mut self, key: &str) -> anyhow::Result<(u64, u64)> { self.current_version += 1; let version = self.current_version; @@ -496,6 +552,8 @@ impl<'a> VersionedKVJournal<'a> { /// /// The new journal will have all current key-value pairs written as versioned entries /// at the `snapshot_version`, using their original timestamps to preserve historical accuracy. + /// The journal's monotonic timestamp enforcement will respect the highest timestamp in the + /// provided state. /// /// # Arguments /// * `buffer` - The buffer to write the new journal to @@ -514,12 +572,19 @@ impl<'a> VersionedKVJournal<'a> { // Create a new journal with the snapshot version as the base let mut journal = Self::new(buffer, snapshot_version, high_water_mark_ratio)?; + // Find the maximum timestamp in the state to maintain monotonicity + let max_state_timestamp = state.values().map(|tv| tv.timestamp).max().unwrap_or(0); + // Write all current state as versioned entries at the snapshot version // Use the original timestamp from each entry to preserve historical accuracy for (key, timestamped_value) in state { let buffer_len = journal.buffer.len(); let mut cursor = &mut journal.buffer[journal.position ..]; + // Update last_timestamp to ensure monotonicity is maintained + // We use the actual timestamp from the entry, but track the maximum for future writes + journal.last_timestamp = std::cmp::max(journal.last_timestamp, timestamped_value.timestamp); + // Create entry object: {"v": version, "t": timestamp, "k": key, "o": value} // TODO(snowp): It would be nice to be able to pass impl AsRef for key or Cow to avoid // allocating small strings repeatedly. 
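As an aside on the TODO above: one illustrative shape for an allocation-friendlier entry builder (a sketch only, not part of this patch; it assumes the `AHashMap`/`Value` types already used in this file) would accept the key as a `Cow<'_, str>` and defer the allocation to the point where the entry object is built:

```rust
use std::borrow::Cow;

use ahash::AHashMap;
use bd_bonjson::Value;

// Hypothetical helper suggested by the TODO; borrowed keys allocate only once, here.
fn make_versioned_entry(version: u64, timestamp: u64, key: Cow<'_, str>, value: &Value) -> Value {
  Value::Object(AHashMap::from([
    ("v".to_string(), Value::Unsigned(version)),
    ("t".to_string(), Value::Unsigned(timestamp)),
    ("k".to_string(), Value::String(key.into_owned())),
    ("o".to_string(), value.clone()),
  ]))
}
```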
@@ -540,6 +605,9 @@ impl<'a> VersionedKVJournal<'a> { journal.set_position(buffer_len - remaining); } + // Ensure last_timestamp reflects the maximum timestamp we've written + journal.last_timestamp = std::cmp::max(journal.last_timestamp, max_state_timestamp); + Ok(journal) } } diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index f040da91..1199fed4 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -28,44 +28,67 @@ fn test_versioned_store_new() -> anyhow::Result<()> { } #[tokio::test] -async fn test_versioned_store_basic_operations() -> anyhow::Result<()> { +async fn test_timestamp_collision_on_clamping() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - // Test insert with version tracking - let v1 = store + // Insert first value - this establishes a timestamp + store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; - assert_eq!(v1, 2); // First write is version 2 (base is 1) - - let retrieved = store.get("key1"); - assert_eq!(retrieved, Some(&Value::String("value1".to_string()))); + let ts1 = store + .get_with_timestamp("key1") + .map(|tv| tv.timestamp) + .unwrap(); - // Test overwrite - let v2 = store - .insert("key1".to_string(), Value::String("value2".to_string())) + // Perform rapid successive writes - these might share timestamps if system clock hasn't advanced + store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; + store + .insert("key3".to_string(), Value::String("value3".to_string())) + .await?; + store + .insert("key4".to_string(), Value::String("value4".to_string())) .await?; - assert_eq!(v2, 3); // Second write is version 3 - assert!(v2 > v1); - - let retrieved = store.get("key1"); - assert_eq!(retrieved, Some(&Value::String("value2".to_string()))); - - // Test contains_key - assert!(store.contains_key("key1")); - assert!(!store.contains_key("nonexistent")); - // Test len and is_empty - assert_eq!(store.len(), 1); - assert!(!store.is_empty()); + let ts2 = store + .get_with_timestamp("key2") + .map(|tv| tv.timestamp) + .unwrap(); + let ts3 = store + .get_with_timestamp("key3") + .map(|tv| tv.timestamp) + .unwrap(); + let ts4 = store + .get_with_timestamp("key4") + .map(|tv| tv.timestamp) + .unwrap(); - // Current version should track latest write - assert_eq!(store.current_version(), v2); + // Verify monotonicity: timestamps should never decrease + assert!(ts2 >= ts1, "Timestamps should be monotonically non-decreasing"); + assert!(ts3 >= ts2, "Timestamps should be monotonically non-decreasing"); + assert!(ts4 >= ts3, "Timestamps should be monotonically non-decreasing"); + + // Document that timestamps CAN be equal (this is the key difference from the old +1 behavior) + // When system clock doesn't advance or goes backwards, we reuse the same timestamp + // This is acceptable because version numbers provide total ordering + + // Count unique timestamps - with rapid operations, we might have collisions + let timestamps = [ts1, ts2, ts3, ts4]; + let unique_count = timestamps.iter().collect::>().len(); + + // We should have at least 1 unique timestamp (all could be the same in extreme cases) + assert!( + unique_count >= 1 && unique_count <= 4, + "Should have 1-4 unique timestamps, got {}", + unique_count + ); Ok(()) } + #[tokio::test] async fn test_versioned_store_remove() -> 
anyhow::Result<()> { let temp_dir = TempDir::new()?; @@ -412,6 +435,122 @@ async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { Ok(()) } +#[tokio::test] +async fn test_timestamp_monotonicity() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + + // Track timestamps across multiple writes + let mut timestamps = Vec::new(); + + // Perform multiple writes and collect their timestamps + for i in 0 .. 20 { + store + .insert(format!("key{}", i), Value::Signed(i as i64)) + .await?; + + let ts = store + .get_with_timestamp(&format!("key{}", i)) + .map(|tv| tv.timestamp) + .unwrap(); + + timestamps.push(ts); + } + + // Verify all timestamps are monotonically increasing + for i in 1 .. timestamps.len() { + assert!( + timestamps[i] >= timestamps[i - 1], + "Timestamp at index {} ({}) should be >= timestamp at index {} ({})", + i, + timestamps[i], + i - 1, + timestamps[i - 1] + ); + } + + // Verify that timestamps are actually different (at least some of them) + // This ensures we're not just assigning the same timestamp to everything + let unique_timestamps: std::collections::HashSet<_> = timestamps.iter().collect(); + assert!( + unique_timestamps.len() > 1, + "Expected multiple unique timestamps, got only {}", + unique_timestamps.len() + ); + + Ok(()) +} + +#[tokio::test] +async fn test_timestamp_monotonicity_across_rotation() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + + // Write before rotation + store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + let ts1 = store + .get_with_timestamp("key1") + .map(|tv| tv.timestamp) + .unwrap(); + + std::thread::sleep(std::time::Duration::from_millis(10)); + + store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; + let ts2 = store + .get_with_timestamp("key2") + .map(|tv| tv.timestamp) + .unwrap(); + + // Rotate journal + store.rotate_journal().await?; + + std::thread::sleep(std::time::Duration::from_millis(10)); + + // Write after rotation + store + .insert("key3".to_string(), Value::String("value3".to_string())) + .await?; + let ts3 = store + .get_with_timestamp("key3") + .map(|tv| tv.timestamp) + .unwrap(); + + std::thread::sleep(std::time::Duration::from_millis(10)); + + store + .insert("key4".to_string(), Value::String("value4".to_string())) + .await?; + let ts4 = store + .get_with_timestamp("key4") + .map(|tv| tv.timestamp) + .unwrap(); + + // Verify monotonicity across rotation boundary + assert!(ts2 >= ts1, "ts2 should be >= ts1"); + assert!(ts3 >= ts2, "ts3 should be >= ts2 (across rotation)"); + assert!(ts4 >= ts3, "ts4 should be >= ts3"); + + // Verify ordering + let timestamps = [ts1, ts2, ts3, ts4]; + for i in 1 .. 
timestamps.len() { + assert!( + timestamps[i] >= timestamps[i - 1], + "Timestamp monotonicity violated at index {}: {} < {}", + i, + timestamps[i], + timestamps[i - 1] + ); + } + + Ok(()) +} + #[tokio::test] async fn test_compression_during_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index 9ebfd2f3..d3baabf8 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -736,6 +736,222 @@ async fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> Ok(()) } +#[tokio::test] +async fn test_recovery_at_timestamp() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + + // Create a store and write some timestamped data + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + + store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + let ts1 = store + .get_with_timestamp("key1") + .map(|tv| tv.timestamp) + .unwrap(); + + // Small sleep to ensure different timestamps + std::thread::sleep(std::time::Duration::from_millis(10)); + + store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; + let ts2 = store + .get_with_timestamp("key2") + .map(|tv| tv.timestamp) + .unwrap(); + + // Small sleep to ensure different timestamps + std::thread::sleep(std::time::Duration::from_millis(10)); + + store + .insert("key1".to_string(), Value::String("updated1".to_string())) + .await?; + let ts3 = store + .get_with_timestamp("key1") + .map(|tv| tv.timestamp) + .unwrap(); + + store.sync()?; + + // Read the journal data + let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + + // Create recovery utility + let recovery = VersionedRecovery::new(vec![&journal_data])?; + + // Verify timestamp range + let timestamp_range = recovery.timestamp_range(); + assert!(timestamp_range.is_some()); + let (min, max) = timestamp_range.unwrap(); + assert!(min <= ts1); + assert!(max >= ts3); + + // Recover at ts1: should have only key1=value1 + let state_ts1 = recovery.recover_at_timestamp(ts1)?; + assert_eq!(state_ts1.len(), 1); + assert_eq!( + state_ts1.get("key1").map(|tv| &tv.value), + Some(&Value::String("value1".to_string())) + ); + + // Recover at ts2: should have key1=value1, key2=value2 + let state_ts2 = recovery.recover_at_timestamp(ts2)?; + assert_eq!(state_ts2.len(), 2); + assert_eq!( + state_ts2.get("key1").map(|tv| &tv.value), + Some(&Value::String("value1".to_string())) + ); + assert_eq!( + state_ts2.get("key2").map(|tv| &tv.value), + Some(&Value::String("value2".to_string())) + ); + + // Recover at ts3: should have key1=updated1, key2=value2 + let state_ts3 = recovery.recover_at_timestamp(ts3)?; + assert_eq!(state_ts3.len(), 2); + assert_eq!( + state_ts3.get("key1").map(|tv| &tv.value), + Some(&Value::String("updated1".to_string())) + ); + assert_eq!( + state_ts3.get("key2").map(|tv| &tv.value), + Some(&Value::String("value2".to_string())) + ); + + Ok(()) +} + +#[tokio::test] +async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + + // Write some data before rotation + store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + let ts1 = store + .get_with_timestamp("key1") + .map(|tv| tv.timestamp) + .unwrap(); + + 
std::thread::sleep(std::time::Duration::from_millis(10)); + + store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; + let ts2 = store + .get_with_timestamp("key2") + .map(|tv| tv.timestamp) + .unwrap(); + + // Rotate journal + let archive_version = store.current_version(); + store.rotate_journal().await?; + + std::thread::sleep(std::time::Duration::from_millis(10)); + + // Write data after rotation + store + .insert("key3".to_string(), Value::String("value3".to_string())) + .await?; + let ts3 = store + .get_with_timestamp("key3") + .map(|tv| tv.timestamp) + .unwrap(); + + store.sync()?; + + // Read both journals + let archived_path = temp_dir + .path() + .join(format!("test.jrn.v{}.zz", archive_version)); + let archived_data = std::fs::read(&archived_path)?; + let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + + // Create recovery from both journals + let recovery = VersionedRecovery::new(vec![&archived_data, &active_data])?; + + // Verify timestamp range spans both journals + let timestamp_range = recovery.timestamp_range(); + assert!(timestamp_range.is_some()); + let (min, max) = timestamp_range.unwrap(); + assert!(min <= ts1); + assert!(max >= ts3); + + // Recover at ts1 (should be in archived journal) + let state_ts1 = recovery.recover_at_timestamp(ts1)?; + assert_eq!(state_ts1.len(), 1); + assert!(state_ts1.contains_key("key1")); + + // Recover at ts2 (should be in archived journal) + let state_ts2 = recovery.recover_at_timestamp(ts2)?; + assert_eq!(state_ts2.len(), 2); + assert!(state_ts2.contains_key("key1")); + assert!(state_ts2.contains_key("key2")); + + // Recover at ts3 (should include all data) + let state_ts3 = recovery.recover_at_timestamp(ts3)?; + assert_eq!(state_ts3.len(), 3); + assert!(state_ts3.contains_key("key1")); + assert!(state_ts3.contains_key("key2")); + assert!(state_ts3.contains_key("key3")); + + Ok(()) +} + +#[tokio::test] +async fn test_timestamp_range() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + + store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + let ts1 = store + .get_with_timestamp("key1") + .map(|tv| tv.timestamp) + .unwrap(); + + std::thread::sleep(std::time::Duration::from_millis(10)); + + store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; + + std::thread::sleep(std::time::Duration::from_millis(10)); + + store + .insert("key3".to_string(), Value::String("value3".to_string())) + .await?; + let ts3 = store + .get_with_timestamp("key3") + .map(|tv| tv.timestamp) + .unwrap(); + + store.sync()?; + + let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + let recovery = VersionedRecovery::new(vec![&journal_data])?; + + let timestamp_range = recovery.timestamp_range(); + assert!(timestamp_range.is_some()); + let (min, max) = timestamp_range.unwrap(); + + // Min should be <= first timestamp, max should be >= last timestamp + assert!(min <= ts1); + assert!(max >= ts3); + + // Timestamps should be ordered + assert!(ts3 > ts1); + + Ok(()) +} + #[tokio::test] async fn test_recovery_decompression_transparent() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index 2c5bfd35..bd00219b 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -46,14 +46,26 @@ async fn 
compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result Ok(()) } -/// A persistent key-value store with version tracking. +/// A persistent key-value store with version and timestamp tracking. /// /// `VersionedKVStore` provides HashMap-like semantics backed by a versioned journal that -/// assigns a monotonically increasing version number to each write operation. This enables: -/// - Audit logs with version tracking for every write +/// assigns both a version number and a monotonically increasing timestamp to each write +/// operation. This enables: +/// - Audit logs with timestamp tracking for every write (timestamps serve as logical clocks) +/// - Point-in-time recovery at any historical timestamp +/// - Correlation with external timestamped event streams /// - Automatic journal rotation when high water mark is reached /// - Optional callbacks for post-rotation operations (e.g., remote backup) /// +/// # Timestamp Semantics +/// +/// Timestamps are monotonically increasing logical clocks (nanoseconds since UNIX epoch): +/// - Each write gets a timestamp >= all previous writes +/// - If system clock goes backward, timestamps are clamped to maintain ordering +/// - Multiple operations may share the same timestamp if system clock hasn't advanced +/// - Enables natural correlation with timestamped event buffers for upload +/// - Version numbers provide secondary ordering and backward compatibility +/// /// For performance optimization, `VersionedKVStore` maintains an in-memory cache of the /// current key-value data to provide O(1) read operations and avoid expensive journal /// decoding on every access. diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_recovery.rs index e8864b1c..3ae8a48b 100644 --- a/bd-resilient-kv/src/versioned_recovery.rs +++ b/bd-resilient-kv/src/versioned_recovery.rs @@ -33,10 +33,20 @@ fn read_u64_field(obj: &AHashMap, key: &str) -> Option { } } -/// A utility for recovering state at arbitrary versions from raw journal data. +/// A utility for recovering state at arbitrary versions or timestamps from raw journal data. /// /// This utility operates on raw byte slices from versioned journals and can reconstruct -/// the key-value state at any historical version by replaying journal entries. +/// the key-value state at any historical version or timestamp by replaying journal entries. +/// +/// # Timestamp-Based Recovery +/// +/// The primary use case is timestamp-based recovery, which enables correlation with +/// external timestamped event streams: +/// - `recover_at_timestamp(ts)` - Recover state at a specific timestamp +/// - Timestamps are monotonically increasing logical clocks (not pure wall time) +/// - Enables uploading KV snapshots that match specific event buffer timestamps +/// +/// Version-based recovery is also supported for backward compatibility. /// /// Supports both compressed (zlib) and uncompressed journals. Compressed journals are /// automatically detected and decompressed transparently. 
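Below is a minimal, illustrative sketch of the timestamp-based recovery flow described in the doc comment above. The `VersionedRecovery::new`, `timestamp_range`, and `recover_at_timestamp` calls mirror the API added in this patch (and its use in the tests); the journal file names and the wrapping helper function are hypothetical.

```rust
use bd_resilient_kv::VersionedRecovery;

// Hypothetical helper showing the intended call sequence; paths are examples only.
fn recover_example() -> anyhow::Result<()> {
  // Pass journals oldest to newest; compressed `.zz` archives are decompressed transparently.
  let archived = std::fs::read("store.jrn.v30000.zz")?;
  let active = std::fs::read("store.jrn")?;
  let recovery = VersionedRecovery::new(vec![&archived, &active])?;

  // Inspect the recoverable range, then rebuild state at a chosen timestamp.
  if let Some((min_ts, max_ts)) = recovery.timestamp_range() {
    println!("can recover timestamps {min_ts}..={max_ts}");
    let state = recovery.recover_at_timestamp(max_ts)?;
    for (key, tv) in &state {
      println!("{key} = {:?} (written at {} ns)", tv.value, tv.timestamp);
    }
  }
  Ok(())
}
```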
@@ -66,6 +76,8 @@ struct JournalInfo { data: Vec, base_version: u64, max_version: u64, + min_timestamp: u64, + max_timestamp: u64, } impl VersionedRecovery { @@ -85,10 +97,13 @@ impl VersionedRecovery { // Detect and decompress if needed let decompressed = decompress_if_needed(data)?; let (base_version, max_version) = extract_version_range(&decompressed)?; + let (min_timestamp, max_timestamp) = extract_timestamp_range(&decompressed)?; journal_infos.push(JournalInfo { data: decompressed, base_version, max_version, + min_timestamp, + max_timestamp, }); } @@ -178,6 +193,65 @@ impl VersionedRecovery { Some((min, max)) } + /// Get the range of timestamps available in the recovery utility. + /// + /// Returns (`min_timestamp`, `max_timestamp`) tuple representing the earliest and latest + /// timestamps that can be recovered. + #[must_use] + pub fn timestamp_range(&self) -> Option<(u64, u64)> { + if self.journals.is_empty() { + return None; + } + + let min = self.journals.first().map(|j| j.min_timestamp)?; + let max = self.journals.last().map(|j| j.max_timestamp)?; + Some((min, max)) + } + + /// Recover the key-value state at a specific timestamp. + /// + /// This method replays all journal entries from all provided journals up to and including + /// the target timestamp, reconstructing the exact state at that point in time. + /// + /// # Arguments + /// + /// * `target_timestamp` - The timestamp (in nanoseconds since UNIX epoch) to recover state at + /// + /// # Returns + /// + /// A hashmap containing all key-value pairs with their timestamps as they existed at the + /// target timestamp. + /// + /// # Errors + /// + /// Returns an error if: + /// - The target timestamp is not found in any journal + /// - Journal data is corrupted or invalid + pub fn recover_at_timestamp( + &self, + target_timestamp: u64, + ) -> anyhow::Result> { + let mut map = AHashMap::new(); + + // Find all journals that might contain entries up to target timestamp + for journal in &self.journals { + // Skip journals that start after our target + if journal.min_timestamp > target_timestamp { + break; + } + + // Replay entries from this journal + replay_journal_to_timestamp(&journal.data, target_timestamp, &mut map)?; + + // If this journal contains the target timestamp, we're done + if journal.max_timestamp >= target_timestamp { + break; + } + } + + Ok(map) + } + /// Get the current state (at the latest version). /// /// # Errors @@ -271,6 +345,36 @@ fn extract_version_range(buffer: &[u8]) -> anyhow::Result<(u64, u64)> { Ok((base_version, max_version)) } +/// Extract the minimum and maximum timestamps from a journal. +fn extract_timestamp_range(buffer: &[u8]) -> anyhow::Result<(u64, u64)> { + let array = read_bonjson_payload(buffer)?; + + let mut min_timestamp = u64::MAX; + let mut max_timestamp = 0; + + if let Value::Array(entries) = array { + for (index, entry) in entries.iter().enumerate() { + if index == 0 { + continue; // Skip metadata + } + + if let Value::Object(obj) = entry + && let Some(t) = read_u64_field(obj, "t") + { + min_timestamp = min_timestamp.min(t); + max_timestamp = max_timestamp.max(t); + } + } + } + + // If no entries found, default to (0, 0) + if min_timestamp == u64::MAX { + min_timestamp = 0; + } + + Ok((min_timestamp, max_timestamp)) +} + /// Replay journal entries up to a target version. fn replay_journal_to_version( buffer: &[u8], @@ -323,6 +427,55 @@ fn replay_journal_to_version( Ok(()) } +/// Replay journal entries up to a target timestamp. 
+fn replay_journal_to_timestamp( + buffer: &[u8], + target_timestamp: u64, + map: &mut AHashMap, +) -> anyhow::Result<()> { + let array = read_bonjson_payload(buffer)?; + + if let Value::Array(entries) = array { + for (index, entry) in entries.iter().enumerate() { + // Skip metadata (first entry) + if index == 0 { + continue; + } + + if let Value::Object(obj) = entry { + // Extract timestamp (skip entries without timestamp) + let Some(entry_timestamp) = read_u64_field(obj, "t") else { + continue; + }; + + // Only apply entries up to target timestamp + if entry_timestamp > target_timestamp { + break; + } + + // Extract key and operation + if let Some(Value::String(key)) = obj.get("k") + && let Some(operation) = obj.get("o") + { + if operation.is_null() { + map.remove(key); + } else { + map.insert( + key.clone(), + TimestampedValue { + value: operation.clone(), + timestamp: entry_timestamp, + }, + ); + } + } + } + } + } + + Ok(()) +} + /// Read the bonjson payload from a journal buffer. fn read_bonjson_payload(buffer: &[u8]) -> anyhow::Result { const HEADER_SIZE: usize = 16; From 70a112b36e69ffe74a29fd78da1ec8708b5c2901 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 16:40:42 -0800 Subject: [PATCH 14/66] use flate2 to avoid pulling in minz_oxide --- Cargo.lock | 1 - bd-resilient-kv/Cargo.toml | 17 +++++----- bd-resilient-kv/src/versioned_kv_store.rs | 40 +++++++++++------------ 3 files changed, 27 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d4414c47..5acb1e4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1104,7 +1104,6 @@ version = "1.0.0" dependencies = [ "ahash", "anyhow", - "async-compression", "bd-bonjson", "bd-client-common", "bd-workspace-hack", diff --git a/bd-resilient-kv/Cargo.toml b/bd-resilient-kv/Cargo.toml index c5ff14f5..d9c1f60a 100644 --- a/bd-resilient-kv/Cargo.toml +++ b/bd-resilient-kv/Cargo.toml @@ -12,13 +12,12 @@ doctest = false tempfile.workspace = true [dependencies] -ahash.workspace = true -anyhow.workspace = true -async-compression.workspace = true -bd-bonjson = { path = "../bd-bonjson" } -bd-client-common = { path = "../bd-client-common" } +ahash.workspace = true +anyhow.workspace = true +bd-bonjson = { path = "../bd-bonjson" } +bd-client-common = { path = "../bd-client-common" } bd-workspace-hack.workspace = true -bytes.workspace = true -flate2 = { workspace = true, features = ["zlib"] } -memmap2.workspace = true -tokio.workspace = true +bytes.workspace = true +flate2 = { workspace = true, features = ["zlib"] } +memmap2.workspace = true +tokio.workspace = true diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index bd00219b..5b82538e 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -7,10 +7,8 @@ use crate::kv_journal::{MemMappedVersionedKVJournal, TimestampedValue, VersionedKVJournal}; use ahash::AHashMap; -use async_compression::tokio::write::ZlibEncoder; use bd_bonjson::Value; use std::path::{Path, PathBuf}; -use tokio::io::AsyncWriteExt; /// Callback invoked when journal rotation occurs. /// @@ -23,27 +21,27 @@ use tokio::io::AsyncWriteExt; /// storage, perform cleanup, or other post-rotation operations. pub type RotationCallback = Box; -/// Compress an archived journal using zlib with streaming I/O. +/// Compress an archived journal using zlib. 
/// -/// This function uses async I/O to stream data directly from the source file -/// through a zlib encoder to the destination file, without loading the entire -/// journal into memory. +/// This function compresses the source file to the destination using zlib compression. +/// The compression is performed in a blocking task to avoid holding up the async runtime. async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result<()> { - // Open source and destination files - let source_file = tokio::fs::File::open(source).await?; - let dest_file = tokio::fs::File::create(dest).await?; - - // Create zlib encoder that writes directly to the destination file - let mut encoder = ZlibEncoder::new(dest_file); - - // Copy data from source through encoder to destination - let mut source_reader = tokio::io::BufReader::new(source_file); - tokio::io::copy(&mut source_reader, &mut encoder).await?; - - // Flush and finalize compression - encoder.shutdown().await?; - - Ok(()) + let source = source.to_owned(); + let dest = dest.to_owned(); + + tokio::task::spawn_blocking(move || { + use flate2::Compression; + use flate2::write::ZlibEncoder; + use std::io::{BufReader, copy}; + + let source_file = std::fs::File::open(&source)?; + let dest_file = std::fs::File::create(&dest)?; + let mut encoder = ZlibEncoder::new(dest_file, Compression::default()); + copy(&mut BufReader::new(source_file), &mut encoder)?; + encoder.finish()?; + Ok::<_, anyhow::Error>(()) + }) + .await? } /// A persistent key-value store with version and timestamp tracking. From aa198999f04d82bf06d85f7a45cb32144324a687 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 16:50:14 -0800 Subject: [PATCH 15/66] update some docs --- bd-resilient-kv/src/kv_journal/versioned.rs | 4 +- .../src/tests/versioned_recovery_test.rs | 132 +++++++++++++++++- bd-resilient-kv/src/versioned_recovery.rs | 44 +++--- 3 files changed, 156 insertions(+), 24 deletions(-) diff --git a/bd-resilient-kv/src/kv_journal/versioned.rs b/bd-resilient-kv/src/kv_journal/versioned.rs index 9d1e9229..ab620f7b 100644 --- a/bd-resilient-kv/src/kv_journal/versioned.rs +++ b/bd-resilient-kv/src/kv_journal/versioned.rs @@ -26,7 +26,7 @@ pub struct TimestampedValue { /// Versioned implementation of a key-value journaling system that tracks write timestamps /// for point-in-time recovery. /// -/// Each write operation is assigned a monotonically increasing timestamp (in nanoseconds +/// Each write operation is assigned a monotonically non-decreasing timestamp (in nanoseconds /// since UNIX epoch), enabling exact state reconstruction at any historical timestamp. /// The monotonicity is enforced by clamping: if the system clock goes backwards, we reuse /// the same timestamp value to maintain ordering guarantees without artificial clock skew. 
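A minimal sketch of the clamping rule described above, assuming only what the doc comment states: the journal remembers the last written timestamp (its `last_timestamp` field, consumed by `next_monotonic_timestamp()`) and never hands out a smaller value. The free function and sample readings below are illustrative, not the crate's actual implementation.

```rust
// Illustrative clamp: reuse the previous timestamp when the wall clock moves backwards,
// so returned timestamps are monotonically non-decreasing.
fn clamp_timestamp(last_timestamp: &mut u64, wall_clock_ns: u64) -> u64 {
  let ts = wall_clock_ns.max(*last_timestamp);
  *last_timestamp = ts;
  ts
}

fn main() {
  let mut last = 0;
  assert_eq!(clamp_timestamp(&mut last, 100), 100);
  assert_eq!(clamp_timestamp(&mut last, 90), 100); // clock stepped back: value reused
  assert_eq!(clamp_timestamp(&mut last, 110), 110);
}
```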
@@ -60,7 +60,7 @@ pub struct VersionedKVJournal<'a> { // # Timestamp Semantics // // Timestamps serve as logical clocks with monotonic guarantees rather than pure wall time: -// - Each write gets a timestamp that is guaranteed to be >= previous writes +// - Each write gets a timestamp that is guaranteed to be >= previous writes (non-decreasing) // - If system clock goes backward, timestamps are clamped to last_timestamp (reuse same value) // - This ensures total ordering while allowing correlation with external timestamped systems // - Version numbers (v) are maintained for backward compatibility and as secondary ordering diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index d3baabf8..2a812eb4 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -82,6 +82,137 @@ async fn test_recovery_single_journal() -> anyhow::Result<()> { Ok(()) } +#[tokio::test] +async fn test_timestamp_collision_across_rotation() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + + // Create a store and write data before rotation + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + + store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + let ts1 = store + .get_with_timestamp("key1") + .map(|tv| tv.timestamp) + .unwrap(); + + // Rotate journal - this captures current state at rotation timestamp + let archive_version = store.current_version(); + store.rotate_journal().await?; + + // Now simulate a scenario where the system clock goes backwards + // by manually manipulating the journal's last_timestamp + // In a real scenario, this could happen if the system clock is adjusted + // We'll write entries that would have the same timestamp as ts1 if clamping occurs + + // Write new data - in practice, if clock went backwards, these could get clamped + // to the same timestamp as entries in the previous journal + store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; + let ts2 = store + .get_with_timestamp("key2") + .map(|tv| tv.timestamp) + .unwrap(); + + store + .insert("key3".to_string(), Value::String("value3".to_string())) + .await?; + let ts3 = store + .get_with_timestamp("key3") + .map(|tv| tv.timestamp) + .unwrap(); + + store.sync()?; + + // Read both journals + let archived_path = temp_dir + .path() + .join(format!("test.jrn.v{}.zz", archive_version)); + let archived_data = std::fs::read(&archived_path)?; + let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + + // Create recovery from both journals + let recovery = VersionedRecovery::new(vec![&archived_data, &active_data])?; + + // Test recovery behavior when timestamps might collide across journals + // + // Key insight: The recovery should include ALL entries at a given timestamp, + // applying them in version order (which is chronological order). 
+ // + // When recovering at ts1: + // - All entries from archived journal with timestamp <= ts1 are included + // - All entries from active journal with timestamp <= ts1 are included + // - If ts2 or ts3 were clamped to ts1 (due to clock going backwards), + // they would also be included + + // Recover at ts1: should include all entries with timestamp <= ts1 + let state_ts1 = recovery.recover_at_timestamp(ts1)?; + + // In normal operation (no clock backwards), only key1 should be at ts1 + assert!(state_ts1.contains_key("key1")); + assert_eq!( + state_ts1.get("key1").map(|tv| &tv.value), + Some(&Value::String("value1".to_string())) + ); + + // Verify timestamp monotonicity is maintained across rotation + assert!( + ts2 >= ts1, + "Timestamps should be monotonically non-decreasing across rotation" + ); + assert!( + ts3 >= ts2, + "Timestamps should be monotonically non-decreasing" + ); + + // Test recovery at later timestamps + if ts2 > ts1 { + let state_ts2 = recovery.recover_at_timestamp(ts2)?; + // Should include key1 (from archive) and key2 (from active) + assert_eq!(state_ts2.len(), 2); + assert!(state_ts2.contains_key("key1")); + assert!(state_ts2.contains_key("key2")); + } + + if ts3 > ts2 { + let state_ts3 = recovery.recover_at_timestamp(ts3)?; + // Should include all keys + assert_eq!(state_ts3.len(), 3); + assert!(state_ts3.contains_key("key1")); + assert!(state_ts3.contains_key("key2")); + assert!(state_ts3.contains_key("key3")); + } + + // Edge case: If timestamps were the same (due to clamping), verify "last write wins" + // This is important because recovery processes entries in order, so later versions + // should overwrite earlier ones with the same timestamp + if ts2 == ts1 && ts3 == ts1 { + // All entries have the same timestamp + let state_at_shared_ts = recovery.recover_at_timestamp(ts1)?; + + // All entries should be included since they all have timestamp == ts1 + assert_eq!(state_at_shared_ts.len(), 3); + + // Verify values are from the latest versions (last write wins) + assert_eq!( + state_at_shared_ts.get("key1").map(|tv| &tv.value), + Some(&Value::String("value1".to_string())) + ); + assert_eq!( + state_at_shared_ts.get("key2").map(|tv| &tv.value), + Some(&Value::String("value2".to_string())) + ); + assert_eq!( + state_at_shared_ts.get("key3").map(|tv| &tv.value), + Some(&Value::String("value3".to_string())) + ); + } + + Ok(()) +} + #[tokio::test] async fn test_recover_current_only_needs_last_journal() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; @@ -186,7 +317,6 @@ async fn test_recover_current_only_needs_last_journal() -> anyhow::Result<()> { Ok(()) } - #[tokio::test] async fn test_detection_compressed_journal() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_recovery.rs index 3ae8a48b..061b448a 100644 --- a/bd-resilient-kv/src/versioned_recovery.rs +++ b/bd-resilient-kv/src/versioned_recovery.rs @@ -41,31 +41,13 @@ fn read_u64_field(obj: &AHashMap, key: &str) -> Option { /// # Timestamp-Based Recovery /// /// The primary use case is timestamp-based recovery, which enables correlation with -/// external timestamped event streams: -/// - `recover_at_timestamp(ts)` - Recover state at a specific timestamp -/// - Timestamps are monotonically increasing logical clocks (not pure wall time) -/// - Enables uploading KV snapshots that match specific event buffer timestamps +/// external timestamped event streams. 
Timestamps are monotonically non-decreasing logical +/// clocks (not pure wall time), enabling snapshots that match specific event buffer timestamps. /// /// Version-based recovery is also supported for backward compatibility. /// /// Supports both compressed (zlib) and uncompressed journals. Compressed journals are /// automatically detected and decompressed transparently. -/// -/// # Usage -/// -/// ```ignore -/// use bd_resilient_kv::VersionedRecovery; -/// -/// // Load journal data as byte slices (may be compressed or uncompressed) -/// let archived_journal = std::fs::read("store.jrn.v30000.zz")?; // Compressed -/// let active_journal = std::fs::read("store.jrn")?; // Uncompressed -/// -/// // Create recovery utility with both journals -/// let recovery = VersionedRecovery::new(vec![&archived_journal, &active_journal])?; -/// -/// // Recover state at specific version -/// let state_at_25000 = recovery.recover_at_version(25000)?; -/// ``` #[derive(Debug)] pub struct VersionedRecovery { journals: Vec, @@ -213,6 +195,17 @@ impl VersionedRecovery { /// This method replays all journal entries from all provided journals up to and including /// the target timestamp, reconstructing the exact state at that point in time. /// + /// ## Important: "Up to and including" semantics + /// + /// When recovering at timestamp T, **ALL entries with timestamp ≤ T are included**. + /// This is critical because timestamps are monotonically non-decreasing (not strictly + /// increasing): if the system clock doesn't advance between writes, multiple entries + /// will share the same timestamp value. These entries must all be included to ensure + /// a consistent view of the state. + /// + /// Entries with the same timestamp are applied in version order (which reflects write + /// order), so later writes correctly overwrite earlier ones ("last write wins"). + /// /// # Arguments /// /// * `target_timestamp` - The timestamp (in nanoseconds since UNIX epoch) to recover state at @@ -427,7 +420,16 @@ fn replay_journal_to_version( Ok(()) } -/// Replay journal entries up to a target timestamp. +/// Replay journal entries up to and including the target timestamp. +/// +/// This function processes all journal entries with timestamp ≤ `target_timestamp`. +/// The "up to and including" behavior is essential because timestamps are monotonically +/// non-decreasing (not strictly increasing): if the system clock doesn't advance between +/// writes, multiple entries may share the same timestamp. All such entries must be +/// applied to ensure state consistency. +/// +/// Entries are processed in version order, ensuring "last write wins" semantics when +/// multiple operations affect the same key at the same timestamp. 
fn replay_journal_to_timestamp( buffer: &[u8], target_timestamp: u64, From f0910bd52f0f80d277ce2aa5993cf801aaa6ad1d Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 19:39:55 -0800 Subject: [PATCH 16/66] some more docs --- bd-resilient-kv/AGENTS.md | 12 +- bd-resilient-kv/VERSIONED_FORMAT.md | 120 +--- .../src/kv_journal/memmapped_versioned.rs | 34 +- bd-resilient-kv/src/kv_journal/versioned.rs | 201 +++--- .../src/tests/versioned_kv_store_test.rs | 144 ++-- .../src/tests/versioned_recovery_test.rs | 638 ++++++++++-------- bd-resilient-kv/src/versioned_kv_store.rs | 103 +-- bd-resilient-kv/src/versioned_recovery.rs | 264 +++----- 8 files changed, 704 insertions(+), 812 deletions(-) diff --git a/bd-resilient-kv/AGENTS.md b/bd-resilient-kv/AGENTS.md index 45aa3002..6dc6e394 100644 --- a/bd-resilient-kv/AGENTS.md +++ b/bd-resilient-kv/AGENTS.md @@ -55,9 +55,19 @@ The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJ - The async API enables efficient background compression without blocking the main thread **Version Tracking**: -- Every write operation (`insert`, `remove`) returns a monotonically increasing version number +- Every write operation (`insert`, `remove`) returns a monotonically non-decreasing version number - Version numbers start at 1 (base version), first write is version 2 - Entries with `Value::Null` are treated as deletions but still versioned +- During rotation, all snapshot entries share the same version (the rotation version) + +**Timestamp Tracking**: +- Each entry records a timestamp (nanoseconds since UNIX epoch) when the write occurred +- Timestamps are monotonically non-decreasing, not strictly increasing +- Multiple entries may share the same timestamp if the system clock doesn't advance between writes +- This is expected behavior, particularly during rapid writes or in test environments +- Recovery at timestamp T includes ALL entries with timestamp ≤ T, applied in version order +- Timestamps are preserved during rotation, maintaining temporal accuracy for audit purposes +- Test coverage: `test_timestamp_collision_across_rotation` in `versioned_recovery_test.rs` **Rotation Strategy**: - Automatic rotation when journal size exceeds high water mark (triggered during async write operations) diff --git a/bd-resilient-kv/VERSIONED_FORMAT.md b/bd-resilient-kv/VERSIONED_FORMAT.md index 4926c430..18a5e681 100644 --- a/bd-resilient-kv/VERSIONED_FORMAT.md +++ b/bd-resilient-kv/VERSIONED_FORMAT.md @@ -2,14 +2,13 @@ ## Overview -This document describes the versioned journal format (VERSION 2) that enables version tracking for audit logs and remote backup by tracking write versions for each operation. +This document describes the versioned journal format (VERSION 2) that enables audit logs and remote backup by tracking both version numbers and timestamps for each write operation. ## Goals -1. **Version Tracking**: Each write operation gets a unique, monotonically increasing version number +1. **Version and Timestamp Tracking**: Each write operation records a monotonically non-decreasing version number and timestamp 2. **Journal Rotation**: Periodic compaction with self-contained state in each journal 3. **Remote Backup**: Archived journals can be uploaded to remote storage -4. **Backward Compatible**: New format coexists with existing VERSION 1 ## Design Philosophy @@ -64,11 +63,14 @@ Previous journals, archived during rotation. 
Each contains complete state at rot ``` Fields: -- `v` (version): Monotonic write version number -- `t` (timestamp): When the write occurred (ns since UNIX epoch) +- `v` (version): Monotonically non-decreasing write version number +- `t` (timestamp): When the write occurred (ns since UNIX epoch), monotonically non-decreasing - `k` (key): The key being written - `o` (operation): The value (for SET) or null (for DELETE) +**Timestamp Semantics:** +Timestamps are monotonically non-decreasing, not strictly increasing. If the system clock doesn't advance between writes, multiple entries may share the same timestamp. This is expected behavior and ensures proper ordering without clock skew. + ## Journal Structure ### Initial Journal @@ -188,58 +190,6 @@ While `VersionedKVStore` is designed for active operation and does not support p - **Compliance**: Extract state at specific points in time for regulatory requirements - **Testing**: Validate that state at historical versions matches expectations -#### API Methods - -```rust -// Create recovery utility from journal files (oldest to newest) - async -let recovery = VersionedRecovery::from_files(vec![ - "store.jrn.v20000.zz", - "store.jrn.v30000.zz", - "store.jrn" -]).await?; - -// Recover state at specific version -let state = recovery.recover_at_version(25000)?; - -// Get current state (latest version) -let current = recovery.recover_current()?; - -// Get available version range -if let Some((min, max)) = recovery.version_range() { - println!("Can recover versions {min} to {max}"); -} -``` - -#### Example: Cross-Rotation Recovery - -```rust -use bd_resilient_kv::VersionedRecovery; - -#[tokio::main] -async fn main() -> anyhow::Result<()> { - // Create recovery utility from files (automatically decompresses .zz archives) - // Provide journal paths in chronological order (oldest to newest) - let recovery = VersionedRecovery::from_files(vec![ - "store.jrn.v20000.zz", - "store.jrn.v30000.zz", - "store.jrn", - ]).await?; - - // Recover state at version 25000 (in archived journal) - let state_at_25000 = recovery.recover_at_version(25000)?; - - // Recover state at version 35000 (across rotation boundary) - let state_at_35000 = recovery.recover_at_version(35000)?; - - // Process the recovered state - for (key, value) in state_at_25000 { - println!("{key} = {value:?}"); - } - - Ok(()) -} -``` - #### Implementation Details - **Async File Loading**: Constructor uses async I/O to load journal files efficiently @@ -266,65 +216,11 @@ async fn main() -> anyhow::Result<()> { - Upload archived journals to remote storage - Delete old archived journals after successful upload -## API Usage - -### Basic Operations - -```rust -use bd_resilient_kv::VersionedKVStore; -use bd_bonjson::Value; - -#[tokio::main] -async fn main() -> anyhow::Result<()> { - // Create or open store (requires directory path and name) - let mut store = VersionedKVStore::new("/path/to/dir", "mystore", 1024 * 1024, None)?; - - // Writes return version numbers (async operations) - let v1 = store.insert("key1".to_string(), Value::from(42)).await?; - let v2 = store.insert("key2".to_string(), Value::from("hello")).await?; - - // Read current values (synchronous) - let value = store.get("key1")?; - - Ok(()) -} -``` - -### Rotation Callback - -```rust -// Set callback for rotation events -store.set_rotation_callback(Box::new(|old_path, new_path, version| { - println!("Rotated at version {version}"); - println!("Archived journal (compressed): {old_path:?}"); - println!("New active journal: {new_path:?}"); - 
// Upload old_path to remote storage... -})); -``` - -### Manual Rotation - -```rust -// Automatic rotation on high water mark -let version = store.insert("key".to_string(), Value::from("value")).await?; -// Rotation happens automatically if high water mark exceeded - -// Or manually trigger rotation (async) -store.rotate_journal().await?; -``` - -## Migration from VERSION 1 - -VERSION 1 journals (without versioning) can coexist with VERSION 2: -- Existing VERSION 1 files continue to work with current `KVStore` -- New `VersionedKVStore` creates VERSION 2 journals -- No automatic migration (opt-in by using `VersionedKVStore`) - ## Implementation Notes 1. **Version Counter Persistence**: Stored in metadata, initialized from journal on restart 2. **Atomicity**: Version increments are atomic with writes -3. **Monotonicity**: Versions never decrease or skip +3. **Monotonicity**: Versions are monotonically non-decreasing (multiple entries may share the same version during rotation) 4. **Concurrency**: Not thread-safe by design (same as current implementation) 5. **Format Field Names**: Use short names (`v`, `t`, `k`, `o`) to minimize storage overhead 6. **Self-Contained Journals**: Each rotated journal can be read independently without dependencies diff --git a/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs b/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs index f9a38b9c..367627c3 100644 --- a/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs +++ b/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs @@ -12,11 +12,11 @@ use memmap2::{MmapMut, MmapOptions}; use std::fs::OpenOptions; use std::path::Path; -/// Memory-mapped implementation of a versioned key-value journal. +/// Memory-mapped implementation of a timestamped key-value journal. /// /// This implementation uses memory-mapped files to provide persistence while maintaining /// the efficiency of in-memory operations. All changes are automatically synced to disk. -/// Each write operation receives a unique version number for point-in-time recovery. +/// Each write operation receives a timestamp for point-in-time recovery. /// /// # Safety /// During construction, we unsafely declare mmap's internal buffer as having a static @@ -58,7 +58,7 @@ impl MemMappedVersionedKVJournal { /// # Arguments /// * `file_path` - Path to the file to use for storage /// * `size` - Minimum size of the file in bytes - /// * `base_version` - The starting version for this journal + /// * `base_timestamp` - The starting timestamp for this journal (typically current time) /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 /// /// # Errors @@ -66,7 +66,7 @@ impl MemMappedVersionedKVJournal { pub fn new>( file_path: P, size: usize, - base_version: u64, + base_timestamp: u64, high_water_mark_ratio: Option, ) -> anyhow::Result { let file = OpenOptions::new() @@ -83,7 +83,7 @@ impl MemMappedVersionedKVJournal { let (mmap, buffer) = unsafe { Self::create_mmap_buffer(file)? }; - let versioned_kv = VersionedKVJournal::new(buffer, base_version, high_water_mark_ratio)?; + let versioned_kv = VersionedKVJournal::new(buffer, base_timestamp, high_water_mark_ratio)?; Ok(Self { mmap, versioned_kv }) } @@ -120,38 +120,26 @@ impl MemMappedVersionedKVJournal { Ok(Self { mmap, versioned_kv }) } - /// Set a key-value pair with automatic version increment. + /// Set a key-value pair with automatic timestamp assignment. /// - /// Returns a tuple of (version, timestamp). + /// Returns the timestamp of the operation. 
/// /// # Errors /// Returns an error if the journal entry cannot be written. - pub fn set_versioned(&mut self, key: &str, value: &Value) -> anyhow::Result<(u64, u64)> { + pub fn set_versioned(&mut self, key: &str, value: &Value) -> anyhow::Result { self.versioned_kv.set_versioned(key, value) } - /// Delete a key with automatic version increment. + /// Delete a key with automatic timestamp assignment. /// - /// Returns a tuple of (version, timestamp). + /// Returns the timestamp of the operation. /// /// # Errors /// Returns an error if the journal entry cannot be written. - pub fn delete_versioned(&mut self, key: &str) -> anyhow::Result<(u64, u64)> { + pub fn delete_versioned(&mut self, key: &str) -> anyhow::Result { self.versioned_kv.delete_versioned(key) } - /// Get the current version number. - #[must_use] - pub fn current_version(&self) -> u64 { - self.versioned_kv.current_version() - } - - /// Get the base version (first version in this journal). - #[must_use] - pub fn base_version(&self) -> u64 { - self.versioned_kv.base_version() - } - /// Get the current high water mark position. #[must_use] pub fn high_water_mark(&self) -> usize { diff --git a/bd-resilient-kv/src/kv_journal/versioned.rs b/bd-resilient-kv/src/kv_journal/versioned.rs index ab620f7b..c2447a0b 100644 --- a/bd-resilient-kv/src/kv_journal/versioned.rs +++ b/bd-resilient-kv/src/kv_journal/versioned.rs @@ -23,24 +23,21 @@ pub struct TimestampedValue { pub timestamp: u64, } -/// Versioned implementation of a key-value journaling system that tracks write timestamps -/// for point-in-time recovery. +/// Timestamped implementation of a key-value journaling system that uses timestamps +/// as the version identifier for point-in-time recovery. /// /// Each write operation is assigned a monotonically non-decreasing timestamp (in nanoseconds /// since UNIX epoch), enabling exact state reconstruction at any historical timestamp. /// The monotonicity is enforced by clamping: if the system clock goes backwards, we reuse -/// the same timestamp value to maintain ordering guarantees without artificial clock skew. +/// the same timestamp value to maintain ordering guarantees. When timestamps collide, +/// journal ordering determines precedence. #[derive(Debug)] pub struct VersionedKVJournal<'a> { - #[allow(dead_code)] - format_version: u64, position: usize, buffer: &'a mut [u8], high_water_mark: usize, high_water_mark_triggered: bool, initialized_at_unix_time_ns: u64, - current_version: u64, - base_version: u64, // First version in this journal last_timestamp: u64, // Most recent timestamp written (for monotonic enforcement) } @@ -51,22 +48,21 @@ pub struct VersionedKVJournal<'a> { // | 8 | Position | u64 | // | 16 | Type Code: Array Start | u8 | // | 17 | Metadata Object | BONJSON Object | -// | ... | Versioned Journal Entry | BONJSON Object | -// | ... | Versioned Journal Entry | BONJSON Object | +// | ... | Timestamped Journal Entry| BONJSON Object | +// | ... 
| Timestamped Journal Entry| BONJSON Object | // -// Metadata object: {"initialized": , "format_version": 2, "base_version": } -// Journal entries: {"v": , "t": , "k": "", "o": } +// Metadata object: {"initialized": , "format_version": 2} +// Journal entries: {"t": , "k": "", "o": } // // # Timestamp Semantics // -// Timestamps serve as logical clocks with monotonic guarantees rather than pure wall time: +// Timestamps serve as both version identifiers and logical clocks with monotonic guarantees: // - Each write gets a timestamp that is guaranteed to be >= previous writes (non-decreasing) // - If system clock goes backward, timestamps are clamped to last_timestamp (reuse same value) +// - When timestamps collide, journal ordering determines precedence // - This ensures total ordering while allowing correlation with external timestamped systems -// - Version numbers (v) are maintained for backward compatibility and as secondary ordering const VERSION: u64 = 2; // The versioned format version -const INVALID_VERSION: u64 = 0; // 0 will never be a valid version const HEADER_SIZE: usize = 16; const ARRAY_BEGIN: usize = 16; @@ -103,14 +99,6 @@ fn current_timestamp() -> anyhow::Result { .and_then(|d| u64::try_from(d.as_nanos()).map_err(|_| InvariantError::Invariant.into())) } -fn read_version(buffer: &[u8]) -> anyhow::Result { - let version_bytes: [u8; 8] = buffer[.. 8].try_into()?; - let version = u64::from_le_bytes(version_bytes); - if version != VERSION { - anyhow::bail!("Unsupported version: {version}, expected {VERSION}"); - } - Ok(version) -} /// Write to the version field of a journal buffer. fn write_version_field(buffer: &mut [u8], version: u64) { @@ -123,11 +111,6 @@ fn write_version(buffer: &mut [u8]) { write_version_field(buffer, VERSION); } -/// Invalidate the version field of a journal buffer. -fn invalidate_version(buffer: &mut [u8]) { - write_version_field(buffer, INVALID_VERSION); -} - fn read_position(buffer: &[u8]) -> anyhow::Result { let position_bytes: [u8; 8] = buffer[8 .. 16].try_into()?; let position_u64 = u64::from_le_bytes(position_bytes); @@ -159,7 +142,7 @@ fn read_bonjson_payload(buffer: &[u8]) -> anyhow::Result { } /// Create and write the metadata section of a versioned journal. -fn write_metadata(buffer: &mut [u8], timestamp: u64, base_version: u64) -> anyhow::Result { +fn write_metadata(buffer: &mut [u8], timestamp: u64, base_timestamp: u64) -> anyhow::Result { let buffer_len = buffer.len(); let mut cursor = &mut buffer[METADATA_OFFSET ..]; @@ -167,7 +150,10 @@ fn write_metadata(buffer: &mut [u8], timestamp: u64, base_version: u64) -> anyho let mut metadata = AHashMap::new(); metadata.insert("initialized".to_string(), Value::Unsigned(timestamp)); metadata.insert("format_version".to_string(), Value::Unsigned(VERSION)); - metadata.insert("base_version".to_string(), Value::Unsigned(base_version)); + metadata.insert( + "base_timestamp".to_string(), + Value::Unsigned(base_timestamp), + ); // Write metadata object encode_into_buf(&mut cursor, &Value::Object(metadata)) @@ -177,7 +163,7 @@ fn write_metadata(buffer: &mut [u8], timestamp: u64, base_version: u64) -> anyho } /// Extract metadata from the buffer. 
-fn extract_metadata_from_buffer(buffer: &[u8]) -> anyhow::Result<(u64, u64)> { +fn extract_metadata_from_buffer(buffer: &[u8]) -> anyhow::Result { let array = read_bonjson_payload(buffer)?; if let Value::Array(entries) = array && let Some(Value::Object(obj)) = entries.first() @@ -185,9 +171,7 @@ fn extract_metadata_from_buffer(buffer: &[u8]) -> anyhow::Result<(u64, u64)> { let timestamp = read_u64_field(obj, "initialized") .ok_or_else(|| anyhow::anyhow!("No initialized timestamp found in metadata"))?; - let base_version = read_u64_field(obj, "base_version").unwrap_or(0); - - return Ok((timestamp, base_version)); + return Ok(timestamp); } anyhow::bail!("No valid metadata found"); } @@ -226,19 +210,16 @@ impl<'a> VersionedKVJournal<'a> { /// /// # Arguments /// * `buffer` - The storage buffer - /// * `base_version` - The starting version for this journal + /// * `base_timestamp` - The starting timestamp for this journal (typically current time) /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 /// /// # Errors - /// Returns an error if serialization fails or if `high_water_mark_ratio` is invalid. + /// Returns an error if the buffer is too small or if `high_water_mark_ratio` is invalid. pub fn new( buffer: &'a mut [u8], - base_version: u64, + base_timestamp: u64, high_water_mark_ratio: Option, ) -> anyhow::Result { - // If this operation gets interrupted, the buffer must be considered invalid. - invalidate_version(buffer); - let buffer_len = validate_buffer_len(buffer)?; let high_water_mark = calculate_high_water_mark(buffer_len, high_water_mark_ratio)?; @@ -246,23 +227,20 @@ impl<'a> VersionedKVJournal<'a> { let mut cursor = &mut buffer[HEADER_SIZE ..]; serialize_array_begin(&mut cursor).map_err(|_| InvariantError::Invariant)?; - // Write metadata with current timestamp and base version + // Write metadata with current timestamp let timestamp = current_timestamp()?; - let position = write_metadata(buffer, timestamp, base_version)?; + let position = write_metadata(buffer, timestamp, base_timestamp)?; write_position(buffer, position); write_version(buffer); Ok(Self { - format_version: VERSION, position, buffer, high_water_mark, high_water_mark_triggered: false, initialized_at_unix_time_ns: timestamp, - current_version: base_version, - base_version, - last_timestamp: timestamp, + last_timestamp: std::cmp::max(timestamp, base_timestamp), }) } @@ -280,52 +258,24 @@ impl<'a> VersionedKVJournal<'a> { high_water_mark_ratio: Option, ) -> anyhow::Result { let buffer_len = validate_buffer_len(buffer)?; - let format_version = read_version(buffer)?; let position = read_position(buffer)?; - let (init_timestamp, base_version) = extract_metadata_from_buffer(buffer)?; + let init_timestamp = extract_metadata_from_buffer(buffer)?; let high_water_mark = calculate_high_water_mark(buffer_len, high_water_mark_ratio)?; - // Find the highest version in the journal - let highest_version = Self::find_highest_version(buffer)?; - let current_version = highest_version.unwrap_or(base_version); - // Find the highest timestamp in the journal let highest_timestamp = Self::find_highest_timestamp(buffer)?; let last_timestamp = highest_timestamp.unwrap_or(init_timestamp); Ok(Self { - format_version, position, buffer, high_water_mark, high_water_mark_triggered: position >= high_water_mark, initialized_at_unix_time_ns: init_timestamp, - current_version, - base_version, last_timestamp, }) } - /// Find the highest version number in the journal. 
- /// - /// Since versions are monotonically increasing, this simply returns the version - /// from the last entry in the journal. - fn find_highest_version(buffer: &[u8]) -> anyhow::Result> { - let array = read_bonjson_payload(buffer)?; - - if let Value::Array(entries) = array { - // Skip metadata (index 0) and get the last actual entry - // Since versions are monotonically increasing, the last entry has the highest version - if entries.len() > 1 - && let Some(Value::Object(obj)) = entries.last() - { - return Ok(read_u64_field(obj, "v")); - } - } - - Ok(None) - } - /// Find the highest timestamp in the journal. /// /// Since timestamps are monotonically increasing, this simply returns the timestamp @@ -346,18 +296,6 @@ impl<'a> VersionedKVJournal<'a> { Ok(None) } - /// Get the current version number. - #[must_use] - pub fn current_version(&self) -> u64 { - self.current_version - } - - /// Get the base version (first version in this journal). - #[must_use] - pub fn base_version(&self) -> u64 { - self.base_version - } - /// Get the next monotonically increasing timestamp. /// /// This ensures that even if the system clock goes backwards, timestamps remain @@ -387,23 +325,17 @@ impl<'a> VersionedKVJournal<'a> { } /// Write a versioned journal entry and return the timestamp. - fn write_versioned_entry( - &mut self, - version: u64, - key: &str, - value: &Value, - ) -> anyhow::Result { + fn write_versioned_entry(&mut self, key: &str, value: &Value) -> anyhow::Result { // Get monotonically increasing timestamp before borrowing buffer let timestamp = self.next_monotonic_timestamp()?; let buffer_len = self.buffer.len(); let mut cursor = &mut self.buffer[self.position ..]; - // Create entry object: {"v": version, "t": timestamp, "k": key, "o": value} + // Create entry object: {"t": timestamp, "k": key, "o": value} // TODO(snowp): It would be nice to be able to pass impl AsRef for key or Cow to avoid // allocating small strings repeatedly. let entry = AHashMap::from([ - ("v".to_string(), Value::Unsigned(version)), ("t".to_string(), Value::Unsigned(timestamp)), ("k".to_string(), Value::String(key.to_string())), ("o".to_string(), value.clone()), @@ -417,28 +349,22 @@ impl<'a> VersionedKVJournal<'a> { Ok(timestamp) } - /// Set a key-value pair with automatic version increment. - /// Returns a tuple of (version, timestamp). + /// Set a key-value pair. + /// Returns the timestamp of the operation. /// - /// The timestamp is monotonically increasing and serves as the primary ordering mechanism. + /// The timestamp is monotonically non-decreasing and serves as the version identifier. /// If the system clock goes backwards, timestamps are clamped to maintain monotonicity. - pub fn set_versioned(&mut self, key: &str, value: &Value) -> anyhow::Result<(u64, u64)> { - self.current_version += 1; - let version = self.current_version; - let timestamp = self.write_versioned_entry(version, key, value)?; - Ok((version, timestamp)) + pub fn set_versioned(&mut self, key: &str, value: &Value) -> anyhow::Result { + self.write_versioned_entry(key, value) } - /// Delete a key with automatic version increment. - /// Returns a tuple of (version, timestamp). + /// Delete a key. + /// Returns the timestamp of the operation. /// - /// The timestamp is monotonically increasing and serves as the primary ordering mechanism. + /// The timestamp is monotonically non-decreasing and serves as the version identifier. /// If the system clock goes backwards, timestamps are clamped to maintain monotonicity. 
- pub fn delete_versioned(&mut self, key: &str) -> anyhow::Result<(u64, u64)> { - self.current_version += 1; - let version = self.current_version; - let timestamp = self.write_versioned_entry(version, key, &Value::Null)?; - Ok((version, timestamp)) + pub fn delete_versioned(&mut self, key: &str) -> anyhow::Result { + self.write_versioned_entry(key, &Value::Null) } /// Get the high water mark position. @@ -548,16 +474,47 @@ impl<'a> VersionedKVJournal<'a> { /// Rotation utilities for creating new journals with compacted state impl<'a> VersionedKVJournal<'a> { - /// Create a new journal initialized with the compacted state from a snapshot version. + /// Create a new journal initialized with the compacted state from a snapshot. + /// + /// The new journal will have all current key-value pairs written with their **original + /// timestamps** to preserve historical accuracy. The journal's monotonic timestamp + /// enforcement will respect the highest timestamp in the provided state. + /// + /// ## Timestamp Preservation and Snapshot Overlaps + /// + /// This function preserves the original timestamps of all entries, which means the new + /// journal's entry timestamps may overlap with or equal timestamps from the previous journal. + /// + /// Example during rotation: + /// ```text + /// Old journal (about to be archived as store.jrn.t300.zz): + /// - Entries: key="foo" t=100, key="foo" t=200, key="foo" t=300 + /// - Final state: foo=v3@300, bar=v1@200 + /// - rotation_timestamp = 300 (max of all timestamps) + /// + /// New journal (created by this function with base_timestamp=300): + /// - Compacted entries: foo=v3@300, bar=v1@200 ← Original timestamps preserved! + /// - These timestamps (300, 200) may equal/overlap with old journal's range [100, 300] + /// - Future entries will have t >= 300 (enforced by base_timestamp) + /// ``` + /// + /// ## Design Rationale + /// + /// Preserving original timestamps is **not strictly required** for point-in-time state + /// reconstruction (we could rewrite all compacted entries to `rotation_timestamp`), but it + /// provides benefits at zero cost: + /// + /// - **Implementation simplicity**: No timestamp rewriting logic needed + /// - **Semantic accuracy**: Preserves "when was this value last modified" for audit trails + /// - **Future-proof**: Maintains historical information that may be useful later + /// - **Zero overhead**: No performance difference vs rewriting timestamps /// - /// The new journal will have all current key-value pairs written as versioned entries - /// at the `snapshot_version`, using their original timestamps to preserve historical accuracy. - /// The journal's monotonic timestamp enforcement will respect the highest timestamp in the - /// provided state. + /// Recovery systems bucket logs to snapshots using min/max timestamp ranges and replay + /// journals sequentially to reconstruct state at any point in time. /// /// # Arguments /// * `buffer` - The buffer to write the new journal to - /// * `snapshot_version` - The version to assign to all compacted state entries + /// * `base_timestamp` - The starting timestamp for the journal (for monotonic enforcement) /// * `state` - The current key-value state with timestamps to write /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark /// @@ -565,18 +522,17 @@ impl<'a> VersionedKVJournal<'a> { /// Returns an error if serialization fails or buffer is too small. 
pub fn create_rotated_journal( buffer: &'a mut [u8], - snapshot_version: u64, + base_timestamp: u64, state: &AHashMap, high_water_mark_ratio: Option, ) -> anyhow::Result { - // Create a new journal with the snapshot version as the base - let mut journal = Self::new(buffer, snapshot_version, high_water_mark_ratio)?; + // Create a new journal with the base timestamp + let mut journal = Self::new(buffer, base_timestamp, high_water_mark_ratio)?; // Find the maximum timestamp in the state to maintain monotonicity let max_state_timestamp = state.values().map(|tv| tv.timestamp).max().unwrap_or(0); - // Write all current state as versioned entries at the snapshot version - // Use the original timestamp from each entry to preserve historical accuracy + // Write all current state with their original timestamps for (key, timestamped_value) in state { let buffer_len = journal.buffer.len(); let mut cursor = &mut journal.buffer[journal.position ..]; @@ -585,11 +541,10 @@ impl<'a> VersionedKVJournal<'a> { // We use the actual timestamp from the entry, but track the maximum for future writes journal.last_timestamp = std::cmp::max(journal.last_timestamp, timestamped_value.timestamp); - // Create entry object: {"v": version, "t": timestamp, "k": key, "o": value} + // Create entry object: {"t": timestamp, "k": key, "o": value} // TODO(snowp): It would be nice to be able to pass impl AsRef for key or Cow to avoid // allocating small strings repeatedly. let entry = AHashMap::from([ - ("v".to_string(), Value::Unsigned(snapshot_version)), ( "t".to_string(), Value::Unsigned(timestamped_value.timestamp), diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index 1199fed4..da9254c7 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -21,8 +21,6 @@ fn test_versioned_store_new() -> anyhow::Result<()> { // Should start empty assert!(store.is_empty()); assert_eq!(store.len(), 0); - assert_eq!(store.base_version(), 1); // Base version starts at 1 - assert_eq!(store.current_version(), 1); Ok(()) } @@ -66,18 +64,30 @@ async fn test_timestamp_collision_on_clamping() -> anyhow::Result<()> { .unwrap(); // Verify monotonicity: timestamps should never decrease - assert!(ts2 >= ts1, "Timestamps should be monotonically non-decreasing"); - assert!(ts3 >= ts2, "Timestamps should be monotonically non-decreasing"); - assert!(ts4 >= ts3, "Timestamps should be monotonically non-decreasing"); + assert!( + ts2 >= ts1, + "Timestamps should be monotonically non-decreasing" + ); + assert!( + ts3 >= ts2, + "Timestamps should be monotonically non-decreasing" + ); + assert!( + ts4 >= ts3, + "Timestamps should be monotonically non-decreasing" + ); // Document that timestamps CAN be equal (this is the key difference from the old +1 behavior) // When system clock doesn't advance or goes backwards, we reuse the same timestamp // This is acceptable because version numbers provide total ordering - + // Count unique timestamps - with rapid operations, we might have collisions let timestamps = [ts1, ts2, ts3, ts4]; - let unique_count = timestamps.iter().collect::>().len(); - + let unique_count = timestamps + .iter() + .collect::>() + .len(); + // We should have at least 1 unique timestamp (all could be the same in extreme cases) assert!( unique_count >= 1 && unique_count <= 4, @@ -97,20 +107,20 @@ async fn test_versioned_store_remove() -> anyhow::Result<()> { let mut store = 
VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Insert some values - let v1 = store + let ts1 = store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; - let v2 = store + let ts2 = store .insert("key2".to_string(), Value::String("value2".to_string())) .await?; assert_eq!(store.len(), 2); - assert!(v2 > v1); + assert!(ts2 >= ts1); // Remove a key - let v3 = store.remove("key1").await?; - assert!(v3.is_some()); - assert!(v3.unwrap() > v2); + let ts3 = store.remove("key1").await?; + assert!(ts3.is_some()); + assert!(ts3.unwrap() >= ts2); assert_eq!(store.len(), 1); assert!(!store.contains_key("key1")); @@ -128,15 +138,13 @@ async fn test_persistence_and_reload() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let v2; - // Create store and write some data { let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - let _v1 = store + let _ts1 = store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; - v2 = store.insert("key2".to_string(), Value::Signed(42)).await?; + let _ts2 = store.insert("key2".to_string(), Value::Signed(42)).await?; store.sync()?; } @@ -149,9 +157,6 @@ async fn test_persistence_and_reload() -> anyhow::Result<()> { Some(&Value::String("value1".to_string())) ); assert_eq!(store.get("key2"), Some(&Value::Signed(42))); - - // Version numbers should be preserved - assert_eq!(store.current_version(), v2); } Ok(()) @@ -190,17 +195,17 @@ async fn test_rotation_callback() -> anyhow::Result<()> { let callback_data = Arc::new(Mutex::new(Vec::new())); let callback_data_clone = Arc::clone(&callback_data); - store.set_rotation_callback(Box::new(move |old_path, new_path, version| { + store.set_rotation_callback(Box::new(move |old_path, new_path, timestamp| { let mut data = callback_data_clone.lock().unwrap(); - data.push((old_path.to_path_buf(), new_path.to_path_buf(), version)); + data.push((old_path.to_path_buf(), new_path.to_path_buf(), timestamp)); })); // Write enough data to trigger rotation - let mut last_version = 0; + let mut last_timestamp = 0; for i in 0 .. 
100 { let key = format!("key{}", i); let value = Value::String(format!("value_{}_with_some_extra_padding", i)); - last_version = store.insert(key, value).await?; + last_timestamp = store.insert(key, value).await?; // Rotation happens automatically inside insert when high water mark is triggered let data = callback_data.lock().unwrap(); @@ -213,10 +218,10 @@ async fn test_rotation_callback() -> anyhow::Result<()> { let data = callback_data.lock().unwrap(); assert!(data.len() >= 1, "Expected at least one rotation event"); - let (old_path, new_path, rotation_version) = &data[0]; - assert!(old_path.to_string_lossy().contains(".v")); + let (old_path, new_path, rotation_timestamp) = &data[0]; + assert!(old_path.to_string_lossy().contains(".t")); assert_eq!(new_path, &temp_dir.path().join("test.jrn")); - assert!(*rotation_version <= last_version); + assert!(*rotation_timestamp <= last_timestamp); Ok(()) } @@ -229,28 +234,33 @@ async fn test_manual_rotation() -> anyhow::Result<()> { let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Insert some data - let _v1 = store + let _ts1 = store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; - let v2 = store + let ts2 = store .insert("key2".to_string(), Value::String("value2".to_string())) .await?; + // Get max timestamp before rotation (this will be used in the archive name) + let rotation_timestamp = store + .get_with_timestamp("key2") + .map(|tv| tv.timestamp) + .unwrap(); + // Manually trigger rotation - let rotation_version = store.current_version(); store.rotate_journal().await?; // Verify archived file exists (compressed) let archived_path = temp_dir .path() - .join(format!("test.jrn.v{}.zz", rotation_version)); + .join(format!("test.jrn.t{}.zz", rotation_timestamp)); assert!(archived_path.exists()); // Verify active journal still works - let v3 = store + let ts3 = store .insert("key3".to_string(), Value::String("value3".to_string())) .await?; - assert!(v3 > v2); + assert!(ts3 >= ts2); assert_eq!(store.len(), 3); // Verify data is intact @@ -267,9 +277,6 @@ async fn test_manual_rotation() -> anyhow::Result<()> { Some(&Value::String("value3".to_string())) ); - // New journal should have base version at rotation point - assert_eq!(store.base_version(), rotation_version); - Ok(()) } @@ -291,7 +298,10 @@ async fn test_rotation_preserves_state() -> anyhow::Result<()> { .await?; let pre_rotation_state = store.as_hashmap(); - let pre_rotation_version = store.current_version(); + let pre_rotation_ts = store + .get_with_timestamp("key4") + .map(|tv| tv.timestamp) + .unwrap(); // Rotate store.rotate_journal().await?; @@ -302,10 +312,10 @@ async fn test_rotation_preserves_state() -> anyhow::Result<()> { assert_eq!(store.len(), 4); // Verify we can continue writing - let v_new = store + let ts_new = store .insert("key5".to_string(), Value::String("value5".to_string())) .await?; - assert!(v_new > pre_rotation_version); + assert!(ts_new >= pre_rotation_ts); assert_eq!(store.len(), 5); Ok(()) @@ -335,11 +345,11 @@ async fn test_version_monotonicity() -> anyhow::Result<()> { let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - let mut last_version = store.current_version(); + let mut last_timestamp = 0u64; - // Perform various operations and ensure version always increases + // Perform various operations and ensure timestamp always increases for i in 0 .. 
20 { - let op_version = if i % 3 == 0 { + let op_timestamp = if i % 3 == 0 { store .insert(format!("key{}", i), Value::Signed(i as i64)) .await? @@ -354,18 +364,16 @@ async fn test_version_monotonicity() -> anyhow::Result<()> { store .remove(&format!("key{}", i / 3)) .await? - .unwrap_or(last_version) + .unwrap_or(last_timestamp) }; assert!( - op_version >= last_version, - "Version should be monotonically increasing" + op_timestamp >= last_timestamp, + "Timestamp should be monotonically non-decreasing" ); - last_version = op_version; + last_timestamp = op_timestamp; } - assert_eq!(store.current_version(), last_version); - Ok(()) } @@ -449,12 +457,12 @@ async fn test_timestamp_monotonicity() -> anyhow::Result<()> { store .insert(format!("key{}", i), Value::Signed(i as i64)) .await?; - + let ts = store .get_with_timestamp(&format!("key{}", i)) .map(|tv| tv.timestamp) .unwrap(); - + timestamps.push(ts); } @@ -573,8 +581,11 @@ async fn test_compression_during_rotation() -> anyhow::Result<()> { // Get size of uncompressed journal before rotation let uncompressed_size = std::fs::metadata(temp_dir.path().join("test.jrn"))?.len(); - // Get current version before rotation (this is what will be used in the archive name) - let rotation_version = store.current_version(); + // Get max timestamp before rotation (this will be used in the archive name) + let rotation_timestamp = store + .get_with_timestamp("key3") + .map(|tv| tv.timestamp) + .unwrap(); // Trigger rotation store.rotate_journal().await?; @@ -582,7 +593,7 @@ async fn test_compression_during_rotation() -> anyhow::Result<()> { // Verify compressed archive exists let archived_path = temp_dir .path() - .join(format!("test.jrn.v{}.zz", rotation_version)); + .join(format!("test.jrn.t{}.zz", rotation_timestamp)); assert!( archived_path.exists(), "Compressed archive should exist at {:?}", @@ -633,13 +644,16 @@ async fn test_compression_ratio() -> anyhow::Result<()> { } let uncompressed_size = std::fs::metadata(temp_dir.path().join("test.jrn"))?.len(); - let rotation_version = store.current_version(); + let rotation_timestamp = store + .get_with_timestamp(&format!("key{}", 9)) + .map(|tv| tv.timestamp) + .unwrap(); store.rotate_journal().await?; let archived_path = temp_dir .path() - .join(format!("test.jrn.v{}.zz", rotation_version)); + .join(format!("test.jrn.t{}.zz", rotation_timestamp)); let compressed_size = std::fs::metadata(&archived_path)?.len(); // With highly compressible data, we should get significant compression @@ -662,24 +676,28 @@ async fn test_multiple_rotations_with_compression() -> anyhow::Result<()> { let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - let mut rotation_versions = Vec::new(); + let mut rotation_timestamps = Vec::new(); // Perform multiple rotations for i in 0 .. 
3 { let key = format!("key{}", i); let value = Value::String(format!("value{}", i)); - let version = store.insert(key, value).await?; - rotation_versions.push(version); + store.insert(key.clone(), value).await?; + let timestamp = store + .get_with_timestamp(&key) + .map(|tv| tv.timestamp) + .unwrap(); + rotation_timestamps.push(timestamp); store.rotate_journal().await?; } // Verify all compressed archives exist - for version in rotation_versions { - let archived_path = temp_dir.path().join(format!("test.jrn.v{}.zz", version)); + for timestamp in rotation_timestamps { + let archived_path = temp_dir.path().join(format!("test.jrn.t{}.zz", timestamp)); assert!( archived_path.exists(), - "Compressed archive for version {} should exist", - version + "Compressed archive for timestamp {} should exist", + timestamp ); } diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index 2a812eb4..1727b486 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -19,76 +19,6 @@ async fn test_recovery_single_journal() -> anyhow::Result<()> { // Create a store and write some versioned data let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - let v1 = store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - let v2 = store - .insert("key2".to_string(), Value::String("value2".to_string())) - .await?; - let v3 = store - .insert("key1".to_string(), Value::String("updated1".to_string())) - .await?; - store.sync()?; - - // Read the journal data - let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - - // Create recovery utility - let recovery = VersionedRecovery::new(vec![&journal_data])?; - - // Verify version range - let version_range = recovery.version_range(); - assert!(version_range.is_some()); - let (min, max) = version_range.unwrap(); - assert_eq!(min, 1); - assert_eq!(max, v3); - - // Recover at v1: should have only key1=value1 - let state_v1 = recovery.recover_at_version(v1)?; - assert_eq!(state_v1.len(), 1); - assert_eq!( - state_v1.get("key1").map(|tv| &tv.value), - Some(&Value::String("value1".to_string())) - ); - - // Recover at v2: should have key1=value1, key2=value2 - let state_v2 = recovery.recover_at_version(v2)?; - assert_eq!(state_v2.len(), 2); - assert_eq!( - state_v2.get("key1").map(|tv| &tv.value), - Some(&Value::String("value1".to_string())) - ); - assert_eq!( - state_v2.get("key2").map(|tv| &tv.value), - Some(&Value::String("value2".to_string())) - ); - - // Recover at v3: should have key1=updated1, key2=value2 - let state_v3 = recovery.recover_at_version(v3)?; - assert_eq!(state_v3.len(), 2); - assert_eq!( - state_v3.get("key1").map(|tv| &tv.value), - Some(&Value::String("updated1".to_string())) - ); - assert_eq!( - state_v3.get("key2").map(|tv| &tv.value), - Some(&Value::String("value2".to_string())) - ); - - // Recover current should match v3 - let current = recovery.recover_current()?; - assert_eq!(current, state_v3); - - Ok(()) -} - -#[tokio::test] -async fn test_timestamp_collision_across_rotation() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - // Create a store and write data before rotation - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; @@ -97,17 +27,8 @@ async fn test_timestamp_collision_across_rotation() -> anyhow::Result<()> { .map(|tv| 
tv.timestamp) .unwrap(); - // Rotate journal - this captures current state at rotation timestamp - let archive_version = store.current_version(); - store.rotate_journal().await?; + std::thread::sleep(std::time::Duration::from_millis(10)); - // Now simulate a scenario where the system clock goes backwards - // by manually manipulating the journal's last_timestamp - // In a real scenario, this could happen if the system clock is adjusted - // We'll write entries that would have the same timestamp as ts1 if clamping occurs - - // Write new data - in practice, if clock went backwards, these could get clamped - // to the same timestamp as entries in the previous journal store .insert("key2".to_string(), Value::String("value2".to_string())) .await?; @@ -116,103 +37,71 @@ async fn test_timestamp_collision_across_rotation() -> anyhow::Result<()> { .map(|tv| tv.timestamp) .unwrap(); + std::thread::sleep(std::time::Duration::from_millis(10)); + store - .insert("key3".to_string(), Value::String("value3".to_string())) + .insert("key1".to_string(), Value::String("updated1".to_string())) .await?; let ts3 = store - .get_with_timestamp("key3") + .get_with_timestamp("key1") .map(|tv| tv.timestamp) .unwrap(); store.sync()?; - // Read both journals - let archived_path = temp_dir - .path() - .join(format!("test.jrn.v{}.zz", archive_version)); - let archived_data = std::fs::read(&archived_path)?; - let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + // Read the journal data + let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - // Create recovery from both journals - let recovery = VersionedRecovery::new(vec![&archived_data, &active_data])?; + // Create recovery utility + let recovery = VersionedRecovery::new(vec![&journal_data])?; + + // Verify timestamp range + let timestamp_range = recovery.timestamp_range(); + assert!(timestamp_range.is_some()); + let (min, max) = timestamp_range.unwrap(); + assert!(min <= ts1); + assert!(max >= ts3); - // Test recovery behavior when timestamps might collide across journals - // - // Key insight: The recovery should include ALL entries at a given timestamp, - // applying them in version order (which is chronological order). 
- // - // When recovering at ts1: - // - All entries from archived journal with timestamp <= ts1 are included - // - All entries from active journal with timestamp <= ts1 are included - // - If ts2 or ts3 were clamped to ts1 (due to clock going backwards), - // they would also be included - - // Recover at ts1: should include all entries with timestamp <= ts1 + // Recover at ts1: should have only key1=value1 let state_ts1 = recovery.recover_at_timestamp(ts1)?; - - // In normal operation (no clock backwards), only key1 should be at ts1 - assert!(state_ts1.contains_key("key1")); + assert_eq!(state_ts1.len(), 1); assert_eq!( state_ts1.get("key1").map(|tv| &tv.value), Some(&Value::String("value1".to_string())) ); - // Verify timestamp monotonicity is maintained across rotation - assert!( - ts2 >= ts1, - "Timestamps should be monotonically non-decreasing across rotation" + // Recover at ts2: should have key1=value1, key2=value2 + let state_ts2 = recovery.recover_at_timestamp(ts2)?; + assert_eq!(state_ts2.len(), 2); + assert_eq!( + state_ts2.get("key1").map(|tv| &tv.value), + Some(&Value::String("value1".to_string())) ); - assert!( - ts3 >= ts2, - "Timestamps should be monotonically non-decreasing" + assert_eq!( + state_ts2.get("key2").map(|tv| &tv.value), + Some(&Value::String("value2".to_string())) ); - // Test recovery at later timestamps - if ts2 > ts1 { - let state_ts2 = recovery.recover_at_timestamp(ts2)?; - // Should include key1 (from archive) and key2 (from active) - assert_eq!(state_ts2.len(), 2); - assert!(state_ts2.contains_key("key1")); - assert!(state_ts2.contains_key("key2")); - } - - if ts3 > ts2 { - let state_ts3 = recovery.recover_at_timestamp(ts3)?; - // Should include all keys - assert_eq!(state_ts3.len(), 3); - assert!(state_ts3.contains_key("key1")); - assert!(state_ts3.contains_key("key2")); - assert!(state_ts3.contains_key("key3")); - } + // Recover at ts3: should have key1=updated1, key2=value2 + let state_ts3 = recovery.recover_at_timestamp(ts3)?; + assert_eq!(state_ts3.len(), 2); + assert_eq!( + state_ts3.get("key1").map(|tv| &tv.value), + Some(&Value::String("updated1".to_string())) + ); + assert_eq!( + state_ts3.get("key2").map(|tv| &tv.value), + Some(&Value::String("value2".to_string())) + ); - // Edge case: If timestamps were the same (due to clamping), verify "last write wins" - // This is important because recovery processes entries in order, so later versions - // should overwrite earlier ones with the same timestamp - if ts2 == ts1 && ts3 == ts1 { - // All entries have the same timestamp - let state_at_shared_ts = recovery.recover_at_timestamp(ts1)?; - - // All entries should be included since they all have timestamp == ts1 - assert_eq!(state_at_shared_ts.len(), 3); - - // Verify values are from the latest versions (last write wins) - assert_eq!( - state_at_shared_ts.get("key1").map(|tv| &tv.value), - Some(&Value::String("value1".to_string())) - ); - assert_eq!( - state_at_shared_ts.get("key2").map(|tv| &tv.value), - Some(&Value::String("value2".to_string())) - ); - assert_eq!( - state_at_shared_ts.get("key3").map(|tv| &tv.value), - Some(&Value::String("value3".to_string())) - ); - } + // Recover current should match ts3 + let current = recovery.recover_current()?; + assert_eq!(current, state_ts3); Ok(()) } + #[tokio::test] async fn test_recover_current_only_needs_last_journal() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; @@ -229,7 +118,6 @@ async fn test_recover_current_only_needs_last_journal() -> anyhow::Result<()> { .await?; // First rotation - 
let _archive1_version = store.current_version(); store.rotate_journal().await?; // Update key1 and add key3 @@ -241,7 +129,6 @@ async fn test_recover_current_only_needs_last_journal() -> anyhow::Result<()> { .await?; // Second rotation - let _archive2_version = store.current_version(); store.rotate_journal().await?; // Add more data and delete key2 @@ -261,7 +148,7 @@ async fn test_recover_current_only_needs_last_journal() -> anyhow::Result<()> { .filter_map(|entry| { let entry = entry.ok()?; let path = entry.path(); - if path.file_name()?.to_str()?.starts_with("test.jrn.v") { + if path.file_name()?.to_str()?.starts_with("test.jrn.t") { Some(path) } else { None @@ -327,13 +214,23 @@ async fn test_detection_compressed_journal() -> anyhow::Result<()> { store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; - let archive_version = store.current_version(); store.rotate_journal().await?; - let archived_path = temp_dir - .path() - .join(format!("test.jrn.v{}.zz", archive_version)); - let compressed_data = std::fs::read(&archived_path)?; + // Find the archived file + let mut archived_files = std::fs::read_dir(temp_dir.path())? + .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.file_name()?.to_str()?.starts_with("test.jrn.t") { + Some(path) + } else { + None + } + }) + .collect::>(); + archived_files.sort(); + let archived_path = archived_files.first().unwrap(); + let compressed_data = std::fs::read(archived_path)?; // Verify it starts with zlib magic bytes (0x78) assert_eq!( @@ -492,10 +389,16 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { // Create a store with larger buffer to avoid BufferFull errors during test let mut store = VersionedKVStore::new(temp_dir.path(), "test", 2048, None)?; - // Write data that will trigger rotation - let v1 = store + store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; + let ts1 = store + .get_with_timestamp("key1") + .map(|tv| tv.timestamp) + .unwrap(); + + std::thread::sleep(std::time::Duration::from_millis(10)); + store .insert("key2".to_string(), Value::String("value2".to_string())) .await?; @@ -505,15 +408,24 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { store.insert(format!("key{i}"), Value::Signed(i)).await?; } - let v_middle = store.current_version(); + let ts_middle = store + .get_with_timestamp("key19") + .map(|tv| tv.timestamp) + .unwrap(); + + std::thread::sleep(std::time::Duration::from_millis(10)); // Write more after rotation - let v_final = store + store .insert( "final".to_string(), Value::String("final_value".to_string()), ) .await?; + let ts_final = store + .get_with_timestamp("final") + .map(|tv| tv.timestamp) + .unwrap(); store.sync()?; // Read all journal files @@ -524,7 +436,7 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { .filter_map(|entry| { let entry = entry.ok()?; let path = entry.path(); - if path.file_name()?.to_str()?.starts_with("test.jrn.v") { + if path.file_name()?.to_str()?.starts_with("test.jrn.t") { Some(path) } else { None @@ -543,19 +455,19 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { let journal_refs: Vec<&[u8]> = all_journals.iter().map(std::vec::Vec::as_slice).collect(); let recovery = VersionedRecovery::new(journal_refs)?; - // Verify we can recover at early version - let state_v1 = recovery.recover_at_version(v1)?; - assert_eq!(state_v1.len(), 1); - 
assert!(state_v1.contains_key("key1")); + // Verify we can recover at early timestamp + let state_ts1 = recovery.recover_at_timestamp(ts1)?; + assert_eq!(state_ts1.len(), 1); + assert!(state_ts1.contains_key("key1")); - // Verify we can recover at middle version (after rotation) - let state_middle = recovery.recover_at_version(v_middle)?; + // Verify we can recover at middle timestamp (after rotation) + let state_middle = recovery.recover_at_timestamp(ts_middle)?; assert!(state_middle.len() > 2); assert!(state_middle.contains_key("key1")); assert!(state_middle.contains_key("key2")); - // Verify we can recover at final version - let state_final = recovery.recover_at_version(v_final)?; + // Verify we can recover at final timestamp + let state_final = recovery.recover_at_timestamp(ts_final)?; assert!(state_final.contains_key("final")); assert_eq!( state_final.get("final").map(|tv| &tv.value), @@ -577,21 +489,19 @@ async fn test_recovery_empty_journal() -> anyhow::Result<()> { let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; let recovery = VersionedRecovery::new(vec![&journal_data])?; - // Should have version range starting at 1 - let version_range = recovery.version_range(); - assert!(version_range.is_some()); - let (min, _max) = version_range.unwrap(); - assert_eq!(min, 1); + // Should have timestamp range starting at base + let timestamp_range = recovery.timestamp_range(); + assert!(timestamp_range.is_some()); - // Recovering at any version should return empty map - let state = recovery.recover_at_version(1)?; + // Recovering current should return empty map + let state = recovery.recover_current()?; assert_eq!(state.len(), 0); Ok(()) } #[tokio::test] -async fn test_recovery_version_range() -> anyhow::Result<()> { +async fn test_recovery_timestamp_range() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; @@ -599,22 +509,37 @@ async fn test_recovery_version_range() -> anyhow::Result<()> { store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; + let ts1 = store + .get_with_timestamp("key1") + .map(|tv| tv.timestamp) + .unwrap(); + + std::thread::sleep(std::time::Duration::from_millis(10)); + store .insert("key2".to_string(), Value::String("value2".to_string())) .await?; - let v3 = store + + std::thread::sleep(std::time::Duration::from_millis(10)); + + store .insert("key3".to_string(), Value::String("value3".to_string())) .await?; + let ts3 = store + .get_with_timestamp("key3") + .map(|tv| tv.timestamp) + .unwrap(); + store.sync()?; let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; let recovery = VersionedRecovery::new(vec![&journal_data])?; - let version_range = recovery.version_range(); - assert!(version_range.is_some()); - let (min, max) = version_range.unwrap(); - assert_eq!(min, 1); // base_version defaults to 1 for new stores - assert_eq!(max, v3); + let timestamp_range = recovery.timestamp_range(); + assert!(timestamp_range.is_some()); + let (min, max) = timestamp_range.unwrap(); + assert!(min <= ts1); + assert!(max >= ts3); Ok(()) } @@ -625,30 +550,49 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - let v1 = store.insert("key".to_string(), Value::Signed(1)).await?; - let v2 = store.insert("key".to_string(), Value::Signed(2)).await?; - let v3 = store.insert("key".to_string(), Value::Signed(3)).await?; + store.insert("key".to_string(), Value::Signed(1)).await?; + let ts1 = store + .get_with_timestamp("key") + .map(|tv| 
tv.timestamp) + .unwrap(); + + std::thread::sleep(std::time::Duration::from_millis(10)); + + store.insert("key".to_string(), Value::Signed(2)).await?; + let ts2 = store + .get_with_timestamp("key") + .map(|tv| tv.timestamp) + .unwrap(); + + std::thread::sleep(std::time::Duration::from_millis(10)); + + store.insert("key".to_string(), Value::Signed(3)).await?; + let ts3 = store + .get_with_timestamp("key") + .map(|tv| tv.timestamp) + .unwrap(); + store.sync()?; let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; let recovery = VersionedRecovery::new(vec![&journal_data])?; - // Each version should show the value at that time - let state_v1 = recovery.recover_at_version(v1)?; + // Each timestamp should show the value at that time + let state_ts1 = recovery.recover_at_timestamp(ts1)?; assert_eq!( - state_v1.get("key").map(|tv| &tv.value), + state_ts1.get("key").map(|tv| &tv.value), Some(&Value::Signed(1)) ); - let state_v2 = recovery.recover_at_version(v2)?; + let state_ts2 = recovery.recover_at_timestamp(ts2)?; assert_eq!( - state_v2.get("key").map(|tv| &tv.value), + state_ts2.get("key").map(|tv| &tv.value), Some(&Value::Signed(2)) ); - let state_v3 = recovery.recover_at_version(v3)?; + let state_ts3 = recovery.recover_at_timestamp(ts3)?; assert_eq!( - state_v3.get("key").map(|tv| &tv.value), + state_ts3.get("key").map(|tv| &tv.value), Some(&Value::Signed(3)) ); @@ -671,13 +615,12 @@ async fn test_recovery_various_value_types() -> anyhow::Result<()> { .insert("float".to_string(), Value::Float(3.14)) .await?; store.insert("bool".to_string(), Value::Bool(true)).await?; - let v_final = store.current_version(); store.sync()?; let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; let recovery = VersionedRecovery::new(vec![&journal_data])?; - let state = recovery.recover_at_version(v_final)?; + let state = recovery.recover_current()?; assert_eq!(state.len(), 4); assert_eq!( state.get("string").map(|tv| &tv.value), @@ -706,63 +649,89 @@ async fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { // Create a store and write some data let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - let v1 = store + store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; - let v2 = store + let ts1 = store + .get_with_timestamp("key1") + .map(|tv| tv.timestamp) + .unwrap(); + + std::thread::sleep(std::time::Duration::from_millis(10)); + + store .insert("key2".to_string(), Value::String("value2".to_string())) .await?; - store.sync()?; + let ts2 = store + .get_with_timestamp("key2") + .map(|tv| tv.timestamp) + .unwrap(); - // Get the current version before rotation (this will be used in the archive name) - let archive_version = store.current_version(); + store.sync()?; // Rotate to create compressed archive store.rotate_journal().await?; // Add more data to active journal - let v3 = store + std::thread::sleep(std::time::Duration::from_millis(10)); + + store .insert("key3".to_string(), Value::String("value3".to_string())) .await?; + let ts3 = store + .get_with_timestamp("key3") + .map(|tv| tv.timestamp) + .unwrap(); + store.sync()?; - // Find the compressed archive (using the version at the time of rotation) - let archived_path = temp_dir - .path() - .join(format!("test.jrn.v{}.zz", archive_version)); + // Find the compressed archive + let mut archived_files = std::fs::read_dir(temp_dir.path())? 
+ .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.file_name()?.to_str()?.starts_with("test.jrn.t") { + Some(path) + } else { + None + } + }) + .collect::>(); + archived_files.sort(); + let archived_path = archived_files.first().unwrap(); assert!(archived_path.exists(), "Compressed archive should exist"); // Read both journals - let compressed_data = std::fs::read(&archived_path)?; + let compressed_data = std::fs::read(archived_path)?; let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; // Create recovery from both journals (compressed first, then active) let recovery = VersionedRecovery::new(vec![&compressed_data, &active_data])?; - // Verify version range spans both journals - let version_range = recovery.version_range(); - assert!(version_range.is_some()); - let (min, max) = version_range.unwrap(); - assert_eq!(min, 1); - assert_eq!(max, v3); + // Verify timestamp range spans both journals + let timestamp_range = recovery.timestamp_range(); + assert!(timestamp_range.is_some()); + let (min, max) = timestamp_range.unwrap(); + assert!(min <= ts1); + assert!(max >= ts3); - // Recover at v1 (should be in compressed archive) - let state_v1 = recovery.recover_at_version(v1)?; - assert_eq!(state_v1.len(), 1); + // Recover at ts1 (should be in compressed archive) + let state_ts1 = recovery.recover_at_timestamp(ts1)?; + assert_eq!(state_ts1.len(), 1); assert_eq!( - state_v1.get("key1").map(|tv| &tv.value), + state_ts1.get("key1").map(|tv| &tv.value), Some(&Value::String("value1".to_string())) ); - // Recover at v2 (should be in compressed archive) - let state_v2 = recovery.recover_at_version(v2)?; - assert_eq!(state_v2.len(), 2); + // Recover at ts2 (should be in compressed archive) + let state_ts2 = recovery.recover_at_timestamp(ts2)?; + assert_eq!(state_ts2.len(), 2); - // Recover at v3 (should include data from both archives and active journal) - let state_v3 = recovery.recover_at_version(v3)?; - assert_eq!(state_v3.len(), 3); + // Recover at ts3 (should include data from both archives and active journal) + let state_ts3 = recovery.recover_at_timestamp(ts3)?; + assert_eq!(state_ts3.len(), 3); assert_eq!( - state_v3.get("key3").map(|tv| &tv.value), + state_ts3.get("key3").map(|tv| &tv.value), Some(&Value::String("value3".to_string())) ); @@ -776,53 +745,79 @@ async fn test_recovery_from_multiple_compressed_archives() -> anyhow::Result<()> // Create a store and perform multiple rotations let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - let v1 = store + store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; - let archive1_version = store.current_version(); + let ts1 = store + .get_with_timestamp("key1") + .map(|tv| tv.timestamp) + .unwrap(); + store.rotate_journal().await?; - let v2 = store + std::thread::sleep(std::time::Duration::from_millis(10)); + + store .insert("key2".to_string(), Value::String("value2".to_string())) .await?; - let archive2_version = store.current_version(); + let ts2 = store + .get_with_timestamp("key2") + .map(|tv| tv.timestamp) + .unwrap(); + store.rotate_journal().await?; - let v3 = store + std::thread::sleep(std::time::Duration::from_millis(10)); + + store .insert("key3".to_string(), Value::String("value3".to_string())) .await?; + let ts3 = store + .get_with_timestamp("key3") + .map(|tv| tv.timestamp) + .unwrap(); + store.sync()?; // Collect all journal data (2 compressed + 1 active) - let archive1_path = temp_dir - .path() - .join(format!("test.jrn.v{}.zz", 
archive1_version)); - let archive2_path = temp_dir - .path() - .join(format!("test.jrn.v{}.zz", archive2_version)); + let mut archived_files = std::fs::read_dir(temp_dir.path())? + .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.file_name()?.to_str()?.starts_with("test.jrn.t") { + Some(path) + } else { + None + } + }) + .collect::>(); + archived_files.sort(); - let archive1_data = std::fs::read(&archive1_path)?; - let archive2_data = std::fs::read(&archive2_path)?; + let archive1_path = &archived_files[0]; + let archive2_path = &archived_files[1]; + + let archive1_data = std::fs::read(archive1_path)?; + let archive2_data = std::fs::read(archive2_path)?; let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; // Create recovery from all journals let recovery = VersionedRecovery::new(vec![&archive1_data, &archive2_data, &active_data])?; - // Verify we can recover at any version - let state_v1 = recovery.recover_at_version(v1)?; - assert_eq!(state_v1.len(), 1); - assert!(state_v1.contains_key("key1")); + // Verify we can recover at any timestamp + let state_ts1 = recovery.recover_at_timestamp(ts1)?; + assert_eq!(state_ts1.len(), 1); + assert!(state_ts1.contains_key("key1")); - let state_v2 = recovery.recover_at_version(v2)?; - assert_eq!(state_v2.len(), 2); - assert!(state_v2.contains_key("key1")); - assert!(state_v2.contains_key("key2")); + let state_ts2 = recovery.recover_at_timestamp(ts2)?; + assert_eq!(state_ts2.len(), 2); + assert!(state_ts2.contains_key("key1")); + assert!(state_ts2.contains_key("key2")); - let state_v3 = recovery.recover_at_version(v3)?; - assert_eq!(state_v3.len(), 3); - assert!(state_v3.contains_key("key1")); - assert!(state_v3.contains_key("key2")); - assert!(state_v3.contains_key("key3")); + let state_ts3 = recovery.recover_at_timestamp(ts3)?; + assert_eq!(state_ts3.len(), 3); + assert!(state_ts3.contains_key("key1")); + assert!(state_ts3.contains_key("key2")); + assert!(state_ts3.contains_key("key3")); Ok(()) } @@ -834,31 +829,44 @@ async fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> // Create initial store and archive (will be compressed) let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - let _v1 = store + let _ts1 = store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; store.sync()?; - let archive_version = store.current_version(); store.rotate_journal().await?; // Get compressed archive - let compressed_archive_path = temp_dir - .path() - .join(format!("test.jrn.v{}.zz", archive_version)); - let compressed_data = std::fs::read(&compressed_archive_path)?; + let mut archived_files = std::fs::read_dir(temp_dir.path())? 
+ .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.file_name()?.to_str()?.starts_with("test.jrn.t") { + Some(path) + } else { + None + } + }) + .collect::>(); + archived_files.sort(); + let compressed_archive_path = archived_files.first().unwrap(); + let compressed_data = std::fs::read(compressed_archive_path)?; // Create uncompressed journal data manually let mut uncompressed_store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - let v2 = uncompressed_store + uncompressed_store .insert("key2".to_string(), Value::String("value2".to_string())) .await?; + let ts2 = uncompressed_store + .get_with_timestamp("key2") + .map(|tv| tv.timestamp) + .unwrap(); uncompressed_store.sync()?; let uncompressed_data = std::fs::read(temp_dir.path().join("test.jrn"))?; // Recovery should handle both compressed and uncompressed let recovery = VersionedRecovery::new(vec![&compressed_data, &uncompressed_data])?; - let state_final = recovery.recover_at_version(v2)?; + let state_final = recovery.recover_at_timestamp(ts2)?; assert_eq!(state_final.len(), 2); assert!(state_final.contains_key("key1")); assert!(state_final.contains_key("key2")); @@ -872,7 +880,7 @@ async fn test_recovery_at_timestamp() -> anyhow::Result<()> { // Create a store and write some timestamped data let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - + store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; @@ -979,7 +987,7 @@ async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { .unwrap(); // Rotate journal - let archive_version = store.current_version(); + let rotation_ts = ts2; store.rotate_journal().await?; std::thread::sleep(std::time::Duration::from_millis(10)); @@ -998,7 +1006,7 @@ async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { // Read both journals let archived_path = temp_dir .path() - .join(format!("test.jrn.v{}.zz", archive_version)); + .join(format!("test.jrn.t{}.zz", rotation_ts)); let archived_data = std::fs::read(&archived_path)?; let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; @@ -1038,7 +1046,7 @@ async fn test_timestamp_range() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - + store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; @@ -1071,11 +1079,11 @@ async fn test_timestamp_range() -> anyhow::Result<()> { let timestamp_range = recovery.timestamp_range(); assert!(timestamp_range.is_some()); let (min, max) = timestamp_range.unwrap(); - + // Min should be <= first timestamp, max should be >= last timestamp assert!(min <= ts1); assert!(max >= ts3); - + // Timestamps should be ordered assert!(ts3 > ts1); @@ -1090,34 +1098,45 @@ async fn test_recovery_decompression_transparent() -> anyhow::Result<()> { // Create store with compressible data let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; let compressible = "A".repeat(500); - let v1 = store + store .insert("data".to_string(), Value::String(compressible.clone())) .await?; + let ts1 = store + .get_with_timestamp("data") + .map(|tv| tv.timestamp) + .unwrap(); store.sync()?; // Create uncompressed recovery baseline let uncompressed_data = std::fs::read(temp_dir.path().join("test.jrn"))?; let recovery_uncompressed = VersionedRecovery::new(vec![&uncompressed_data])?; - let state_uncompressed = recovery_uncompressed.recover_at_version(v1)?; - - // Get archive version before 
rotation - let archive_version = store.current_version(); + let state_uncompressed = recovery_uncompressed.recover_at_timestamp(ts1)?; // Rotate to compress store.rotate_journal().await?; // Read compressed archive - let compressed_path = temp_dir - .path() - .join(format!("test.jrn.v{}.zz", archive_version)); - let compressed_data = std::fs::read(&compressed_path)?; + let mut archived_files = std::fs::read_dir(temp_dir.path())? + .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.file_name()?.to_str()?.starts_with("test.jrn.t") { + Some(path) + } else { + None + } + }) + .collect::>(); + archived_files.sort(); + let compressed_path = archived_files.first().unwrap(); + let compressed_data = std::fs::read(compressed_path)?; // Verify it's actually compressed (smaller) assert!(compressed_data.len() < uncompressed_data.len()); // Create recovery from compressed data let recovery_compressed = VersionedRecovery::new(vec![&compressed_data])?; - let state_compressed = recovery_compressed.recover_at_version(v1)?; + let state_compressed = recovery_compressed.recover_at_timestamp(ts1)?; // Both should produce identical results assert_eq!(state_uncompressed.len(), state_compressed.len()); @@ -1132,3 +1151,66 @@ async fn test_recovery_decompression_transparent() -> anyhow::Result<()> { Ok(()) } + +#[tokio::test] +async fn test_base_timestamp_validation() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; + + // Create a store and perform rotation to get proper sequential journals + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + + // Add initial data + store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + store.sync()?; + + // Rotate to create first archived journal + store.rotate_journal().await?; + + std::thread::sleep(std::time::Duration::from_millis(50)); + + // Add more data after rotation + store + .insert("key2".to_string(), Value::String("value2".to_string())) + .await?; + store.sync()?; + + // Read both journals (archived + active) + let mut archived_files = std::fs::read_dir(temp_dir.path())? 
+ .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.file_name()?.to_str()?.starts_with("test.jrn.t") { + Some(path) + } else { + None + } + }) + .collect::>(); + archived_files.sort(); + + let archived_data = std::fs::read(archived_files.first().unwrap())?; + let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + + // Should succeed when journals are in correct chronological order (archived, then active) + let recovery = VersionedRecovery::new(vec![&archived_data, &active_data]); + assert!(recovery.is_ok(), "Should succeed with correct ordering"); + + // Should fail when journals are in wrong order (active before archived) + let recovery_reversed = VersionedRecovery::new(vec![&active_data, &archived_data]); + assert!( + recovery_reversed.is_err(), + "Should fail when base_timestamp ordering is violated" + ); + + let err = recovery_reversed.unwrap_err(); + let err_msg = err.to_string(); + assert!( + err_msg.contains("base_timestamp") && err_msg.contains("max_timestamp"), + "Error should mention base_timestamp and max_timestamp validation, got: {}", + err_msg + ); + + Ok(()) +} diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index 5b82538e..052a0529 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -15,7 +15,7 @@ use std::path::{Path, PathBuf}; /// The callback receives: /// - `old_journal_path`: The path to the archived journal file that was just rotated out /// - `new_journal_path`: The path to the new active journal file -/// - `rotation_version`: The version at which rotation occurred (snapshot version) +/// - `rotation_timestamp`: The timestamp at which rotation occurred (snapshot timestamp) /// /// This callback can be used to trigger asynchronous upload of archived journals to remote /// storage, perform cleanup, or other post-rotation operations. @@ -44,11 +44,10 @@ async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result .await? } -/// A persistent key-value store with version and timestamp tracking. +/// A persistent key-value store with timestamp tracking. /// -/// `VersionedKVStore` provides HashMap-like semantics backed by a versioned journal that -/// assigns both a version number and a monotonically increasing timestamp to each write -/// operation. This enables: +/// `VersionedKVStore` provides HashMap-like semantics backed by a timestamped journal that +/// assigns a monotonically increasing timestamp to each write operation. This enables: /// - Audit logs with timestamp tracking for every write (timestamps serve as logical clocks) /// - Point-in-time recovery at any historical timestamp /// - Correlation with external timestamped event streams @@ -62,18 +61,32 @@ async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result /// - If system clock goes backward, timestamps are clamped to maintain ordering /// - Multiple operations may share the same timestamp if system clock hasn't advanced /// - Enables natural correlation with timestamped event buffers for upload -/// - Version numbers provide secondary ordering and backward compatibility /// /// For performance optimization, `VersionedKVStore` maintains an in-memory cache of the /// current key-value data to provide O(1) read operations and avoid expensive journal /// decoding on every access. /// /// # Rotation Strategy -/// When the journal reaches its high water mark, the store automatically: -/// 1. 
Creates a new journal file with a rotated name (e.g., `store.jrn.v12345`) -/// 2. Writes the current state as versioned entries at the rotation version -/// 3. Archives the old journal for potential upload/cleanup -/// 4. Continues normal operations in the new journal +/// +/// When the journal reaches its high water mark, the store automatically rotates to a new journal. +/// The rotation process creates a snapshot of the current state while preserving timestamp +/// semantics for accurate point-in-time recovery. +/// +/// ## Rotation Process +/// 1. Computes `rotation_timestamp` = max timestamp of all current entries +/// 2. Archives old journal as `.jrn.t.zz` (compressed) +/// 3. Creates new journal with `base_timestamp = rotation_timestamp` +/// 4. Writes compacted state with **original timestamps preserved** +/// 5. Continues normal operations in the new journal +/// +/// ## Timestamp Semantics Across Snapshots +/// +/// Compacted entries in the new journal preserve their original timestamps, which means entry +/// timestamps may overlap across adjacent snapshots. The filename timestamp (`t300`, `t500`) +/// represents the rotation point (snapshot boundary), not the minimum timestamp of entries. +/// +/// For detailed information about timestamp semantics, recovery bucketing, and invariants, +/// see the `VersionedRecovery` documentation. /// /// # Example /// ```ignore @@ -82,9 +95,9 @@ async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result /// /// let mut store = VersionedKVStore::new("/path/to/dir", "mystore", 1024 * 1024, None)?; /// -/// // Insert with version tracking -/// let v1 = store.insert("key1".to_string(), Value::from(42))?; -/// let v2 = store.insert("key2".to_string(), Value::from("hello"))?; +/// // Insert with timestamp tracking +/// let t1 = store.insert("key1".to_string(), Value::from(42))?; +/// let t2 = store.insert("key2".to_string(), Value::from("hello"))?; /// ``` pub struct VersionedKVStore { journal: MemMappedVersionedKVJournal, @@ -218,24 +231,24 @@ impl VersionedKVStore { self.cached_map.get(key) } - /// Insert a value for a key, returning the version number assigned to this write. + /// Insert a value for a key, returning the timestamp assigned to this write. /// /// Note: Inserting `Value::Null` is equivalent to removing the key. /// /// # Errors /// Returns an error if the value cannot be written to the journal. pub async fn insert(&mut self, key: String, value: Value) -> anyhow::Result { - let version = if matches!(value, Value::Null) { + let timestamp = if matches!(value, Value::Null) { // Inserting null is equivalent to deletion - let (version, _timestamp) = self.journal.delete_versioned(&key)?; + let timestamp = self.journal.delete_versioned(&key)?; self.cached_map.remove(&key); - version + timestamp } else { - let (version, timestamp) = self.journal.set_versioned(&key, &value)?; + let timestamp = self.journal.set_versioned(&key, &value)?; self .cached_map .insert(key, TimestampedValue { value, timestamp }); - version + timestamp }; // Check if rotation is needed @@ -243,12 +256,12 @@ impl VersionedKVStore { self.rotate_journal().await?; } - Ok(version) + Ok(timestamp) } - /// Remove a key and return the version number assigned to this deletion. + /// Remove a key and return the timestamp assigned to this deletion. /// - /// Returns `None` if the key didn't exist, otherwise returns the version number. + /// Returns `None` if the key didn't exist, otherwise returns the timestamp. 
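The timestamp semantics documented above (wall-clock nanoseconds, clamped so a write's timestamp never decreases, with collisions allowed when the clock stalls or moves backwards) can be illustrated with a small standalone sketch. The helper name `next_write_timestamp` is hypothetical and only mirrors the rule the doc comment describes; it is not the crate's internal implementation.

```rust
use std::time::{SystemTime, UNIX_EPOCH};

/// Hypothetical illustration of the clamping rule: a write's timestamp is the
/// current wall clock, but never less than the previous write's timestamp.
fn next_write_timestamp(last_timestamp: u64) -> u64 {
    let now_ns = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|d| u64::try_from(d.as_nanos()).unwrap_or(u64::MAX))
        .unwrap_or(0);
    // If the system clock did not advance (or went backwards), reuse the
    // previous timestamp instead of decreasing; collisions are acceptable
    // because journal order still provides total ordering.
    now_ns.max(last_timestamp)
}

fn main() {
    let mut last = 0u64;
    for _ in 0 .. 4 {
        let ts = next_write_timestamp(last);
        assert!(ts >= last, "timestamps must be monotonically non-decreasing");
        last = ts;
    }
    println!("last timestamp: {last}");
}
```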
/// /// # Errors /// Returns an error if the deletion cannot be written to the journal. @@ -257,7 +270,7 @@ impl VersionedKVStore { return Ok(None); } - let (version, _timestamp) = self.journal.delete_versioned(key)?; + let timestamp = self.journal.delete_versioned(key)?; self.cached_map.remove(key); // Check if rotation is needed @@ -265,7 +278,7 @@ impl VersionedKVStore { self.rotate_journal().await?; } - Ok(Some(version)) + Ok(Some(timestamp)) } /// Check if the store contains a key. @@ -316,18 +329,6 @@ impl VersionedKVStore { .collect() } - /// Get the current version number. - #[must_use] - pub fn current_version(&self) -> u64 { - self.journal.current_version() - } - - /// Get the base version (first version in this journal). - #[must_use] - pub fn base_version(&self) -> u64 { - self.journal.base_version() - } - /// Synchronize changes to disk. /// /// This is a blocking operation that performs synchronous I/O. In async contexts, @@ -361,13 +362,19 @@ impl VersionedKVStore { /// # Errors /// Returns an error if rotation fails. pub async fn rotate_journal(&mut self) -> anyhow::Result<()> { - let rotation_version = self.journal.current_version(); + // Get the maximum timestamp from current state for rotation tracking + let rotation_timestamp = self + .cached_map + .values() + .map(|tv| tv.timestamp) + .max() + .unwrap_or(0); - // Generate archived journal path with rotation version (compressed) - let archived_path = self.generate_archived_path(rotation_version); + // Generate archived journal path with rotation timestamp (compressed) + let archived_path = self.generate_archived_path(rotation_timestamp); // Create new journal with rotated state - let new_journal = self.create_rotated_journal(rotation_version).await?; + let new_journal = self.create_rotated_journal(rotation_timestamp).await?; // Replace old journal with new one let old_journal = std::mem::replace(&mut self.journal, new_journal); @@ -390,25 +397,25 @@ impl VersionedKVStore { // Invoke rotation callback if set if let Some(ref mut callback) = self.rotation_callback { - callback(&archived_path, &journal_path, rotation_version); + callback(&archived_path, &journal_path, rotation_timestamp); } Ok(()) } - /// Generate the archived journal path for a given rotation version. + /// Generate the archived journal path for a given rotation timestamp. /// Archived journals use the .zz extension to indicate zlib compression. - fn generate_archived_path(&self, rotation_version: u64) -> PathBuf { + fn generate_archived_path(&self, rotation_timestamp: u64) -> PathBuf { self.dir_path.join(format!( - "{}.jrn.v{}.zz", - self.journal_name, rotation_version + "{}.jrn.t{}.zz", + self.journal_name, rotation_timestamp )) } /// Create a new rotated journal with compacted state. 
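As a companion to `rotate_journal` above, the rotation bookkeeping can be sketched standalone: pick the maximum entry timestamp as the rotation point, derive the `.jrn.t<ts>.zz` archive name, and carry the compacted entries into the new journal with their original timestamps. `plan_rotation` and `RotationPlan` are hypothetical names used only to restate the documented steps, not the actual implementation.

```rust
use std::collections::HashMap;

#[derive(Clone, Debug)]
struct TimestampedValue {
    value: String,
    timestamp: u64,
}

/// Hypothetical summary of one rotation: archive filename, new base_timestamp,
/// and the compacted entries (original timestamps preserved).
struct RotationPlan {
    archived_name: String,
    new_base_timestamp: u64,
    compacted: Vec<(String, TimestampedValue)>,
}

fn plan_rotation(journal_name: &str, state: &HashMap<String, TimestampedValue>) -> RotationPlan {
    // Step 1: the rotation point is the maximum timestamp currently in the store.
    let rotation_timestamp = state.values().map(|tv| tv.timestamp).max().unwrap_or(0);
    // Step 2: archived journals are named `<name>.jrn.t<rotation_timestamp>.zz`.
    let archived_name = format!("{journal_name}.jrn.t{rotation_timestamp}.zz");
    // Steps 3-4: the new journal starts at the rotation point and re-writes the
    // current state, keeping each entry's *original* timestamp.
    let compacted = state
        .iter()
        .map(|(k, tv)| (k.clone(), tv.clone()))
        .collect();
    RotationPlan {
        archived_name,
        new_base_timestamp: rotation_timestamp,
        compacted,
    }
}

fn main() {
    let state = HashMap::from([
        ("foo".to_string(), TimestampedValue { value: "a".into(), timestamp: 100 }),
        ("bar".to_string(), TimestampedValue { value: "b".into(), timestamp: 300 }),
    ]);
    let plan = plan_rotation("store", &state);
    assert_eq!(plan.archived_name, "store.jrn.t300.zz");
    assert_eq!(plan.new_base_timestamp, 300);
    assert_eq!(plan.compacted.len(), 2);
}
```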
async fn create_rotated_journal( &self, - rotation_version: u64, + base_timestamp: u64, ) -> anyhow::Result { // Create temporary journal file let temp_path = self.dir_path.join(format!("{}.jrn.tmp", self.journal_name)); @@ -419,7 +426,7 @@ impl VersionedKVStore { // Use VersionedKVJournal to create rotated journal in memory let _rotated = VersionedKVJournal::create_rotated_journal( &mut buffer, - rotation_version, + base_timestamp, &self.cached_map, self.high_water_mark_ratio, )?; diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_recovery.rs index 061b448a..376aa815 100644 --- a/bd-resilient-kv/src/versioned_recovery.rs +++ b/bd-resilient-kv/src/versioned_recovery.rs @@ -33,18 +33,73 @@ fn read_u64_field(obj: &AHashMap, key: &str) -> Option { } } -/// A utility for recovering state at arbitrary versions or timestamps from raw journal data. +/// A utility for recovering state at arbitrary timestamps from raw journal data. /// -/// This utility operates on raw byte slices from versioned journals and can reconstruct -/// the key-value state at any historical version or timestamp by replaying journal entries. +/// This utility operates on raw byte slices from timestamped journals and can reconstruct +/// the key-value state at any historical timestamp by replaying journal entries. /// /// # Timestamp-Based Recovery /// -/// The primary use case is timestamp-based recovery, which enables correlation with -/// external timestamped event streams. Timestamps are monotonically non-decreasing logical -/// clocks (not pure wall time), enabling snapshots that match specific event buffer timestamps. +/// Timestamps are monotonically non-decreasing logical clocks (not pure wall time), +/// enabling snapshots that match specific event buffer timestamps. /// -/// Version-based recovery is also supported for backward compatibility. +/// ## Snapshot Bucketing and Entry Timestamp Overlaps +/// +/// Entry timestamps may overlap across adjacent snapshots because compacted entries preserve +/// their original timestamps during rotation. This design provides implementation simplicity +/// and audit trail preservation without affecting recovery correctness. +/// +/// **Design rationale:** Preserving original timestamps is not strictly required for +/// point-in-time state reconstruction, but provides benefits at zero cost: +/// - **Implementation simplicity**: No timestamp rewriting logic needed during rotation +/// - **Semantic accuracy**: Maintains "when was this value last modified" for audit trails +/// - **Future-proof**: Preserves historical information that may become useful +/// +/// Each snapshot has: +/// - `base_timestamp`: Stored in metadata, used for monotonic write enforcement +/// - `min_timestamp`: Minimum entry timestamp in the snapshot (from actual entries) +/// - `max_timestamp`: Maximum entry timestamp in the snapshot (from actual entries) +/// - Filename timestamp: The rotation point (equals `max_timestamp` of archived journal) +/// +/// Example timeline: +/// ```text +/// Snapshot 1: store.jrn.t300.zz +/// - base_timestamp: 0 +/// - Entries: foo@100, bar@200, foo@300 +/// - min_timestamp: 100, max_timestamp: 300 +/// - Range: [100, 300] +/// +/// Snapshot 2: store.jrn.t500.zz +/// - base_timestamp: 300 (rotation point of Snapshot 1) +/// - Compacted entries: foo@300, bar@200 (original timestamps!) +/// - New entries: baz@400, qux@500 +/// - min_timestamp: 200, max_timestamp: 500 +/// - Range: [200, 500] — overlaps with [100, 300]! 
+/// ``` +/// +/// ## Recovery Bucketing Model +/// +/// To recover state for multiple logs at different timestamps efficiently: +/// +/// 1. **Bucket logs by snapshot:** Compare log timestamp against each snapshot's `[min_timestamp, +/// max_timestamp]` range +/// 2. **Sequential replay:** For each bucket, replay journals sequentially up to target timestamp +/// 3. **State reconstruction:** Overlapping timestamps are handled correctly because compacted +/// entries represent the state at rotation time +/// +/// Example: Recovering logs at timestamps [100, 250, 400, 500] +/// - Log@100: Use Snapshot 1 (100 is in range [100, 300]) +/// - Log@250: Use Snapshot 1 (250 is in range [100, 300]) +/// - Log@400: Use Snapshot 2 (400 is in range [200, 500], replay compacted state + new entries) +/// - Log@500: Use Snapshot 2 (500 is in range [200, 500]) +/// +/// ## Invariants +/// +/// - `base_timestamp` values are non-decreasing across snapshots +/// - Filename timestamps strictly increase (t300 < t500) +/// - `base_timestamp[i]` >= `max_timestamp[i-1]` (validated in `new()`) +/// - Entry timestamp ranges may overlap between adjacent snapshots +/// - Sequential replay produces correct state at any timestamp /// /// Supports both compressed (zlib) and uncompressed journals. Compressed journals are /// automatically detected and decompressed transparently. @@ -56,8 +111,7 @@ pub struct VersionedRecovery { #[derive(Debug)] struct JournalInfo { data: Vec, - base_version: u64, - max_version: u64, + base_timestamp: u64, min_timestamp: u64, max_timestamp: u64, } @@ -78,17 +132,31 @@ impl VersionedRecovery { for data in journals { // Detect and decompress if needed let decompressed = decompress_if_needed(data)?; - let (base_version, max_version) = extract_version_range(&decompressed)?; - let (min_timestamp, max_timestamp) = extract_timestamp_range(&decompressed)?; + let (base_timestamp, min_timestamp, max_timestamp) = extract_timestamp_range(&decompressed)?; journal_infos.push(JournalInfo { data: decompressed, - base_version, - max_version, + base_timestamp, min_timestamp, max_timestamp, }); } + // Validate that base_timestamp values are consistent across journals + // Each journal's base_timestamp should be >= the previous journal's max_timestamp + for i in 1 .. journal_infos.len() { + let prev = &journal_infos[i - 1]; + let curr = &journal_infos[i]; + + if curr.base_timestamp < prev.max_timestamp { + anyhow::bail!( + "Journal {} has base_timestamp {} which is less than previous journal's max_timestamp {}", + i, + curr.base_timestamp, + prev.max_timestamp + ); + } + } + Ok(Self { journals: journal_infos, }) @@ -116,65 +184,6 @@ impl VersionedRecovery { Self::new(journal_slices) } - /// Recover the key-value state at a specific version. - /// - /// This method replays all journal entries from all provided journals up to and including - /// the target version, reconstructing the exact state at that point in time. - /// - /// # Arguments - /// - /// * `target_version` - The version to recover state at - /// - /// # Returns - /// - /// A hashmap containing all key-value pairs with their timestamps as they existed at the - /// target version. 
- /// - /// # Errors - /// - /// Returns an error if: - /// - The target version is not found in any journal - /// - Journal data is corrupted or invalid - pub fn recover_at_version( - &self, - target_version: u64, - ) -> anyhow::Result> { - let mut map = AHashMap::new(); - - // Find all journals that might contain entries up to target version - for journal in &self.journals { - // Skip journals that start after our target - if journal.base_version > target_version { - break; - } - - // Replay entries from this journal - replay_journal_to_version(&journal.data, target_version, &mut map)?; - - // If this journal contains the target version, we're done - if journal.max_version >= target_version { - break; - } - } - - Ok(map) - } - - /// Get the range of versions available in the recovery utility. - /// - /// Returns (`min_version`, `max_version`) tuple representing the earliest and latest - /// versions that can be recovered. - #[must_use] - pub fn version_range(&self) -> Option<(u64, u64)> { - if self.journals.is_empty() { - return None; - } - - let min = self.journals.first().map(|j| j.base_version)?; - let max = self.journals.last().map(|j| j.max_version)?; - Some((min, max)) - } - /// Get the range of timestamps available in the recovery utility. /// /// Returns (`min_timestamp`, `max_timestamp`) tuple representing the earliest and latest @@ -245,7 +254,7 @@ impl VersionedRecovery { Ok(map) } - /// Get the current state (at the latest version). + /// Get the current state (at the latest timestamp). /// /// # Errors /// @@ -254,10 +263,10 @@ impl VersionedRecovery { let mut map = AHashMap::new(); // Optimization: Only read the last journal since journal rotation writes - // the complete state at the snapshot version, so the last journal contains + // the complete state at the snapshot timestamp, so the last journal contains // all current state. if let Some(last_journal) = self.journals.last() { - replay_journal_to_version(&last_journal.data, u64::MAX, &mut map)?; + replay_journal_to_timestamp(&last_journal.data, u64::MAX, &mut map)?; } Ok(map) @@ -306,46 +315,24 @@ fn decompress_if_needed(data: &[u8]) -> anyhow::Result> { anyhow::bail!("Data too small to be valid journal (size: {})", data.len()) } -/// Extract the base version and maximum version from a journal. -fn extract_version_range(buffer: &[u8]) -> anyhow::Result<(u64, u64)> { - let array = read_bonjson_payload(buffer)?; - - // Extract base_version from metadata (default to 1 if not found) - let base_version = if let Value::Array(entries) = &array - && let Some(Value::Object(obj)) = entries.first() - { - read_u64_field(obj, "base_version").unwrap_or(1) - } else { - anyhow::bail!("Failed to extract metadata from journal"); - }; - - // Find the maximum version by scanning all entries - let mut max_version = base_version; - if let Value::Array(entries) = array { - for (index, entry) in entries.iter().enumerate() { - if index == 0 { - continue; // Skip metadata - } - - if let Value::Object(obj) = entry - && let Some(v) = read_u64_field(obj, "v") - { - max_version = max_version.max(v); - } - } - } - - Ok((base_version, max_version)) -} - -/// Extract the minimum and maximum timestamps from a journal. -fn extract_timestamp_range(buffer: &[u8]) -> anyhow::Result<(u64, u64)> { +/// Extract the base timestamp and the minimum/maximum timestamps from a journal. +/// +/// Returns (`base_timestamp`, `min_timestamp`, `max_timestamp`). +/// The `base_timestamp` comes from the metadata, while min/max are computed from actual entries. 
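The `new()` constructor above rejects journal sequences whose `base_timestamp` regresses relative to the previous journal's `max_timestamp`. A standalone sketch of that ordering check, with a hypothetical `JournalSummary` type standing in for the parsed metadata and scanned entry range:

```rust
/// Hypothetical per-journal summary: base_timestamp from metadata plus the
/// min/max entry timestamps computed while scanning the journal.
#[allow(dead_code)]
struct JournalSummary {
    base_timestamp: u64,
    min_timestamp: u64,
    max_timestamp: u64,
}

/// Each journal's base_timestamp must be >= the previous journal's max_timestamp,
/// even though entry timestamp ranges may overlap across adjacent snapshots.
fn validate_ordering(journals: &[JournalSummary]) -> Result<(), String> {
    for i in 1 .. journals.len() {
        let (prev, curr) = (&journals[i - 1], &journals[i]);
        if curr.base_timestamp < prev.max_timestamp {
            return Err(format!(
                "journal {i} has base_timestamp {} < previous max_timestamp {}",
                curr.base_timestamp, prev.max_timestamp
            ));
        }
    }
    Ok(())
}

fn main() {
    // Mirrors the Snapshot 1 / Snapshot 2 timeline from the doc comment above.
    let ok = [
        JournalSummary { base_timestamp: 0, min_timestamp: 100, max_timestamp: 300 },
        JournalSummary { base_timestamp: 300, min_timestamp: 200, max_timestamp: 500 },
    ];
    assert!(validate_ordering(&ok).is_ok());

    // Passing the journals in the wrong order violates the invariant.
    let reversed = [
        JournalSummary { base_timestamp: 300, min_timestamp: 200, max_timestamp: 500 },
        JournalSummary { base_timestamp: 0, min_timestamp: 100, max_timestamp: 300 },
    ];
    assert!(validate_ordering(&reversed).is_err());
}
```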
+fn extract_timestamp_range(buffer: &[u8]) -> anyhow::Result<(u64, u64, u64)> { let array = read_bonjson_payload(buffer)?; + let mut base_timestamp = 0; let mut min_timestamp = u64::MAX; let mut max_timestamp = 0; if let Value::Array(entries) = array { + // First entry is metadata - extract base_timestamp + if let Some(Value::Object(metadata)) = entries.first() { + base_timestamp = read_u64_field(metadata, "base_timestamp").unwrap_or(0); + } + + // Process remaining entries to find min/max timestamps for (index, entry) in entries.iter().enumerate() { if index == 0 { continue; // Skip metadata @@ -360,64 +347,13 @@ fn extract_timestamp_range(buffer: &[u8]) -> anyhow::Result<(u64, u64)> { } } - // If no entries found, default to (0, 0) + // If no entries found, default to (base_timestamp, base_timestamp) if min_timestamp == u64::MAX { - min_timestamp = 0; + min_timestamp = base_timestamp; + max_timestamp = base_timestamp; } - Ok((min_timestamp, max_timestamp)) -} - -/// Replay journal entries up to a target version. -fn replay_journal_to_version( - buffer: &[u8], - target_version: u64, - map: &mut AHashMap, -) -> anyhow::Result<()> { - let array = read_bonjson_payload(buffer)?; - - if let Value::Array(entries) = array { - for (index, entry) in entries.iter().enumerate() { - // Skip metadata (first entry) - if index == 0 { - continue; - } - - if let Value::Object(obj) = entry { - // Check version - let Some(entry_version) = read_u64_field(obj, "v") else { - continue; // Skip entries without version - }; - - // Only apply entries up to target version - if entry_version > target_version { - break; - } - - // Extract timestamp (default to 0 if not found) - let timestamp = read_u64_field(obj, "t").unwrap_or(0); - - // Extract key and operation - if let Some(Value::String(key)) = obj.get("k") - && let Some(operation) = obj.get("o") - { - if operation.is_null() { - map.remove(key); - } else { - map.insert( - key.clone(), - TimestampedValue { - value: operation.clone(), - timestamp, - }, - ); - } - } - } - } - } - - Ok(()) + Ok((base_timestamp, min_timestamp, max_timestamp)) } /// Replay journal entries up to and including the target timestamp. 
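
---

For context on the timestamp-based recovery API this patch settles on, here is a minimal caller-side sketch. It assumes the `VersionedRecovery::new`, `timestamp_range`, `recover_at_timestamp`, and `recover_current` signatures exercised by the recovery tests; the journal file names (`store.jrn`, `store.jrn.t300.zz`) are hypothetical placeholders, and journals are passed oldest first, archived before active:

```rust
use bd_resilient_kv::VersionedRecovery;

fn recover_states(dir: &std::path::Path, targets: &[u64]) -> anyhow::Result<()> {
    // Archived journal first (oldest), active journal last. Names are illustrative.
    let archived = std::fs::read(dir.join("store.jrn.t300.zz"))?;
    let active = std::fs::read(dir.join("store.jrn"))?;

    // Compressed (.zz) archives are detected and decompressed transparently.
    let recovery = VersionedRecovery::new(vec![&archived, &active])?;

    if let Some((min, max)) = recovery.timestamp_range() {
        println!("recoverable timestamp range: [{min}, {max}]");
    }

    for &t in targets {
        // Replays entries with timestamp <= t, following the bucketing model
        // described in the VersionedRecovery docs.
        let state = recovery.recover_at_timestamp(t)?;
        println!("state at {t}: {} keys", state.len());
    }

    // Current state only needs the last journal internally.
    let current = recovery.recover_current()?;
    println!("current state: {} keys", current.len());
    Ok(())
}
```
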
From bcef8191ebe85a0326439a40d4c30dc19628896b Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 19:44:26 -0800 Subject: [PATCH 17/66] simplify tests --- .../src/tests/versioned_recovery_test.rs | 232 +++--------------- 1 file changed, 36 insertions(+), 196 deletions(-) diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index 1727b486..24418cb9 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -12,96 +12,6 @@ use crate::versioned_recovery::VersionedRecovery; use bd_bonjson::Value; use tempfile::TempDir; -#[tokio::test] -async fn test_recovery_single_journal() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - // Create a store and write some versioned data - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - let ts1 = store - .get_with_timestamp("key1") - .map(|tv| tv.timestamp) - .unwrap(); - - std::thread::sleep(std::time::Duration::from_millis(10)); - - store - .insert("key2".to_string(), Value::String("value2".to_string())) - .await?; - let ts2 = store - .get_with_timestamp("key2") - .map(|tv| tv.timestamp) - .unwrap(); - - std::thread::sleep(std::time::Duration::from_millis(10)); - - store - .insert("key1".to_string(), Value::String("updated1".to_string())) - .await?; - let ts3 = store - .get_with_timestamp("key1") - .map(|tv| tv.timestamp) - .unwrap(); - - store.sync()?; - - // Read the journal data - let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - - // Create recovery utility - let recovery = VersionedRecovery::new(vec![&journal_data])?; - - // Verify timestamp range - let timestamp_range = recovery.timestamp_range(); - assert!(timestamp_range.is_some()); - let (min, max) = timestamp_range.unwrap(); - assert!(min <= ts1); - assert!(max >= ts3); - - // Recover at ts1: should have only key1=value1 - let state_ts1 = recovery.recover_at_timestamp(ts1)?; - assert_eq!(state_ts1.len(), 1); - assert_eq!( - state_ts1.get("key1").map(|tv| &tv.value), - Some(&Value::String("value1".to_string())) - ); - - // Recover at ts2: should have key1=value1, key2=value2 - let state_ts2 = recovery.recover_at_timestamp(ts2)?; - assert_eq!(state_ts2.len(), 2); - assert_eq!( - state_ts2.get("key1").map(|tv| &tv.value), - Some(&Value::String("value1".to_string())) - ); - assert_eq!( - state_ts2.get("key2").map(|tv| &tv.value), - Some(&Value::String("value2".to_string())) - ); - - // Recover at ts3: should have key1=updated1, key2=value2 - let state_ts3 = recovery.recover_at_timestamp(ts3)?; - assert_eq!(state_ts3.len(), 2); - assert_eq!( - state_ts3.get("key1").map(|tv| &tv.value), - Some(&Value::String("updated1".to_string())) - ); - assert_eq!( - state_ts3.get("key2").map(|tv| &tv.value), - Some(&Value::String("value2".to_string())) - ); - - // Recover current should match ts3 - let current = recovery.recover_current()?; - assert_eq!(current, state_ts3); - - Ok(()) -} - - #[tokio::test] async fn test_recover_current_only_needs_last_journal() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; @@ -250,95 +160,69 @@ async fn test_detection_compressed_journal() -> anyhow::Result<()> { Ok(()) } -#[test] -fn test_detection_invalid_format_version() { - // Create data with invalid format version (e.g., 999) - let mut invalid_data = vec![0u8; 32]; - let version_bytes = 999u64.to_le_bytes(); - 
invalid_data[0 .. 8].copy_from_slice(&version_bytes); +#[tokio::test] +async fn test_detection_invalid_journal_data() -> anyhow::Result<()> { + let temp_dir = TempDir::new()?; - // Should fail with clear error message about invalid version - let result = VersionedRecovery::new(vec![&invalid_data]); + // Create valid journal for mixed test + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + store + .insert("key1".to_string(), Value::String("value1".to_string())) + .await?; + store.sync()?; + let valid_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + + // Test 1: Invalid format version + let mut invalid_version = vec![0u8; 32]; + let version_bytes = 999u64.to_le_bytes(); + invalid_version[0 .. 8].copy_from_slice(&version_bytes); + let result = VersionedRecovery::new(vec![&invalid_version]); assert!(result.is_err()); - let err_msg = result.unwrap_err().to_string(); assert!( - err_msg.contains("Invalid journal format version"), - "Expected error about invalid version, got: {err_msg}" + result.unwrap_err().to_string().contains("Invalid journal format version"), + "Should fail with invalid version error" ); -} -#[test] -fn test_detection_data_too_small() { - // Data smaller than header size (16 bytes) + // Test 2: Data too small (smaller than header) let small_data = vec![0u8; 8]; - let result = VersionedRecovery::new(vec![&small_data]); assert!(result.is_err()); - let err_msg = result.unwrap_err().to_string(); assert!( - err_msg.contains("Data too small"), - "Expected error about data too small, got: {err_msg}" + result.unwrap_err().to_string().contains("Data too small"), + "Should fail with data too small error" ); -} -#[test] -fn test_detection_empty_data() { + // Test 3: Empty data let empty_data = vec![]; - let result = VersionedRecovery::new(vec![&empty_data]); assert!(result.is_err()); - let err_msg = result.unwrap_err().to_string(); assert!( - err_msg.contains("Data too small"), - "Expected error about data too small, got: {err_msg}" + result.unwrap_err().to_string().contains("Data too small"), + "Should fail with data too small error" ); -} -#[test] -fn test_detection_corrupted_zlib_header() { - // Create data that looks like zlib (starts with 0x78) but is invalid + // Test 4: Corrupted zlib header let mut fake_zlib = vec![0x78, 0x9C]; // Valid zlib magic bytes fake_zlib.extend_from_slice(&[0xFF; 100]); // But garbage data - let result = VersionedRecovery::new(vec![&fake_zlib]); - assert!(result.is_err()); - // Should fail during decompression -} + assert!(result.is_err(), "Should fail with corrupted zlib data"); -#[test] -fn test_detection_random_garbage() { - // Random data that doesn't match any valid format + // Test 5: Random garbage let garbage = vec![0xAB, 0xCD, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x90]; - let result = VersionedRecovery::new(vec![&garbage]); assert!(result.is_err()); let err_msg = result.unwrap_err().to_string(); - // Should try to decompress it and fail - assert!(err_msg.contains("Data too small") || err_msg.contains("corrupt")); -} - -#[tokio::test] -async fn test_detection_mixed_valid_and_invalid() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - // Create valid journal - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - store.sync()?; - - let valid_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + assert!( + err_msg.contains("Data too small") || err_msg.contains("corrupt"), + "Should fail 
with appropriate error" + ); - // Create invalid data - let mut invalid_data = vec![0u8; 32]; + // Test 6: Mixed valid and invalid journals + let mut invalid_mixed = vec![0u8; 32]; let version_bytes = 999u64.to_le_bytes(); - invalid_data[0 .. 8].copy_from_slice(&version_bytes); - - // Should fail if any journal is invalid - let result = VersionedRecovery::new(vec![&valid_data, &invalid_data]); - assert!(result.is_err()); + invalid_mixed[0 .. 8].copy_from_slice(&version_bytes); + let result = VersionedRecovery::new(vec![&valid_data, &invalid_mixed]); + assert!(result.is_err(), "Should fail if any journal is invalid"); Ok(()) } @@ -500,50 +384,6 @@ async fn test_recovery_empty_journal() -> anyhow::Result<()> { Ok(()) } -#[tokio::test] -async fn test_recovery_timestamp_range() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - let ts1 = store - .get_with_timestamp("key1") - .map(|tv| tv.timestamp) - .unwrap(); - - std::thread::sleep(std::time::Duration::from_millis(10)); - - store - .insert("key2".to_string(), Value::String("value2".to_string())) - .await?; - - std::thread::sleep(std::time::Duration::from_millis(10)); - - store - .insert("key3".to_string(), Value::String("value3".to_string())) - .await?; - let ts3 = store - .get_with_timestamp("key3") - .map(|tv| tv.timestamp) - .unwrap(); - - store.sync()?; - - let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - let recovery = VersionedRecovery::new(vec![&journal_data])?; - - let timestamp_range = recovery.timestamp_range(); - assert!(timestamp_range.is_some()); - let (min, max) = timestamp_range.unwrap(); - assert!(min <= ts1); - assert!(max >= ts3); - - Ok(()) -} - #[tokio::test] async fn test_recovery_with_overwrites() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; From fb1b250cd78128df39ed8c340f7525fc1aea24d1 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 19:47:09 -0800 Subject: [PATCH 18/66] use 5 --- bd-resilient-kv/src/kv_journal/memmapped.rs | 15 +++++++- .../src/tests/versioned_recovery_test.rs | 35 ++++++++----------- bd-resilient-kv/src/versioned_kv_store.rs | 2 +- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/bd-resilient-kv/src/kv_journal/memmapped.rs b/bd-resilient-kv/src/kv_journal/memmapped.rs index 4c7cff07..6af76376 100644 --- a/bd-resilient-kv/src/kv_journal/memmapped.rs +++ b/bd-resilient-kv/src/kv_journal/memmapped.rs @@ -118,6 +118,19 @@ impl MemMappedKVJournal { Ok(Self { mmap, in_memory_kv }) } + /// Synchronize changes to disk. + /// + /// This forces any changes in the memory-mapped region to be written to the underlying file. + /// Note that changes are typically synced automatically by the OS, but this provides + /// explicit control when needed. + /// + /// # Errors + /// Returns an error if the sync operation fails. + pub fn sync(&self) -> anyhow::Result<()> { + self.mmap.flush()?; + Ok(()) + } + /// Get the size of the underlying file in bytes. #[must_use] pub fn file_size(&self) -> usize { @@ -232,6 +245,6 @@ impl KVJournal for MemMappedKVJournal { /// # Errors /// Returns an error if the sync operation fails. 
fn sync(&self) -> anyhow::Result<()> { - self.mmap.flush().map_err(Into::into) + self.sync() } } diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index 24418cb9..abe7bdc2 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -228,7 +228,7 @@ async fn test_detection_invalid_journal_data() -> anyhow::Result<()> { } #[test] -fn test_detection_all_zlib_compression_levels() { +fn test_detection_zlib_compression_level_5() { use flate2::Compression; use flate2::write::ZlibEncoder; use std::io::Write; @@ -242,26 +242,19 @@ fn test_detection_all_zlib_compression_levels() { // Some data uncompressed[16 .. 32].copy_from_slice(b"[{\"base_version\""); - // Test different compression levels - for level in [ - Compression::none(), - Compression::fast(), - Compression::default(), - Compression::best(), - ] { - let mut encoder = ZlibEncoder::new(Vec::new(), level); - encoder.write_all(&uncompressed).unwrap(); - let compressed = encoder.finish().unwrap(); - - // Verify it starts with 0x78 - assert_eq!(compressed[0], 0x78); - - // Should be able to detect and decompress - let result = VersionedRecovery::new(vec![&compressed]); - // May succeed or fail depending on whether the data is valid bonjson, - // but should at least attempt decompression without panicking - let _ = result; - } + // Test compression level 5 (what we use in production) + let mut encoder = ZlibEncoder::new(Vec::new(), Compression::new(5)); + encoder.write_all(&uncompressed).unwrap(); + let compressed = encoder.finish().unwrap(); + + // Verify it starts with 0x78 (zlib magic byte) + assert_eq!(compressed[0], 0x78); + + // Should be able to detect and decompress + let result = VersionedRecovery::new(vec![&compressed]); + // May succeed or fail depending on whether the data is valid bonjson, + // but should at least attempt decompression without panicking + let _ = result; } diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index 052a0529..e0cb89b5 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -36,7 +36,7 @@ async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result let source_file = std::fs::File::open(&source)?; let dest_file = std::fs::File::create(&dest)?; - let mut encoder = ZlibEncoder::new(dest_file, Compression::default()); + let mut encoder = ZlibEncoder::new(dest_file, Compression::new(5)); copy(&mut BufReader::new(source_file), &mut encoder)?; encoder.finish()?; Ok::<_, anyhow::Error>(()) From 4fd49479346d8021bf0fc647d59e8cd40d4bb0d4 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 20:09:15 -0800 Subject: [PATCH 19/66] remove base_timestamp, not needed --- bd-resilient-kv/VERSIONED_FORMAT.md | 13 +++-- .../src/kv_journal/memmapped_versioned.rs | 4 +- bd-resilient-kv/src/kv_journal/versioned.rs | 27 +++------- .../src/tests/versioned_recovery_test.rs | 32 ++++++------ bd-resilient-kv/src/versioned_kv_store.rs | 18 +++---- bd-resilient-kv/src/versioned_recovery.rs | 49 ++++--------------- 6 files changed, 48 insertions(+), 95 deletions(-) diff --git a/bd-resilient-kv/VERSIONED_FORMAT.md b/bd-resilient-kv/VERSIONED_FORMAT.md index 18a5e681..d9de93bd 100644 --- a/bd-resilient-kv/VERSIONED_FORMAT.md +++ b/bd-resilient-kv/VERSIONED_FORMAT.md @@ -47,8 +47,7 @@ Previous journals, archived during rotation. 
Each contains complete state at rot ```json { "initialized": , - "format_version": 2, - "base_version": + "format_version": 2 } ``` @@ -74,9 +73,9 @@ Timestamps are monotonically non-decreasing, not strictly increasing. If the sys ## Journal Structure ### Initial Journal -When first created with base version 1: +When first created: ```json -{"initialized": 1699564800000000000, "format_version": 2, "base_version": 1} +{"initialized": 1699564800000000000, "format_version": 2} {"v": 2, "t": 1699564801000000000, "k": "key1", "o": "value1"} {"v": 3, "t": 1699564802000000000, "k": "key2", "o": "value2"} ... @@ -85,7 +84,7 @@ When first created with base version 1: ### Rotated Journal After rotation at version 30000, the new journal contains: ```json -{"initialized": 1699564900000000000, "format_version": 2, "base_version": 30000} +{"initialized": 1699564900000000000, "format_version": 2} {"v": 30000, "t": 1699564800123456789, "k": "key1", "o": "value1"} // Compacted state (original timestamp) {"v": 30000, "t": 1699564850987654321, "k": "key2", "o": "value2"} // Compacted state (original timestamp) {"v": 30000, "t": 1699564875111222333, "k": "key3", "o": "value3"} // Compacted state (original timestamp) @@ -118,10 +117,10 @@ When high water mark is reached at version N: Example: ``` Before rotation at v30000: - my_store.jrn # Active, base_version=20000, contains v20000-v30000 + my_store.jrn # Active, contains v20000-v30000 After rotation: - my_store.jrn # Active, base_version=30000, contains compacted state at v30000 + my_store.jrn # Active, contains compacted state at v30000 my_store.jrn.v30000.zz # Compressed archive, contains v20000-v30000 ``` diff --git a/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs b/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs index 367627c3..6a1ae2e1 100644 --- a/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs +++ b/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs @@ -58,7 +58,6 @@ impl MemMappedVersionedKVJournal { /// # Arguments /// * `file_path` - Path to the file to use for storage /// * `size` - Minimum size of the file in bytes - /// * `base_timestamp` - The starting timestamp for this journal (typically current time) /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 /// /// # Errors @@ -66,7 +65,6 @@ impl MemMappedVersionedKVJournal { pub fn new>( file_path: P, size: usize, - base_timestamp: u64, high_water_mark_ratio: Option, ) -> anyhow::Result { let file = OpenOptions::new() @@ -83,7 +81,7 @@ impl MemMappedVersionedKVJournal { let (mmap, buffer) = unsafe { Self::create_mmap_buffer(file)? }; - let versioned_kv = VersionedKVJournal::new(buffer, base_timestamp, high_water_mark_ratio)?; + let versioned_kv = VersionedKVJournal::new(buffer, high_water_mark_ratio)?; Ok(Self { mmap, versioned_kv }) } diff --git a/bd-resilient-kv/src/kv_journal/versioned.rs b/bd-resilient-kv/src/kv_journal/versioned.rs index c2447a0b..8e4f1506 100644 --- a/bd-resilient-kv/src/kv_journal/versioned.rs +++ b/bd-resilient-kv/src/kv_journal/versioned.rs @@ -142,7 +142,7 @@ fn read_bonjson_payload(buffer: &[u8]) -> anyhow::Result { } /// Create and write the metadata section of a versioned journal. 
-fn write_metadata(buffer: &mut [u8], timestamp: u64, base_timestamp: u64) -> anyhow::Result { +fn write_metadata(buffer: &mut [u8], timestamp: u64) -> anyhow::Result { let buffer_len = buffer.len(); let mut cursor = &mut buffer[METADATA_OFFSET ..]; @@ -150,10 +150,6 @@ fn write_metadata(buffer: &mut [u8], timestamp: u64, base_timestamp: u64) -> any let mut metadata = AHashMap::new(); metadata.insert("initialized".to_string(), Value::Unsigned(timestamp)); metadata.insert("format_version".to_string(), Value::Unsigned(VERSION)); - metadata.insert( - "base_timestamp".to_string(), - Value::Unsigned(base_timestamp), - ); // Write metadata object encode_into_buf(&mut cursor, &Value::Object(metadata)) @@ -210,16 +206,11 @@ impl<'a> VersionedKVJournal<'a> { /// /// # Arguments /// * `buffer` - The storage buffer - /// * `base_timestamp` - The starting timestamp for this journal (typically current time) /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 /// /// # Errors /// Returns an error if the buffer is too small or if `high_water_mark_ratio` is invalid. - pub fn new( - buffer: &'a mut [u8], - base_timestamp: u64, - high_water_mark_ratio: Option, - ) -> anyhow::Result { + pub fn new(buffer: &'a mut [u8], high_water_mark_ratio: Option) -> anyhow::Result { let buffer_len = validate_buffer_len(buffer)?; let high_water_mark = calculate_high_water_mark(buffer_len, high_water_mark_ratio)?; @@ -229,7 +220,7 @@ impl<'a> VersionedKVJournal<'a> { // Write metadata with current timestamp let timestamp = current_timestamp()?; - let position = write_metadata(buffer, timestamp, base_timestamp)?; + let position = write_metadata(buffer, timestamp)?; write_position(buffer, position); write_version(buffer); @@ -240,7 +231,7 @@ impl<'a> VersionedKVJournal<'a> { high_water_mark, high_water_mark_triggered: false, initialized_at_unix_time_ns: timestamp, - last_timestamp: std::cmp::max(timestamp, base_timestamp), + last_timestamp: timestamp, }) } @@ -492,10 +483,10 @@ impl<'a> VersionedKVJournal<'a> { /// - Final state: foo=v3@300, bar=v1@200 /// - rotation_timestamp = 300 (max of all timestamps) /// - /// New journal (created by this function with base_timestamp=300): + /// New journal (created by this function): /// - Compacted entries: foo=v3@300, bar=v1@200 ← Original timestamps preserved! /// - These timestamps (300, 200) may equal/overlap with old journal's range [100, 300] - /// - Future entries will have t >= 300 (enforced by base_timestamp) + /// - Future entries will have t >= 300 (enforced by last_timestamp initialization) /// ``` /// /// ## Design Rationale @@ -514,7 +505,6 @@ impl<'a> VersionedKVJournal<'a> { /// /// # Arguments /// * `buffer` - The buffer to write the new journal to - /// * `base_timestamp` - The starting timestamp for the journal (for monotonic enforcement) /// * `state` - The current key-value state with timestamps to write /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark /// @@ -522,12 +512,11 @@ impl<'a> VersionedKVJournal<'a> { /// Returns an error if serialization fails or buffer is too small. 
pub fn create_rotated_journal( buffer: &'a mut [u8], - base_timestamp: u64, state: &AHashMap, high_water_mark_ratio: Option, ) -> anyhow::Result { - // Create a new journal with the base timestamp - let mut journal = Self::new(buffer, base_timestamp, high_water_mark_ratio)?; + // Create a new journal + let mut journal = Self::new(buffer, high_water_mark_ratio)?; // Find the maximum timestamp in the state to maintain monotonicity let max_state_timestamp = state.values().map(|tv| tv.timestamp).max().unwrap_or(0); diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index abe7bdc2..092ea6e7 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -179,7 +179,10 @@ async fn test_detection_invalid_journal_data() -> anyhow::Result<()> { let result = VersionedRecovery::new(vec![&invalid_version]); assert!(result.is_err()); assert!( - result.unwrap_err().to_string().contains("Invalid journal format version"), + result + .unwrap_err() + .to_string() + .contains("Invalid journal format version"), "Should fail with invalid version error" ); @@ -986,7 +989,7 @@ async fn test_recovery_decompression_transparent() -> anyhow::Result<()> { } #[tokio::test] -async fn test_base_timestamp_validation() -> anyhow::Result<()> { +async fn test_journal_ordering_requirement() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; // Create a store and perform rotation to get proper sequential journals @@ -1030,20 +1033,17 @@ async fn test_base_timestamp_validation() -> anyhow::Result<()> { let recovery = VersionedRecovery::new(vec![&archived_data, &active_data]); assert!(recovery.is_ok(), "Should succeed with correct ordering"); - // Should fail when journals are in wrong order (active before archived) - let recovery_reversed = VersionedRecovery::new(vec![&active_data, &archived_data]); - assert!( - recovery_reversed.is_err(), - "Should fail when base_timestamp ordering is violated" - ); - - let err = recovery_reversed.unwrap_err(); - let err_msg = err.to_string(); - assert!( - err_msg.contains("base_timestamp") && err_msg.contains("max_timestamp"), - "Error should mention base_timestamp and max_timestamp validation, got: {}", - err_msg - ); + // Verify correct ordering produces expected results + let state = recovery?.recover_current()?; + assert_eq!(state.len(), 2); + assert!(state.contains_key("key1")); + assert!(state.contains_key("key2")); + + // Note: Journals with reversed order may not produce correct results + // because recovery replays journals sequentially. Users must provide + // journals in chronological order (oldest to newest). + // The removal of base_timestamp metadata field doesn't change this requirement - + // chronological order is determined by filename timestamps (e.g., store.jrn.t300.zz) Ok(()) } diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index e0cb89b5..4bd22668 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -75,7 +75,7 @@ async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result /// ## Rotation Process /// 1. Computes `rotation_timestamp` = max timestamp of all current entries /// 2. Archives old journal as `.jrn.t.zz` (compressed) -/// 3. Creates new journal with `base_timestamp = rotation_timestamp` +/// 3. Creates new journal with compacted state /// 4. 
Writes compacted state with **original timestamps preserved** /// 5. Continues normal operations in the new journal /// @@ -139,12 +139,12 @@ impl VersionedKVStore { // Try to open existing journal MemMappedVersionedKVJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio) .or_else(|_| { - // Data is corrupt or unreadable, create fresh with base version 1 - MemMappedVersionedKVJournal::new(&journal_path, buffer_size, 1, high_water_mark_ratio) + // Data is corrupt or unreadable, create fresh journal + MemMappedVersionedKVJournal::new(&journal_path, buffer_size, high_water_mark_ratio) })? } else { - // Create new journal with base version 1 - MemMappedVersionedKVJournal::new(&journal_path, buffer_size, 1, high_water_mark_ratio)? + // Create new journal + MemMappedVersionedKVJournal::new(&journal_path, buffer_size, high_water_mark_ratio)? }; let cached_map = journal.as_hashmap_with_timestamps()?; @@ -374,7 +374,7 @@ impl VersionedKVStore { let archived_path = self.generate_archived_path(rotation_timestamp); // Create new journal with rotated state - let new_journal = self.create_rotated_journal(rotation_timestamp).await?; + let new_journal = self.create_rotated_journal().await?; // Replace old journal with new one let old_journal = std::mem::replace(&mut self.journal, new_journal); @@ -413,10 +413,7 @@ impl VersionedKVStore { } /// Create a new rotated journal with compacted state. - async fn create_rotated_journal( - &self, - base_timestamp: u64, - ) -> anyhow::Result { + async fn create_rotated_journal(&self) -> anyhow::Result { // Create temporary journal file let temp_path = self.dir_path.join(format!("{}.jrn.tmp", self.journal_name)); @@ -426,7 +423,6 @@ impl VersionedKVStore { // Use VersionedKVJournal to create rotated journal in memory let _rotated = VersionedKVJournal::create_rotated_journal( &mut buffer, - base_timestamp, &self.cached_map, self.high_water_mark_ratio, )?; diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_recovery.rs index 376aa815..43b13c49 100644 --- a/bd-resilient-kv/src/versioned_recovery.rs +++ b/bd-resilient-kv/src/versioned_recovery.rs @@ -56,7 +56,6 @@ fn read_u64_field(obj: &AHashMap, key: &str) -> Option { /// - **Future-proof**: Preserves historical information that may become useful /// /// Each snapshot has: -/// - `base_timestamp`: Stored in metadata, used for monotonic write enforcement /// - `min_timestamp`: Minimum entry timestamp in the snapshot (from actual entries) /// - `max_timestamp`: Maximum entry timestamp in the snapshot (from actual entries) /// - Filename timestamp: The rotation point (equals `max_timestamp` of archived journal) @@ -64,13 +63,11 @@ fn read_u64_field(obj: &AHashMap, key: &str) -> Option { /// Example timeline: /// ```text /// Snapshot 1: store.jrn.t300.zz -/// - base_timestamp: 0 /// - Entries: foo@100, bar@200, foo@300 /// - min_timestamp: 100, max_timestamp: 300 /// - Range: [100, 300] /// /// Snapshot 2: store.jrn.t500.zz -/// - base_timestamp: 300 (rotation point of Snapshot 1) /// - Compacted entries: foo@300, bar@200 (original timestamps!) 
/// - New entries: baz@400, qux@500 /// - min_timestamp: 200, max_timestamp: 500 @@ -95,9 +92,7 @@ fn read_u64_field(obj: &AHashMap, key: &str) -> Option { /// /// ## Invariants /// -/// - `base_timestamp` values are non-decreasing across snapshots /// - Filename timestamps strictly increase (t300 < t500) -/// - `base_timestamp[i]` >= `max_timestamp[i-1]` (validated in `new()`) /// - Entry timestamp ranges may overlap between adjacent snapshots /// - Sequential replay produces correct state at any timestamp /// @@ -111,7 +106,6 @@ pub struct VersionedRecovery { #[derive(Debug)] struct JournalInfo { data: Vec, - base_timestamp: u64, min_timestamp: u64, max_timestamp: u64, } @@ -132,31 +126,14 @@ impl VersionedRecovery { for data in journals { // Detect and decompress if needed let decompressed = decompress_if_needed(data)?; - let (base_timestamp, min_timestamp, max_timestamp) = extract_timestamp_range(&decompressed)?; + let (min_timestamp, max_timestamp) = extract_timestamp_range(&decompressed)?; journal_infos.push(JournalInfo { data: decompressed, - base_timestamp, min_timestamp, max_timestamp, }); } - // Validate that base_timestamp values are consistent across journals - // Each journal's base_timestamp should be >= the previous journal's max_timestamp - for i in 1 .. journal_infos.len() { - let prev = &journal_infos[i - 1]; - let curr = &journal_infos[i]; - - if curr.base_timestamp < prev.max_timestamp { - anyhow::bail!( - "Journal {} has base_timestamp {} which is less than previous journal's max_timestamp {}", - i, - curr.base_timestamp, - prev.max_timestamp - ); - } - } - Ok(Self { journals: journal_infos, }) @@ -315,24 +292,18 @@ fn decompress_if_needed(data: &[u8]) -> anyhow::Result> { anyhow::bail!("Data too small to be valid journal (size: {})", data.len()) } -/// Extract the base timestamp and the minimum/maximum timestamps from a journal. +/// Extract the minimum/maximum timestamps from a journal. /// -/// Returns (`base_timestamp`, `min_timestamp`, `max_timestamp`). -/// The `base_timestamp` comes from the metadata, while min/max are computed from actual entries. -fn extract_timestamp_range(buffer: &[u8]) -> anyhow::Result<(u64, u64, u64)> { +/// Returns (`min_timestamp`, `max_timestamp`). +/// These are computed from actual entry timestamps in the journal. +fn extract_timestamp_range(buffer: &[u8]) -> anyhow::Result<(u64, u64)> { let array = read_bonjson_payload(buffer)?; - let mut base_timestamp = 0; let mut min_timestamp = u64::MAX; let mut max_timestamp = 0; if let Value::Array(entries) = array { - // First entry is metadata - extract base_timestamp - if let Some(Value::Object(metadata)) = entries.first() { - base_timestamp = read_u64_field(metadata, "base_timestamp").unwrap_or(0); - } - - // Process remaining entries to find min/max timestamps + // Process entries to find min/max timestamps (skip metadata at index 0) for (index, entry) in entries.iter().enumerate() { if index == 0 { continue; // Skip metadata @@ -347,13 +318,13 @@ fn extract_timestamp_range(buffer: &[u8]) -> anyhow::Result<(u64, u64, u64)> { } } - // If no entries found, default to (base_timestamp, base_timestamp) + // If no entries found, default to (0, 0) if min_timestamp == u64::MAX { - min_timestamp = base_timestamp; - max_timestamp = base_timestamp; + min_timestamp = 0; + max_timestamp = 0; } - Ok((base_timestamp, min_timestamp, max_timestamp)) + Ok((min_timestamp, max_timestamp)) } /// Replay journal entries up to and including the target timestamp. 
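
---

With the `base_timestamp` metadata and its ordering validation removed, callers must hand journals to `VersionedRecovery` in chronological order themselves, as the updated test notes. A minimal sketch of deriving that order from archive filenames, assuming the `.jrn.t<timestamp>.zz` naming used by rotation in this series; `archive_timestamp` and `order_archives` are illustrative helpers, not crate API:

```rust
use std::path::PathBuf;

/// Parse the rotation timestamp out of an archived journal name such as
/// "store.jrn.t1699564900000000000.zz". Returns None for the active journal
/// or any file that does not match the pattern. (Illustrative helper.)
fn archive_timestamp(name: &str) -> Option<u64> {
    let stem = name.strip_suffix(".zz")?;
    let (_, ts) = stem.rsplit_once(".t")?;
    ts.parse().ok()
}

/// Sort archived journals oldest-to-newest by filename timestamp so they can
/// be passed to VersionedRecovery::new ahead of the active journal.
fn order_archives(mut archives: Vec<PathBuf>) -> Vec<PathBuf> {
    archives.sort_by_key(|p| {
        p.file_name()
            .and_then(|n| n.to_str())
            .and_then(archive_timestamp)
            .unwrap_or(u64::MAX)
    });
    archives
}
```

The ordered archive paths can then be read and passed to `VersionedRecovery::new`, with the active journal appended last.
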
From 127f8ada4ef704604991df68bc56a83e54d9b677 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 20:15:55 -0800 Subject: [PATCH 20/66] remove cleanup for now --- bd-resilient-kv/AGENTS.md | 6 - bd-resilient-kv/README.md | 81 ----- bd-resilient-kv/src/lib.rs | 2 - bd-resilient-kv/src/snapshot_cleanup.rs | 253 ------------- bd-resilient-kv/src/tests/mod.rs | 1 - .../src/tests/snapshot_cleanup_test.rs | 332 ------------------ 6 files changed, 675 deletions(-) delete mode 100644 bd-resilient-kv/src/snapshot_cleanup.rs delete mode 100644 bd-resilient-kv/src/tests/snapshot_cleanup_test.rs diff --git a/bd-resilient-kv/AGENTS.md b/bd-resilient-kv/AGENTS.md index 6dc6e394..2ff31a1b 100644 --- a/bd-resilient-kv/AGENTS.md +++ b/bd-resilient-kv/AGENTS.md @@ -97,12 +97,6 @@ The `VersionedRecovery` utility provides point-in-time recovery capabilities for **Recovery Optimization**: The `recover_current()` method in `VersionedRecovery` is optimized to only read the last journal rather than replaying all journals from the beginning. This is possible because journal rotation writes the complete current state into the new journal at the snapshot version, so the last journal alone contains the full current state. For historical version recovery, `recover_at_version()` intelligently selects and replays only the necessary journals. -**Snapshot Cleanup**: -The `SnapshotCleanup` utility provides async methods for managing archived journal snapshots: -- All cleanup operations are async and require a Tokio runtime -- `list_snapshots()`, `cleanup_before_version()`, `cleanup_keep_recent()` are all async -- Enables efficient disk space management without blocking operations - ## Critical Design Insights ### 1. Two Storage Models diff --git a/bd-resilient-kv/README.md b/bd-resilient-kv/README.md index 6bf44fa3..571239fa 100644 --- a/bd-resilient-kv/README.md +++ b/bd-resilient-kv/README.md @@ -582,87 +582,6 @@ async fn main() -> anyhow::Result<()> { - Only archived journals are compressed during rotation (asynchronously) - No configuration needed - compression is automatic -### Snapshot Cleanup Management - -**SnapshotCleanup** provides utilities for managing disk space by cleaning up old archived journals. Its methods are async and require a Tokio runtime. - -```rust -use bd_resilient_kv::SnapshotCleanup; - -#[tokio::main] -async fn main() -> anyhow::Result<()> { - // Create cleanup utility for your journal - let cleanup = SnapshotCleanup::new("my_store.jrn")?; - - // List all archived snapshots (async) - let snapshots = cleanup.list_snapshots().await?; - for snapshot in &snapshots { - println!("Version: {}, Size: {} bytes, Path: {:?}", - snapshot.version, snapshot.size_bytes, snapshot.path); - } - - // Strategy 1: Remove snapshots older than a specific version (async) - // (e.g., your system determined you need to keep data back to version 5000) - let removed = cleanup.cleanup_before_version(5000).await?; - println!("Removed {} old snapshots", removed.len()); - - // Strategy 2: Keep only the N most recent snapshots (async) - let removed = cleanup.cleanup_keep_recent(10).await?; - println!("Removed {} snapshots, kept 10 most recent", removed.len()); - - // Check disk usage (async) - let total_size = cleanup.total_snapshot_size().await?; - println!("Total snapshot size: {} bytes", total_size); - - // Get version range (async) - if let Some(oldest) = cleanup.oldest_snapshot_version().await? { - if let Some(newest) = cleanup.newest_snapshot_version().await? 
{ - println!("Snapshots range from version {} to {}", oldest, newest); - } - } - - Ok(()) -} -``` - -**Key Features**: -- **Async operations**: All methods are async and require a Tokio runtime -- **Version-based cleanup**: Remove snapshots before a specific version -- **Count-based cleanup**: Keep only N most recent snapshots -- **Safe operations**: Only removes compressed archives (`.zz` files), never active journals -- **Disk space monitoring**: Query total size and version ranges -- **Per-journal isolation**: Each cleanup instance only manages its own journal's snapshots - -**Integration with VersionedKVStore**: -```rust -use bd_resilient_kv::{VersionedKVStore, SnapshotCleanup}; -use bd_bonjson::Value; - -#[tokio::main] -async fn main() -> anyhow::Result<()> { - // Your application logic determines minimum required version - let min_version_from_external_system = get_minimum_required_version(); - - // Create store - let mut store = VersionedKVStore::new(".", "my_store", 1024 * 1024, None)?; - - // Perform operations... (async) - store.insert("key".to_string(), Value::from(42)).await?; - - // Periodically clean up old snapshots (async) - let cleanup = SnapshotCleanup::new("my_store.jrn")?; - cleanup.cleanup_before_version(min_version_from_external_system).await?; - - Ok(()) -} - -fn get_minimum_required_version() -> u64 { - // Your external system (e.g., backup service, replication manager) - // tells you how far back you need to maintain history - 5000 -} -``` - ### Custom Buffer Sizes Choose buffer sizes based on your use case: diff --git a/bd-resilient-kv/src/lib.rs b/bd-resilient-kv/src/lib.rs index a2195527..2b4c1732 100644 --- a/bd-resilient-kv/src/lib.rs +++ b/bd-resilient-kv/src/lib.rs @@ -19,7 +19,6 @@ mod tests; pub mod kv_journal; pub mod kv_store; -pub mod snapshot_cleanup; pub mod versioned_kv_store; pub mod versioned_recovery; @@ -32,6 +31,5 @@ pub use kv_journal::{ VersionedKVJournal, }; pub use kv_store::KVStore; -pub use snapshot_cleanup::{SnapshotCleanup, SnapshotInfo}; pub use versioned_kv_store::{RotationCallback, VersionedKVStore}; pub use versioned_recovery::VersionedRecovery; diff --git a/bd-resilient-kv/src/snapshot_cleanup.rs b/bd-resilient-kv/src/snapshot_cleanup.rs deleted file mode 100644 index 4b3668af..00000000 --- a/bd-resilient-kv/src/snapshot_cleanup.rs +++ /dev/null @@ -1,253 +0,0 @@ -// shared-core - bitdrift's common client/server libraries -// Copyright Bitdrift, Inc. All rights reserved. -// -// Use of this source code is governed by a source available license that can be found in the -// LICENSE file or at: -// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt - -use std::path::{Path, PathBuf}; - -/// Information about an archived journal snapshot. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct SnapshotInfo { - /// Full path to the snapshot file - pub path: PathBuf, - /// Version number extracted from the snapshot filename - pub version: u64, - /// File size in bytes - pub size_bytes: u64, -} - -/// A utility for managing cleanup of archived journal snapshots. -/// -/// `SnapshotCleanup` provides functionality to discover and remove old archived journals -/// based on version thresholds. This is useful for managing disk space when your system -/// determines how far back in history you need to maintain snapshots. 
-/// -/// Archived journals follow the naming pattern: `{base_name}.v{version}.zz` -/// For example: `my_store.jrn.v1000.zz`, `my_store.jrn.v2000.zz` -/// -/// # Example -/// ```ignore -/// use bd_resilient_kv::SnapshotCleanup; -/// -/// // Create cleanup utility for a journal -/// let cleanup = SnapshotCleanup::new("my_store.jrn")?; -/// -/// // List all archived snapshots -/// let snapshots = cleanup.list_snapshots()?; -/// for snapshot in &snapshots { -/// println!("Version: {}, Size: {} bytes", snapshot.version, snapshot.size_bytes); -/// } -/// -/// // Remove snapshots older than version 5000 -/// let removed = cleanup.cleanup_before_version(5000)?; -/// println!("Removed {} snapshots", removed.len()); -/// ``` -pub struct SnapshotCleanup { - directory: PathBuf, - base_filename: String, -} - -impl SnapshotCleanup { - /// Create a new `SnapshotCleanup` utility for the given journal path. - /// - /// The journal path should be the same path used to create the `VersionedKVStore`. - /// For example, if you created your store with `"my_store.jrn"`, pass the same path here. - /// - /// # Arguments - /// * `journal_path` - Path to the journal file (e.g., "`my_store.jrn`") - /// - /// # Errors - /// Returns an error if the path is invalid or cannot be canonicalized. - pub fn new>(journal_path: P) -> anyhow::Result { - let path = journal_path.as_ref(); - - let directory = path - .parent() - .map_or_else(|| PathBuf::from("."), std::path::Path::to_path_buf); - - let base_filename = path - .file_name() - .ok_or_else(|| anyhow::anyhow!("Invalid journal path: no filename"))? - .to_string_lossy() - .to_string(); - - Ok(Self { - directory, - base_filename, - }) - } - - /// List all archived snapshots for this journal. - /// - /// Returns a vector of `SnapshotInfo` containing details about each archived journal, - /// sorted by version number in ascending order. - /// - /// # Errors - /// Returns an error if the directory cannot be read or if file metadata cannot be accessed. - pub async fn list_snapshots(&self) -> anyhow::Result> { - let mut snapshots = Vec::new(); - - // Read directory entries - if !self.directory.exists() { - return Ok(snapshots); - } - - let mut entries = tokio::fs::read_dir(&self.directory).await?; - - while let Some(entry) = entries.next_entry().await? { - let path = entry.path(); - - // Check if this is an archived snapshot for our journal - if let Some(version) = self.extract_version_from_path(&path) { - let metadata = entry.metadata().await?; - snapshots.push(SnapshotInfo { - path: path.clone(), - version, - size_bytes: metadata.len(), - }); - } - } - - // Sort by version number - snapshots.sort_by_key(|s| s.version); - - Ok(snapshots) - } - - /// Remove all archived snapshots with versions strictly less than the specified version. - /// - /// This keeps snapshots at or after the minimum version, removing only older ones. - /// - /// # Arguments - /// * `min_version` - Minimum version to keep (exclusive). Snapshots with versions less than this - /// will be removed. - /// - /// # Returns - /// Returns a vector of `SnapshotInfo` for the snapshots that were removed. - /// - /// # Errors - /// Returns an error if any snapshot cannot be removed. If an error occurs while removing - /// a snapshot, the operation stops and returns the error. Some snapshots may have been - /// removed before the error occurred. 
- /// - /// # Example - /// ```ignore - /// // Keep snapshots at version 5000 and later, remove older ones - /// let removed = cleanup.cleanup_before_version(5000)?; - /// ``` - pub async fn cleanup_before_version( - &self, - min_version: u64, - ) -> anyhow::Result> { - let snapshots = self.list_snapshots().await?; - let mut removed = Vec::new(); - - for snapshot in snapshots { - if snapshot.version < min_version { - tokio::fs::remove_file(&snapshot.path).await?; - removed.push(snapshot); - } - } - - Ok(removed) - } - - /// Remove all archived snapshots except the most recent N versions. - /// - /// This keeps the N newest snapshots and removes all older ones. - /// - /// # Arguments - /// * `keep_count` - Number of most recent snapshots to keep - /// - /// # Returns - /// Returns a vector of `SnapshotInfo` for the snapshots that were removed. - /// - /// # Errors - /// Returns an error if any snapshot cannot be removed. - /// - /// # Example - /// ```ignore - /// // Keep only the 5 most recent snapshots - /// let removed = cleanup.cleanup_keep_recent(5)?; - /// ``` - pub async fn cleanup_keep_recent(&self, keep_count: usize) -> anyhow::Result> { - let mut snapshots = self.list_snapshots().await?; - - if snapshots.len() <= keep_count { - return Ok(Vec::new()); - } - - // Sort by version descending to get most recent first - snapshots.sort_by_key(|s| std::cmp::Reverse(s.version)); - - // Remove all except the most recent keep_count - let mut removed = Vec::new(); - for snapshot in snapshots.into_iter().skip(keep_count) { - tokio::fs::remove_file(&snapshot.path).await?; - removed.push(snapshot); - } - - // Sort removed list by version ascending for consistency - removed.sort_by_key(|s| s.version); - - Ok(removed) - } - - /// Calculate the total disk space used by all archived snapshots. - /// - /// # Errors - /// Returns an error if snapshots cannot be listed. - pub async fn total_snapshot_size(&self) -> anyhow::Result { - let snapshots = self.list_snapshots().await?; - Ok(snapshots.iter().map(|s| s.size_bytes).sum()) - } - - /// Get the oldest snapshot version. - /// - /// Returns `None` if there are no archived snapshots. - /// - /// # Errors - /// Returns an error if snapshots cannot be listed. - pub async fn oldest_snapshot_version(&self) -> anyhow::Result> { - let snapshots = self.list_snapshots().await?; - Ok(snapshots.first().map(|s| s.version)) - } - - /// Get the newest snapshot version. - /// - /// Returns `None` if there are no archived snapshots. - /// - /// # Errors - /// Returns an error if snapshots cannot be listed. - pub async fn newest_snapshot_version(&self) -> anyhow::Result> { - let snapshots = self.list_snapshots().await?; - Ok(snapshots.last().map(|s| s.version)) - } - - /// Extract version number from an archived journal path. - /// - /// Returns `Some(version)` if the path matches the pattern `{base_name}.v{version}.zz`, - /// otherwise returns `None`. 
- fn extract_version_from_path(&self, path: &Path) -> Option { - let filename = path.file_name()?.to_string_lossy(); - - // Check if filename starts with our base filename - if !filename.starts_with(&self.base_filename) { - return None; - } - - // Pattern: {base_filename}.v{version}.zz - let suffix = filename.strip_prefix(&self.base_filename)?; - - // Should start with ".v" - let version_part = suffix.strip_prefix(".v")?; - - // Should end with ".zz" - let version_str = version_part.strip_suffix(".zz")?; - - // Parse version number - version_str.parse::().ok() - } -} diff --git a/bd-resilient-kv/src/tests/mod.rs b/bd-resilient-kv/src/tests/mod.rs index 3972eea9..7d838890 100644 --- a/bd-resilient-kv/src/tests/mod.rs +++ b/bd-resilient-kv/src/tests/mod.rs @@ -33,6 +33,5 @@ pub mod error_handling_test; pub mod kv_store_test; pub mod kv_test; pub mod memmapped_test; -pub mod snapshot_cleanup_test; pub mod versioned_kv_store_test; pub mod versioned_recovery_test; diff --git a/bd-resilient-kv/src/tests/snapshot_cleanup_test.rs b/bd-resilient-kv/src/tests/snapshot_cleanup_test.rs deleted file mode 100644 index 2af2bff6..00000000 --- a/bd-resilient-kv/src/tests/snapshot_cleanup_test.rs +++ /dev/null @@ -1,332 +0,0 @@ -// shared-core - bitdrift's common client/server libraries -// Copyright Bitdrift, Inc. All rights reserved. -// -// Use of this source code is governed by a source available license that can be found in the -// LICENSE file or at: -// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt - -use crate::snapshot_cleanup::{SnapshotCleanup, SnapshotInfo}; -use std::fs; -use tempfile::TempDir; - -fn create_mock_snapshot(dir: &TempDir, base_name: &str, version: u64, size: usize) -> SnapshotInfo { - let filename = format!("{}.v{}.zz", base_name, version); - let path = dir.path().join(&filename); - - // Create file with specified size - let data = vec![0u8; size]; - fs::write(&path, data).unwrap(); - - SnapshotInfo { - path, - version, - size_bytes: size as u64, - } -} - -#[tokio::test] -async fn list_snapshots_empty_directory() { - let temp_dir = TempDir::new().unwrap(); - let journal_path = temp_dir.path().join("test.jrn"); - - let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - let snapshots = cleanup.list_snapshots().await.unwrap(); - - assert!(snapshots.is_empty()); -} - -#[tokio::test] -async fn list_snapshots_with_multiple_versions() { - let temp_dir = TempDir::new().unwrap(); - let journal_path = temp_dir.path().join("test.jrn"); - - // Create snapshots with different versions - create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); - create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); - create_mock_snapshot(&temp_dir, "test.jrn", 1500, 150); - create_mock_snapshot(&temp_dir, "test.jrn", 3000, 300); - - let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - let snapshots = cleanup.list_snapshots().await.unwrap(); - - assert_eq!(snapshots.len(), 4); - // Should be sorted by version - assert_eq!(snapshots[0].version, 1000); - assert_eq!(snapshots[1].version, 1500); - assert_eq!(snapshots[2].version, 2000); - assert_eq!(snapshots[3].version, 3000); - - // Verify sizes - assert_eq!(snapshots[0].size_bytes, 100); - assert_eq!(snapshots[1].size_bytes, 150); - assert_eq!(snapshots[2].size_bytes, 200); - assert_eq!(snapshots[3].size_bytes, 300); -} - -#[tokio::test] -async fn list_snapshots_ignores_other_files() { - let temp_dir = TempDir::new().unwrap(); - let journal_path = temp_dir.path().join("test.jrn"); - - // Create valid snapshots - 
create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); - create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); - - // Create files that should be ignored - fs::write(temp_dir.path().join("test.jrn"), b"active journal").unwrap(); - fs::write(temp_dir.path().join("other.jrn.v1000.zz"), b"other journal").unwrap(); - fs::write(temp_dir.path().join("test.jrn.v1000"), b"uncompressed").unwrap(); - fs::write(temp_dir.path().join("test.jrn.backup"), b"backup").unwrap(); - - let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - let snapshots = cleanup.list_snapshots().await.unwrap(); - - assert_eq!(snapshots.len(), 2); - assert_eq!(snapshots[0].version, 1000); - assert_eq!(snapshots[1].version, 2000); -} - -#[tokio::test] -async fn cleanup_before_version_removes_old_snapshots() { - let temp_dir = TempDir::new().unwrap(); - let journal_path = temp_dir.path().join("test.jrn"); - - create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); - create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); - create_mock_snapshot(&temp_dir, "test.jrn", 3000, 300); - create_mock_snapshot(&temp_dir, "test.jrn", 4000, 400); - - let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - - // Remove snapshots before version 3000 (keep 3000 and 4000) - let removed = cleanup.cleanup_before_version(3000).await.unwrap(); - - assert_eq!(removed.len(), 2); - assert_eq!(removed[0].version, 1000); - assert_eq!(removed[1].version, 2000); - - // Verify remaining snapshots - let remaining = cleanup.list_snapshots().await.unwrap(); - assert_eq!(remaining.len(), 2); - assert_eq!(remaining[0].version, 3000); - assert_eq!(remaining[1].version, 4000); - - // Verify files are actually deleted - assert!(!removed[0].path.exists()); - assert!(!removed[1].path.exists()); - assert!(remaining[0].path.exists()); - assert!(remaining[1].path.exists()); -} - -#[tokio::test] -async fn cleanup_before_version_keeps_all_if_min_version_too_low() { - let temp_dir = TempDir::new().unwrap(); - let journal_path = temp_dir.path().join("test.jrn"); - - create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); - create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); - - let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - - // Min version is lower than all snapshots - let removed = cleanup.cleanup_before_version(500).await.unwrap(); - - assert!(removed.is_empty()); - - let remaining = cleanup.list_snapshots().await.unwrap(); - assert_eq!(remaining.len(), 2); -} - -#[tokio::test] -async fn cleanup_before_version_removes_all_if_min_version_too_high() { - let temp_dir = TempDir::new().unwrap(); - let journal_path = temp_dir.path().join("test.jrn"); - - create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); - create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); - - let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - - // Min version is higher than all snapshots - let removed = cleanup.cleanup_before_version(5000).await.unwrap(); - - assert_eq!(removed.len(), 2); - - let remaining = cleanup.list_snapshots().await.unwrap(); - assert!(remaining.is_empty()); -} - -#[tokio::test] -async fn cleanup_keep_recent_removes_old_snapshots() { - let temp_dir = TempDir::new().unwrap(); - let journal_path = temp_dir.path().join("test.jrn"); - - create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); - create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); - create_mock_snapshot(&temp_dir, "test.jrn", 3000, 300); - create_mock_snapshot(&temp_dir, "test.jrn", 4000, 400); - create_mock_snapshot(&temp_dir, "test.jrn", 5000, 500); - - let cleanup 
= SnapshotCleanup::new(&journal_path).unwrap(); - - // Keep only the 2 most recent snapshots - let removed = cleanup.cleanup_keep_recent(2).await.unwrap(); - - assert_eq!(removed.len(), 3); - assert_eq!(removed[0].version, 1000); - assert_eq!(removed[1].version, 2000); - assert_eq!(removed[2].version, 3000); - - // Verify remaining snapshots - let remaining = cleanup.list_snapshots().await.unwrap(); - assert_eq!(remaining.len(), 2); - assert_eq!(remaining[0].version, 4000); - assert_eq!(remaining[1].version, 5000); -} - -#[tokio::test] -async fn cleanup_keep_recent_keeps_all_if_count_too_high() { - let temp_dir = TempDir::new().unwrap(); - let journal_path = temp_dir.path().join("test.jrn"); - - create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); - create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); - - let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - - // Keep count is higher than total snapshots - let removed = cleanup.cleanup_keep_recent(5).await.unwrap(); - - assert!(removed.is_empty()); - - let remaining = cleanup.list_snapshots().await.unwrap(); - assert_eq!(remaining.len(), 2); -} - -#[tokio::test] -async fn cleanup_keep_recent_with_zero_removes_all() { - let temp_dir = TempDir::new().unwrap(); - let journal_path = temp_dir.path().join("test.jrn"); - - create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); - create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); - - let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - - let removed = cleanup.cleanup_keep_recent(0).await.unwrap(); - - assert_eq!(removed.len(), 2); - - let remaining = cleanup.list_snapshots().await.unwrap(); - assert!(remaining.is_empty()); -} - -#[tokio::test] -async fn total_snapshot_size_calculates_correctly() { - let temp_dir = TempDir::new().unwrap(); - let journal_path = temp_dir.path().join("test.jrn"); - - create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); - create_mock_snapshot(&temp_dir, "test.jrn", 2000, 250); - create_mock_snapshot(&temp_dir, "test.jrn", 3000, 150); - - let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - let total_size = cleanup.total_snapshot_size().await.unwrap(); - - assert_eq!(total_size, 500); // 100 + 250 + 150 -} - -#[tokio::test] -async fn total_snapshot_size_returns_zero_for_empty() { - let temp_dir = TempDir::new().unwrap(); - let journal_path = temp_dir.path().join("test.jrn"); - - let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - let total_size = cleanup.total_snapshot_size().await.unwrap(); - - assert_eq!(total_size, 0); -} - -#[tokio::test] -async fn oldest_and_newest_snapshot_versions() { - let temp_dir = TempDir::new().unwrap(); - let journal_path = temp_dir.path().join("test.jrn"); - - create_mock_snapshot(&temp_dir, "test.jrn", 2000, 200); - create_mock_snapshot(&temp_dir, "test.jrn", 1000, 100); - create_mock_snapshot(&temp_dir, "test.jrn", 3000, 300); - - let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - - assert_eq!(cleanup.oldest_snapshot_version().await.unwrap(), Some(1000)); - assert_eq!(cleanup.newest_snapshot_version().await.unwrap(), Some(3000)); -} - -#[tokio::test] -async fn oldest_and_newest_return_none_for_empty() { - let temp_dir = TempDir::new().unwrap(); - let journal_path = temp_dir.path().join("test.jrn"); - - let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - - assert_eq!(cleanup.oldest_snapshot_version().await.unwrap(), None); - assert_eq!(cleanup.newest_snapshot_version().await.unwrap(), None); -} - -#[tokio::test] -async fn works_with_subdirectory_paths() { - let 
temp_dir = TempDir::new().unwrap(); - let subdir = temp_dir.path().join("data"); - fs::create_dir(&subdir).unwrap(); - - let journal_path = subdir.join("store.jrn"); - - // Create snapshots in subdirectory - let filename1 = format!("store.jrn.v{}.zz", 1000); - let path1 = subdir.join(&filename1); - fs::write(&path1, vec![0u8; 100]).unwrap(); - - let filename2 = format!("store.jrn.v{}.zz", 2000); - let path2 = subdir.join(&filename2); - fs::write(&path2, vec![0u8; 200]).unwrap(); - - let cleanup = SnapshotCleanup::new(&journal_path).unwrap(); - let snapshots = cleanup.list_snapshots().await.unwrap(); - - assert_eq!(snapshots.len(), 2); - assert_eq!(snapshots[0].version, 1000); - assert_eq!(snapshots[1].version, 2000); -} - -#[tokio::test] -async fn cleanup_with_different_base_names() { - let temp_dir = TempDir::new().unwrap(); - - // Create snapshots for different journals - create_mock_snapshot(&temp_dir, "journal_a.jrn", 1000, 100); - create_mock_snapshot(&temp_dir, "journal_a.jrn", 2000, 200); - create_mock_snapshot(&temp_dir, "journal_b.jrn", 1000, 100); - create_mock_snapshot(&temp_dir, "journal_b.jrn", 2000, 200); - - // Cleanup for journal_a should only see journal_a snapshots - let cleanup_a = SnapshotCleanup::new(temp_dir.path().join("journal_a.jrn")).unwrap(); - let snapshots_a = cleanup_a.list_snapshots().await.unwrap(); - - assert_eq!(snapshots_a.len(), 2); - assert!( - snapshots_a - .iter() - .all(|s| s.path.to_string_lossy().contains("journal_a")) - ); - - // Cleanup for journal_b should only see journal_b snapshots - let cleanup_b = SnapshotCleanup::new(temp_dir.path().join("journal_b.jrn")).unwrap(); - let snapshots_b = cleanup_b.list_snapshots().await.unwrap(); - - assert_eq!(snapshots_b.len(), 2); - assert!( - snapshots_b - .iter() - .all(|s| s.path.to_string_lossy().contains("journal_b")) - ); -} From f00945c0ca887d3007a1ca42628aa013c69b49ea Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 20:21:53 -0800 Subject: [PATCH 21/66] remove rotation callback --- bd-resilient-kv/AGENTS.md | 2 - bd-resilient-kv/README.md | 42 +--------- bd-resilient-kv/src/lib.rs | 2 +- .../src/tests/versioned_kv_store_test.rs | 83 ------------------- bd-resilient-kv/src/versioned_kv_store.rs | 28 ------- 5 files changed, 2 insertions(+), 155 deletions(-) diff --git a/bd-resilient-kv/AGENTS.md b/bd-resilient-kv/AGENTS.md index 2ff31a1b..8f6777f8 100644 --- a/bd-resilient-kv/AGENTS.md +++ b/bd-resilient-kv/AGENTS.md @@ -74,7 +74,6 @@ The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJ - Current state is compacted into a new journal as versioned entries - Old journal is archived with `.v{version}.zz` suffix - Archived journals are automatically compressed using zlib (RFC 1950, level 3) asynchronously -- Optional callback invoked with archived path and version - Application controls upload/cleanup of archived journals **Rotation Guarantees**: @@ -123,7 +122,6 @@ The `recover_current()` method in `VersionedRecovery` is optimized to only read **Key Differences**: - **KVStore**: Switches between two buffers, old buffer is reset and reused - **VersionedKVStore**: Archives old journal with `.v{version}` suffix, creates new journal -- **Callback**: Only `VersionedKVStore` supports rotation callbacks for upload/cleanup - **Version Preservation**: Archived journals preserve complete history for recovery **When Rotation Occurs**: diff --git a/bd-resilient-kv/README.md b/bd-resilient-kv/README.md index 571239fa..68855d38 100644 --- a/bd-resilient-kv/README.md +++ 
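
---

Since this patch drops `SnapshotCleanup`, retention of archived journals is now entirely the application's responsibility. Below is a minimal application-side sketch in the spirit of the removed `cleanup_keep_recent`, assuming the `.jrn.t<timestamp>.zz` archive naming used by `VersionedKVStore` rotation; it is illustrative only, not crate API:

```rust
use std::path::{Path, PathBuf};

/// Keep only the `keep` most recent compressed archives for `journal_name`
/// in `dir`, deleting the rest. Never touches the active "<name>.jrn" file.
fn keep_recent_archives(dir: &Path, journal_name: &str, keep: usize) -> std::io::Result<()> {
    let prefix = format!("{journal_name}.jrn.t");

    // Collect (timestamp, path) pairs for matching archives.
    let mut archives: Vec<(u64, PathBuf)> = Vec::new();
    for entry in std::fs::read_dir(dir)? {
        let path = entry?.path();
        let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
            continue;
        };
        let Some(ts) = name
            .strip_prefix(prefix.as_str())
            .and_then(|rest| rest.strip_suffix(".zz"))
            .and_then(|ts| ts.parse::<u64>().ok())
        else {
            continue;
        };
        archives.push((ts, path));
    }

    // Newest first; delete everything past the first `keep` entries.
    archives.sort_by_key(|(ts, _)| std::cmp::Reverse(*ts));
    for (_, path) in archives.into_iter().skip(keep) {
        std::fs::remove_file(path)?;
    }
    Ok(())
}
```
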
b/bd-resilient-kv/README.md @@ -35,7 +35,7 @@ bd-bonjson = { path = "path/to/bd-bonjson" } **VersionedKVStore**: Use when you need version tracking - Best for: Audit logs, state history, remote backup -- Features: Every write operation returns a version number, automatic rotation with callbacks +- Features: Every write operation returns a version number, automatic rotation - See: [VERSIONED_FORMAT.md](./VERSIONED_FORMAT.md) for detailed format documentation ### Basic Usage @@ -224,46 +224,6 @@ async fn main() -> anyhow::Result<()> { } ``` -### Versioned Store with Rotation Callback - -Monitor journal rotation events for remote backup or cleanup: - -```rust -use bd_resilient_kv::{VersionedKVStore, RotationCallback}; -use bd_bonjson::Value; - -fn upload_to_remote(path: &std::path::Path, version: u64) { - println!("Uploading archived journal {:?} at version {}", path, version); - // Upload to S3, backup server, etc. -} - -#[tokio::main] -async fn main() -> anyhow::Result<()> { - let callback: RotationCallback = Box::new(|archived_path, _new_path, version| { - upload_to_remote(archived_path, version); - }); - - let mut store = VersionedKVStore::new( - ".", // Directory path - "my_store", // Journal name - 512 * 1024, // 512KB rotation threshold - Some(callback) - )?; - - // When high water mark is reached during insert/remove, - // the callback will be invoked with archived journal path - for i in 0..10000 { - store.insert(format!("key_{}", i), Value::Integer(i as i64)).await?; - // Automatic rotation happens when journal exceeds 512KB - } - - // Manual rotation is also supported (also async) - store.rotate_journal().await?; - - Ok(()) -} -``` - ### Key Features of VersionedKVStore - **Async API**: Write operations (`insert()`, `remove()`, `rotate_journal()`) are async and require a Tokio runtime diff --git a/bd-resilient-kv/src/lib.rs b/bd-resilient-kv/src/lib.rs index 2b4c1732..17d70bd0 100644 --- a/bd-resilient-kv/src/lib.rs +++ b/bd-resilient-kv/src/lib.rs @@ -31,5 +31,5 @@ pub use kv_journal::{ VersionedKVJournal, }; pub use kv_store::KVStore; -pub use versioned_kv_store::{RotationCallback, VersionedKVStore}; +pub use versioned_kv_store::VersionedKVStore; pub use versioned_recovery::VersionedRecovery; diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index da9254c7..0f98f0b2 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -9,7 +9,6 @@ use crate::VersionedKVStore; use bd_bonjson::Value; -use std::sync::{Arc, Mutex}; use tempfile::TempDir; #[test] @@ -183,49 +182,6 @@ async fn test_null_value_is_deletion() -> anyhow::Result<()> { Ok(()) } -#[tokio::test] -async fn test_rotation_callback() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - // Use a small buffer and low high water mark to trigger rotation easily - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 1024, Some(0.3))?; - - // Set up callback to track rotation events - let callback_data = Arc::new(Mutex::new(Vec::new())); - let callback_data_clone = Arc::clone(&callback_data); - - store.set_rotation_callback(Box::new(move |old_path, new_path, timestamp| { - let mut data = callback_data_clone.lock().unwrap(); - data.push((old_path.to_path_buf(), new_path.to_path_buf(), timestamp)); - })); - - // Write enough data to trigger rotation - let mut last_timestamp = 0; - for i in 0 .. 
100 { - let key = format!("key{}", i); - let value = Value::String(format!("value_{}_with_some_extra_padding", i)); - last_timestamp = store.insert(key, value).await?; - - // Rotation happens automatically inside insert when high water mark is triggered - let data = callback_data.lock().unwrap(); - if !data.is_empty() { - break; - } - } - - // Check that callback was invoked - let data = callback_data.lock().unwrap(); - assert!(data.len() >= 1, "Expected at least one rotation event"); - - let (old_path, new_path, rotation_timestamp) = &data[0]; - assert!(old_path.to_string_lossy().contains(".t")); - assert_eq!(new_path, &temp_dir.path().join("test.jrn")); - assert!(*rotation_timestamp <= last_timestamp); - - Ok(()) -} - #[tokio::test] async fn test_manual_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; @@ -703,42 +659,3 @@ async fn test_multiple_rotations_with_compression() -> anyhow::Result<()> { Ok(()) } - -#[tokio::test] -async fn test_rotation_callback_receives_compressed_path() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - - let callback_data = Arc::new(Mutex::new(None)); - let callback_data_clone = Arc::clone(&callback_data); - - store.set_rotation_callback(Box::new(move |old_path, _new_path, _version| { - let mut data = callback_data_clone.lock().unwrap(); - *data = Some(old_path.to_path_buf()); - })); - - store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - store.rotate_journal().await?; - - // Verify callback received compressed path - let data = callback_data.lock().unwrap(); - let archived_path = data.as_ref().unwrap(); - - assert!( - archived_path.to_string_lossy().ends_with(".zz"), - "Callback should receive compressed archive path ending with .zz, got: {:?}", - archived_path - ); - - // Verify the file actually exists - assert!( - archived_path.exists(), - "Compressed archive passed to callback should exist" - ); - - Ok(()) -} diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index 4bd22668..2f21d269 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -10,17 +10,6 @@ use ahash::AHashMap; use bd_bonjson::Value; use std::path::{Path, PathBuf}; -/// Callback invoked when journal rotation occurs. -/// -/// The callback receives: -/// - `old_journal_path`: The path to the archived journal file that was just rotated out -/// - `new_journal_path`: The path to the new active journal file -/// - `rotation_timestamp`: The timestamp at which rotation occurred (snapshot timestamp) -/// -/// This callback can be used to trigger asynchronous upload of archived journals to remote -/// storage, perform cleanup, or other post-rotation operations. -pub type RotationCallback = Box; - /// Compress an archived journal using zlib. /// /// This function compresses the source file to the destination using zlib compression. 
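For reference, the compression step described in this doc comment can be approximated with the sketch below. This is an illustrative blocking version only, assuming flate2's `ZlibEncoder`; the crate's actual helper is async and streams via tokio, and the function name, signature, and level shown here are assumptions (the docs use level 3 at this point in the series and bump it to 5 later).

```rust
use std::fs::File;
use std::io::{self, BufReader, BufWriter};

use flate2::Compression;
use flate2::write::ZlibEncoder;

// Hypothetical helper: stream-compress an archived journal into a `.zz` file.
fn compress_archived_journal(src: &std::path::Path, dst: &std::path::Path) -> io::Result<()> {
  let mut reader = BufReader::new(File::open(src)?);
  let writer = BufWriter::new(File::create(dst)?);
  // RFC 1950 zlib stream; the level is a speed/ratio trade-off.
  let mut encoder = ZlibEncoder::new(writer, Compression::new(3));
  // Stream the journal through the encoder without loading it fully into memory.
  io::copy(&mut reader, &mut encoder)?;
  encoder.finish()?;
  Ok(())
}
```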
@@ -106,7 +95,6 @@ pub struct VersionedKVStore { journal_name: String, buffer_size: usize, high_water_mark_ratio: Option, - rotation_callback: Option, } impl VersionedKVStore { @@ -156,7 +144,6 @@ impl VersionedKVStore { journal_name: name.to_string(), buffer_size, high_water_mark_ratio, - rotation_callback: None, }) } @@ -197,7 +184,6 @@ impl VersionedKVStore { journal_name: name.to_string(), buffer_size, high_water_mark_ratio, - rotation_callback: None, }) } @@ -206,15 +192,6 @@ impl VersionedKVStore { self.dir_path.join(format!("{}.jrn", self.journal_name)) } - /// Set a callback to be invoked when journal rotation occurs. - /// - /// The callback receives the path to the archived journal file, the new active journal file, - /// and the rotation version. This can be used to trigger asynchronous upload of archived - /// journals to remote storage. - pub fn set_rotation_callback(&mut self, callback: RotationCallback) { - self.rotation_callback = Some(callback); - } - /// Get a value by key. /// /// This operation is O(1) as it reads from the in-memory cache. @@ -395,11 +372,6 @@ impl VersionedKVStore { // Remove uncompressed version tokio::fs::remove_file(&temp_uncompressed).await?; - // Invoke rotation callback if set - if let Some(ref mut callback) = self.rotation_callback { - callback(&archived_path, &journal_path, rotation_timestamp); - } - Ok(()) } From 8045327cfe4b14ab93b1fdfb55218604fa4bb815 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Wed, 5 Nov 2025 20:25:33 -0800 Subject: [PATCH 22/66] update docs --- bd-resilient-kv/AGENTS.md | 10 +++++----- bd-resilient-kv/README.md | 6 +++--- bd-resilient-kv/VERSIONED_FORMAT.md | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/bd-resilient-kv/AGENTS.md b/bd-resilient-kv/AGENTS.md index 8f6777f8..a0293c3d 100644 --- a/bd-resilient-kv/AGENTS.md +++ b/bd-resilient-kv/AGENTS.md @@ -73,7 +73,7 @@ The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJ - Automatic rotation when journal size exceeds high water mark (triggered during async write operations) - Current state is compacted into a new journal as versioned entries - Old journal is archived with `.v{version}.zz` suffix -- Archived journals are automatically compressed using zlib (RFC 1950, level 3) asynchronously +- Archived journals are automatically compressed using zlib (RFC 1950, level 5) asynchronously - Application controls upload/cleanup of archived journals **Rotation Guarantees**: @@ -84,7 +84,7 @@ The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJ **Compression**: - All archived journals are automatically compressed during rotation using async I/O - Active journals remain uncompressed for write performance -- Compression uses zlib format (RFC 1950) with level 3 for balanced speed/ratio +- Compression uses zlib format (RFC 1950) with level 5 for balanced speed/ratio - Streaming compression avoids loading entire journals into memory - Typical compression achieves >50% size reduction for text-based data - File extension `.zz` indicates compressed archives @@ -121,13 +121,13 @@ The `recover_current()` method in `VersionedRecovery` is optimized to only read **Key Differences**: - **KVStore**: Switches between two buffers, old buffer is reset and reused -- **VersionedKVStore**: Archives old journal with `.v{version}` suffix, creates new journal +- **VersionedKVStore**: Archives old journal with `.t{timestamp}` suffix, creates new journal - **Version Preservation**: Archived journals preserve complete 
history for recovery **When Rotation Occurs**: - Triggered during `insert()` or `remove()` when journal size exceeds high water mark - Can be manually triggered via `rotate()` -- Automatic and transparent to the caller (except for callback) +- Automatic and transparent to the caller ### 4. Bulk Operations and Retry Logic The system includes sophisticated retry logic specifically for bulk operations: @@ -403,4 +403,4 @@ The kv_journal system is built around efficient append-only storage with intelli - **Cross-Layer Integration**: Consistent bulk operation patterns from FeatureFlags → KVStore → KVJournal - **Optimized for Real-World Use**: Handles edge cases like partial buffer fills and concurrent compaction -**Breaking Changes**: The callback system (`set_high_water_mark_callback`, `HighWaterMarkCallback`) has been completely removed. Code relying on callbacks will no longer compile and must be updated to check the `is_high_water_mark_triggered()` flag instead. \ No newline at end of file +**Breaking Changes**: The callback system (`set_high_water_mark_callback`, `HighWaterMarkCallback`) has been completely removed. Code relying on callbacks will no longer compile and must be updated to check the `is_high_water_mark_triggered()` flag instead. diff --git a/bd-resilient-kv/README.md b/bd-resilient-kv/README.md index 68855d38..f310ac43 100644 --- a/bd-resilient-kv/README.md +++ b/bd-resilient-kv/README.md @@ -233,7 +233,7 @@ async fn main() -> anyhow::Result<()> { - Creates a new journal with the current state as versioned entries (compaction) - Preserves original timestamps from the initial writes - Archives the old journal with `.v{version}.zz` suffix - - Compresses the archived journal using zlib (RFC 1950, level 3) asynchronously + - Compresses the archived journal using zlib (RFC 1950, level 5) asynchronously - Invokes the rotation callback (if provided) for upload/cleanup - **Automatic Compression**: Archived journals are automatically compressed to save disk space - Active journals remain uncompressed for write performance @@ -377,7 +377,7 @@ After rotation: ``` **Compression**: -- Archived journals are automatically compressed using zlib (RFC 1950, level 3) +- Archived journals are automatically compressed using zlib (RFC 1950, level 5) - Active journals remain uncompressed for optimal write performance - Decompression is handled transparently during recovery - File extension `.zz` indicates compressed archives @@ -531,7 +531,7 @@ async fn main() -> anyhow::Result<()> { ``` **Compression Details**: -- **Format**: zlib (RFC 1950) with compression level 3 +- **Format**: zlib (RFC 1950) with compression level 5 - **Performance**: Balanced speed/compression ratio, performed asynchronously with streaming I/O - **Transparency**: Recovery automatically detects and decompresses archived journals - **Naming**: `.zz` extension indicates compressed archives diff --git a/bd-resilient-kv/VERSIONED_FORMAT.md b/bd-resilient-kv/VERSIONED_FORMAT.md index d9de93bd..14bd87c8 100644 --- a/bd-resilient-kv/VERSIONED_FORMAT.md +++ b/bd-resilient-kv/VERSIONED_FORMAT.md @@ -126,12 +126,12 @@ After rotation: ### Compression -Archived journals are automatically compressed using zlib (compression level 3) during rotation: +Archived journals are automatically compressed using zlib (compression level 5) during rotation: - **Format**: Standard zlib format (RFC 1950) - **Extension**: `.zz` indicates zlib compression - **Transparency**: `VersionedRecovery` automatically decompresses archives when reading - 
**Benefits**: Reduced storage space and bandwidth for remote backups -- **Performance**: Compression level 3 provides good balance between speed and compression ratio +- **Performance**: Compression level 5 provides good balance between speed and compression ratio ### Rotation Failure Modes From fb10cb8a7bf477dfba2662bc01907e5c1e77e2cc Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 11:05:06 -0800 Subject: [PATCH 23/66] revert cargo --- bd-logger/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bd-logger/Cargo.toml b/bd-logger/Cargo.toml index 85dd6345..f65a9371 100644 --- a/bd-logger/Cargo.toml +++ b/bd-logger/Cargo.toml @@ -63,7 +63,7 @@ bd-hyper-network = { path = "../bd-hyper-network" } bd-noop-network = { path = "../bd-noop-network" } bd-test-helpers = { path = "../bd-test-helpers", default-features = false } ctor.workspace = true -flate2 = { workspace = true, features = ["zlib"] } +flate2.workspace = true pretty_assertions.workspace = true tempfile.workspace = true tokio-test.workspace = true From 163c53c5f0ade3518fa43b4fc9f42f6ca4c16d67 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 11:08:33 -0800 Subject: [PATCH 24/66] update docs --- bd-resilient-kv/AGENTS.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/bd-resilient-kv/AGENTS.md b/bd-resilient-kv/AGENTS.md index a0293c3d..b9711218 100644 --- a/bd-resilient-kv/AGENTS.md +++ b/bd-resilient-kv/AGENTS.md @@ -44,7 +44,7 @@ The system provides efficient bulk operations through a consistent pattern: The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJournal`: **Key Components**: -- **VersionedKVJournal**: Low-level journal that tracks version numbers for each entry +- **VersionedKVJournal**: Low-level journal that tracks timestamps for each entry - **MemMappedVersionedKVJournal**: Memory-mapped persistence layer - **VersionedKVStore**: High-level HashMap-like API with automatic rotation and async write operations @@ -55,10 +55,11 @@ The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJ - The async API enables efficient background compression without blocking the main thread **Version Tracking**: -- Every write operation (`insert`, `remove`) returns a monotonically non-decreasing version number -- Version numbers start at 1 (base version), first write is version 2 -- Entries with `Value::Null` are treated as deletions but still versioned -- During rotation, all snapshot entries share the same version (the rotation version) +- Every write operation (`insert`, `remove`) returns a monotonically non-decreasing timestamp (nanoseconds since UNIX epoch) +- Timestamps serve as both version identifiers and logical clocks +- If the system clock goes backward, timestamps are clamped to the last timestamp to maintain monotonicity +- Entries with `Value::Null` are treated as deletions but still timestamped +- During rotation, snapshot entries preserve their original timestamps **Timestamp Tracking**: - Each entry records a timestamp (nanoseconds since UNIX epoch) when the write occurred @@ -91,10 +92,10 @@ The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJ - Recovery transparently decompresses archived journals when needed **Point-in-Time Recovery**: -The `VersionedRecovery` utility provides point-in-time recovery capabilities for versioned journals. 
It works with raw journal bytes and can reconstruct state at any historical version, including across rotation boundaries. `VersionedRecovery` is designed for offline analysis, audit tooling, and server-side operations - it is separate from `VersionedKVStore` which is focused on active write operations. Applications can use `VersionedRecovery` to analyze archived journals and recover state at specific versions. The `from_files()` constructor is async for efficient file reading. +The `VersionedRecovery` utility provides point-in-time recovery capabilities for versioned journals. It works with raw journal bytes and can reconstruct state at any historical timestamp, including across rotation boundaries. `VersionedRecovery` is designed for offline analysis, audit tooling, and server-side operations - it is separate from `VersionedKVStore` which is focused on active write operations. Applications can use `VersionedRecovery` to analyze archived journals and recover state at specific timestamps. The `from_files()` constructor is async for efficient file reading. **Recovery Optimization**: -The `recover_current()` method in `VersionedRecovery` is optimized to only read the last journal rather than replaying all journals from the beginning. This is possible because journal rotation writes the complete current state into the new journal at the snapshot version, so the last journal alone contains the full current state. For historical version recovery, `recover_at_version()` intelligently selects and replays only the necessary journals. +The `recover_current()` method in `VersionedRecovery` is optimized to only read the last journal rather than replaying all journals from the beginning. This is possible because journal rotation writes the complete current state into the new journal with original timestamps preserved, so the last journal alone contains the full current state. For historical timestamp recovery, `recover_at_timestamp()` intelligently selects and replays only the necessary journals. ## Critical Design Insights @@ -110,7 +111,7 @@ The `recover_current()` method in `VersionedRecovery` is optimized to only read - Best for: Audit logs, state history, remote backup - Architecture: Single journal with archived versions - Rotation: Creates new journal with compacted state -- Version tracking: Every write returns a version number +- Timestamp tracking: Every write returns a timestamp ### 2. Compaction Efficiency **Key Insight**: Compaction via `reinit_from()` is already maximally efficient. It writes data in the most compact possible serialized form (hashmap → bytes). If even this compact representation exceeds high water marks, then the data volume itself is the limiting factor, not inefficient storage. @@ -270,8 +271,8 @@ fn set_multiple(&mut self, entries: &[(String, Value)]) -> anyhow::Result<()> { - **Why Impossible**: Compaction via `reinit_from()` writes to inactive buffer of the same size. Same reasoning as rotation. - **Implication**: `switch_journals()` may set high water mark flag, but won't fail due to buffer overflow -3. **Version Number Overflow (VersionedKVStore)** - - **Why Practically Impossible**: Uses u64, would require 58+ million years at 10,000 writes/second +3. 
**Timestamp Overflow (VersionedKVStore)** + - **Why Practically Impossible**: Uses u64 for nanosecond timestamps, would require 584+ years to overflow (u64::MAX nanoseconds ≈ year 2554) - **Implication**: No overflow handling needed in practice ## Common Pitfalls From 2bbc81ef398813cf971652c445966ce36f9d3bf2 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 11:11:26 -0800 Subject: [PATCH 25/66] simplify docs --- bd-resilient-kv/AGENTS.md | 15 +-------------- bd-resilient-kv/src/versioned_kv_store.rs | 5 +++++ 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/bd-resilient-kv/AGENTS.md b/bd-resilient-kv/AGENTS.md index b9711218..c7ef4bd6 100644 --- a/bd-resilient-kv/AGENTS.md +++ b/bd-resilient-kv/AGENTS.md @@ -77,11 +77,6 @@ The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJ - Archived journals are automatically compressed using zlib (RFC 1950, level 5) asynchronously - Application controls upload/cleanup of archived journals -**Rotation Guarantees**: -- **Impossible Failure Mode**: Rotation cannot fail due to insufficient buffer space -- **Reasoning**: Rotation creates a new journal with the same buffer size as the original. Since compaction only removes redundant updates (old versions of keys), the compacted state is always ≤ the current journal size. If data fits in the journal during normal operation, it will always fit during rotation. -- **Implication**: Applications do not need to handle "buffer overflow during rotation" errors. This is an architectural guarantee. - **Compression**: - All archived journals are automatically compressed during rotation using async I/O - Active journals remain uncompressed for write performance @@ -263,15 +258,7 @@ fn set_multiple(&mut self, entries: &[(String, Value)]) -> anyhow::Result<()> { ### Impossible Failure Modes (Architectural Guarantees) -1. **Buffer Overflow During Rotation (VersionedKVStore)** - - **Why Impossible**: Rotation creates new journal with same buffer size. Compaction only removes redundant updates, so compacted state ≤ current journal size. If data fits during normal operation, it always fits during rotation. - - **Implication**: No need to handle "insufficient buffer during rotation" errors - -2. **Buffer Overflow During Compaction (KVStore)** - - **Why Impossible**: Compaction via `reinit_from()` writes to inactive buffer of the same size. Same reasoning as rotation. - - **Implication**: `switch_journals()` may set high water mark flag, but won't fail due to buffer overflow - -3. **Timestamp Overflow (VersionedKVStore)** +1. **Timestamp Overflow (VersionedKVStore)** - **Why Practically Impossible**: Uses u64 for nanosecond timestamps, would require 584+ years to overflow (u64::MAX nanoseconds ≈ year 2554) - **Implication**: No overflow handling needed in practice diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index 2f21d269..a7bc971b 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -385,6 +385,11 @@ impl VersionedKVStore { } /// Create a new rotated journal with compacted state. + /// + /// Note: Rotation cannot fail due to insufficient buffer space. Since rotation creates a new + /// journal with the same buffer size and compaction only removes redundant updates (old + /// versions of keys), the compacted state is always ≤ the current journal size. If data fits + /// during normal operation, it will always fit during rotation. 
async fn create_rotated_journal(&self) -> anyhow::Result { // Create temporary journal file let temp_path = self.dir_path.join(format!("{}.jrn.tmp", self.journal_name)); From 790a5331d2adaf4940fae83f959dcd4e51b1bb96 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 12:50:58 -0800 Subject: [PATCH 26/66] simplify recovery --- bd-resilient-kv/AGENTS.md | 5 +- .../src/tests/versioned_recovery_test.rs | 421 ++++++++---------- bd-resilient-kv/src/versioned_recovery.rs | 248 ++--------- 3 files changed, 240 insertions(+), 434 deletions(-) diff --git a/bd-resilient-kv/AGENTS.md b/bd-resilient-kv/AGENTS.md index c7ef4bd6..4df8a008 100644 --- a/bd-resilient-kv/AGENTS.md +++ b/bd-resilient-kv/AGENTS.md @@ -87,10 +87,7 @@ The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJ - Recovery transparently decompresses archived journals when needed **Point-in-Time Recovery**: -The `VersionedRecovery` utility provides point-in-time recovery capabilities for versioned journals. It works with raw journal bytes and can reconstruct state at any historical timestamp, including across rotation boundaries. `VersionedRecovery` is designed for offline analysis, audit tooling, and server-side operations - it is separate from `VersionedKVStore` which is focused on active write operations. Applications can use `VersionedRecovery` to analyze archived journals and recover state at specific timestamps. The `from_files()` constructor is async for efficient file reading. - -**Recovery Optimization**: -The `recover_current()` method in `VersionedRecovery` is optimized to only read the last journal rather than replaying all journals from the beginning. This is possible because journal rotation writes the complete current state into the new journal with original timestamps preserved, so the last journal alone contains the full current state. For historical timestamp recovery, `recover_at_timestamp()` intelligently selects and replays only the necessary journals. +The `VersionedRecovery` utility provides point-in-time recovery by replaying journal entries up to a target timestamp. It works with raw journal bytes and can reconstruct state at any historical timestamp across rotation boundaries. Recovery is optimized: `recover_current()` only reads the last journal (since rotation writes complete compacted state), while `recover_at_timestamp()` intelligently selects and replays only necessary journals. The `from_files()` constructor is async for efficient file reading. ## Critical Design Insights diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index 092ea6e7..fad0a9ef 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -6,12 +6,64 @@ // https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt #![allow(clippy::unwrap_used)] +#![allow(clippy::case_sensitive_file_extension_comparisons)] use crate::VersionedKVStore; use crate::versioned_recovery::VersionedRecovery; use bd_bonjson::Value; use tempfile::TempDir; +/// Helper function to decompress zlib-compressed data. +/// The `VersionedRecovery` no longer handles compression, so tests must decompress manually. 
+fn decompress_zlib(data: &[u8]) -> anyhow::Result<Vec<u8>> {
+  use flate2::read::ZlibDecoder;
+  use std::io::Read;
+
+  let mut decoder = ZlibDecoder::new(data);
+  let mut decompressed = Vec::new();
+  decoder.read_to_end(&mut decompressed)?;
+  Ok(decompressed)
+}
+
+/// Helper function to find archived journal files in a directory.
+/// Returns sorted paths to all `.zz` compressed journal archives.
+fn find_archived_journals(dir: &std::path::Path) -> anyhow::Result<Vec<std::path::PathBuf>> {
+  let mut archived_files = std::fs::read_dir(dir)?
+    .filter_map(|entry| {
+      let entry = entry.ok()?;
+      let path = entry.path();
+      let filename = path.file_name()?.to_str()?;
+      if filename.ends_with(".zz") {
+        Some(path)
+      } else {
+        None
+      }
+    })
+    .collect::<Vec<_>>();
+  archived_files.sort();
+  Ok(archived_files)
+}
+
+/// Helper function to extract rotation timestamp from an archived journal filename.
+/// Archived journals have the format: `{name}.jrn.t{timestamp}.zz`
+fn extract_rotation_timestamp(path: &std::path::Path) -> anyhow::Result<u64> {
+  let filename = path
+    .file_name()
+    .and_then(|f| f.to_str())
+    .ok_or_else(|| anyhow::anyhow!("Invalid filename"))?;
+
+  let timestamp = filename
+    .split('.')
+    .find(|part| {
+      part.starts_with('t') && part.len() > 1 && part[1 ..].chars().all(|c| c.is_ascii_digit())
+    })
+    .and_then(|part| part.strip_prefix('t'))
+    .ok_or_else(|| anyhow::anyhow!("No timestamp found in filename: {}", filename))?
+    .parse::<u64>()?;
+
+  Ok(timestamp)
+}
+
 #[tokio::test]
 async fn test_recover_current_only_needs_last_journal() -> anyhow::Result<()> {
   let temp_dir = TempDir::new()?;
@@ -52,33 +104,23 @@ async fn test_recover_current_only_needs_last_journal() -> anyhow::Result<()> {
   // Get expected current state using the store's hashmap
   let expected_state = store.as_hashmap();
 
-  // Read ALL journals
+  // Read ALL journals (archived + active)
   let mut all_journals = Vec::new();
 
-  let mut archived_paths = std::fs::read_dir(temp_dir.path())?
-    .filter_map(|entry| {
-      let entry = entry.ok()?;
-      let path = entry.path();
-      if path.file_name()?.to_str()?.starts_with("test.jrn.t") {
-        Some(path)
-      } else {
-        None
-      }
-    })
-    .collect::<Vec<_>>();
-
-  // Sort to ensure chronological order
-  archived_paths.sort();
-  for archived_path in &archived_paths {
-    all_journals.push(std::fs::read(archived_path)?);
+  for archived_path in find_archived_journals(temp_dir.path())? {
+    let rotation_ts = extract_rotation_timestamp(&archived_path)?;
+    all_journals.push((std::fs::read(archived_path)?, rotation_ts));
   }
 
-  // Read active journal (the last one)
+  // Read active journal (the last one) - use u64::MAX for active journal
   let active_journal = std::fs::read(temp_dir.path().join("test.jrn"))?;
-  all_journals.push(active_journal.clone());
+  all_journals.push((active_journal.clone(), u64::MAX));
 
   // Test 1: Verify recover_current() with ALL journals gives correct state
-  let all_journal_refs: Vec<&[u8]> = all_journals.iter().map(Vec::as_slice).collect();
+  let all_journal_refs: Vec<(&[u8], u64)> = all_journals
+    .iter()
+    .map(|(data, ts)| (data.as_slice(), *ts))
+    .collect();
   let recovery_all = VersionedRecovery::new(all_journal_refs)?;
   let state_all = recovery_all.recover_current()?;
 
@@ -90,7 +132,7 @@ async fn test_recover_current_only_needs_last_journal() -> anyhow::Result<()> {
   // Test 2: Verify recover_current() with ONLY the last journal gives the same state
  // This is the optimization we want to prove works!
- let recovery_last = VersionedRecovery::new(vec![&active_journal])?; + let recovery_last = VersionedRecovery::new(vec![(&active_journal, u64::MAX)])?; let state_last = recovery_last.recover_current()?; let state_last_values: ahash::AHashMap = state_last @@ -127,18 +169,7 @@ async fn test_detection_compressed_journal() -> anyhow::Result<()> { store.rotate_journal().await?; // Find the archived file - let mut archived_files = std::fs::read_dir(temp_dir.path())? - .filter_map(|entry| { - let entry = entry.ok()?; - let path = entry.path(); - if path.file_name()?.to_str()?.starts_with("test.jrn.t") { - Some(path) - } else { - None - } - }) - .collect::>(); - archived_files.sort(); + let archived_files = find_archived_journals(temp_dir.path())?; let archived_path = archived_files.first().unwrap(); let compressed_data = std::fs::read(archived_path)?; @@ -148,8 +179,11 @@ async fn test_detection_compressed_journal() -> anyhow::Result<()> { "Compressed data should start with zlib magic byte" ); - // Should successfully detect and decompress - let recovery = VersionedRecovery::new(vec![&compressed_data])?; + // Decompress manually since VersionedRecovery no longer handles compression + let decompressed_data = decompress_zlib(&compressed_data)?; + + // Should successfully recover from decompressed data + let recovery = VersionedRecovery::new(vec![(&decompressed_data, u64::MAX)])?; let state = recovery.recover_current()?; assert_eq!(state.len(), 1); assert_eq!( @@ -173,59 +207,76 @@ async fn test_detection_invalid_journal_data() -> anyhow::Result<()> { let valid_data = std::fs::read(temp_dir.path().join("test.jrn"))?; // Test 1: Invalid format version + // Since VersionedRecovery no longer validates during construction, + // errors will occur when trying to recover data let mut invalid_version = vec![0u8; 32]; let version_bytes = 999u64.to_le_bytes(); invalid_version[0 .. 
8].copy_from_slice(&version_bytes); - let result = VersionedRecovery::new(vec![&invalid_version]); - assert!(result.is_err()); + let recovery = VersionedRecovery::new(vec![(&invalid_version, u64::MAX)])?; + let result = recovery.recover_current(); assert!( - result - .unwrap_err() - .to_string() - .contains("Invalid journal format version"), - "Should fail with invalid version error" + result.is_err(), + "Should fail when recovering with invalid version" ); // Test 2: Data too small (smaller than header) let small_data = vec![0u8; 8]; - let result = VersionedRecovery::new(vec![&small_data]); - assert!(result.is_err()); + let recovery = VersionedRecovery::new(vec![(&small_data, u64::MAX)])?; + let result = recovery.recover_current(); assert!( - result.unwrap_err().to_string().contains("Data too small"), - "Should fail with data too small error" + result.is_err(), + "Should fail when recovering with data too small" ); // Test 3: Empty data let empty_data = vec![]; - let result = VersionedRecovery::new(vec![&empty_data]); - assert!(result.is_err()); + let recovery = VersionedRecovery::new(vec![(&empty_data, u64::MAX)])?; + let result = recovery.recover_current(); assert!( - result.unwrap_err().to_string().contains("Data too small"), - "Should fail with data too small error" + result.is_err(), + "Should fail when recovering with empty data" ); - // Test 4: Corrupted zlib header + // Test 4: Corrupted zlib data (caller should decompress before passing) + // If caller accidentally passes compressed data, it will fail during recovery let mut fake_zlib = vec![0x78, 0x9C]; // Valid zlib magic bytes fake_zlib.extend_from_slice(&[0xFF; 100]); // But garbage data - let result = VersionedRecovery::new(vec![&fake_zlib]); - assert!(result.is_err(), "Should fail with corrupted zlib data"); + let recovery = VersionedRecovery::new(vec![(&fake_zlib, u64::MAX)])?; + let result = recovery.recover_current(); + assert!( + result.is_err(), + "Should fail when recovering with compressed data" + ); // Test 5: Random garbage let garbage = vec![0xAB, 0xCD, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x90]; - let result = VersionedRecovery::new(vec![&garbage]); - assert!(result.is_err()); - let err_msg = result.unwrap_err().to_string(); + let recovery = VersionedRecovery::new(vec![(&garbage, u64::MAX)])?; + let result = recovery.recover_current(); assert!( - err_msg.contains("Data too small") || err_msg.contains("corrupt"), - "Should fail with appropriate error" + result.is_err(), + "Should fail when recovering with garbage data" ); // Test 6: Mixed valid and invalid journals + // When the last journal is invalid, recover_current() should fail let mut invalid_mixed = vec![0u8; 32]; let version_bytes = 999u64.to_le_bytes(); invalid_mixed[0 .. 
8].copy_from_slice(&version_bytes); - let result = VersionedRecovery::new(vec![&valid_data, &invalid_mixed]); - assert!(result.is_err(), "Should fail if any journal is invalid"); + let recovery = VersionedRecovery::new(vec![(&valid_data, u64::MAX), (&invalid_mixed, u64::MAX)])?; + let result = recovery.recover_current(); + assert!( + result.is_err(), + "Should fail because last journal is invalid" + ); + + // Test 7: Mixed invalid and valid journals + // When the last journal is valid, recover_current() should succeed + let recovery = VersionedRecovery::new(vec![(&invalid_mixed, u64::MAX), (&valid_data, u64::MAX)])?; + let result = recovery.recover_current(); + assert!( + result.is_ok(), + "Should succeed because last journal is valid" + ); Ok(()) } @@ -253,10 +304,13 @@ fn test_detection_zlib_compression_level_5() { // Verify it starts with 0x78 (zlib magic byte) assert_eq!(compressed[0], 0x78); - // Should be able to detect and decompress - let result = VersionedRecovery::new(vec![&compressed]); + // Decompress manually since VersionedRecovery no longer handles compression + let decompressed = decompress_zlib(&compressed).unwrap(); + + // Should be able to process the decompressed data + let result = VersionedRecovery::new(vec![(&decompressed, u64::MAX)]); // May succeed or fail depending on whether the data is valid bonjson, - // but should at least attempt decompression without panicking + // but should at least attempt to parse without panicking let _ = result; } @@ -312,27 +366,19 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { let mut all_journals = Vec::new(); // Read archived journals - let archived_files = std::fs::read_dir(temp_dir.path())? - .filter_map(|entry| { - let entry = entry.ok()?; - let path = entry.path(); - if path.file_name()?.to_str()?.starts_with("test.jrn.t") { - Some(path) - } else { - None - } - }) - .collect::>(); - - for archived_path in archived_files { - all_journals.push(std::fs::read(archived_path)?); + for archived_path in find_archived_journals(temp_dir.path())? 
{ + let rotation_ts = extract_rotation_timestamp(&archived_path)?; + all_journals.push((std::fs::read(archived_path)?, rotation_ts)); } // Read active journal - all_journals.push(std::fs::read(temp_dir.path().join("test.jrn"))?); + all_journals.push((std::fs::read(temp_dir.path().join("test.jrn"))?, u64::MAX)); // Create recovery utility with all journals - let journal_refs: Vec<&[u8]> = all_journals.iter().map(std::vec::Vec::as_slice).collect(); + let journal_refs: Vec<(&[u8], u64)> = all_journals + .iter() + .map(|(data, ts)| (data.as_slice(), *ts)) + .collect(); let recovery = VersionedRecovery::new(journal_refs)?; // Verify we can recover at early timestamp @@ -367,11 +413,7 @@ async fn test_recovery_empty_journal() -> anyhow::Result<()> { store.sync()?; let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - let recovery = VersionedRecovery::new(vec![&journal_data])?; - - // Should have timestamp range starting at base - let timestamp_range = recovery.timestamp_range(); - assert!(timestamp_range.is_some()); + let recovery = VersionedRecovery::new(vec![(&journal_data, u64::MAX)])?; // Recovering current should return empty map let state = recovery.recover_current()?; @@ -411,7 +453,7 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { store.sync()?; let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - let recovery = VersionedRecovery::new(vec![&journal_data])?; + let recovery = VersionedRecovery::new(vec![(&journal_data, u64::MAX)])?; // Each timestamp should show the value at that time let state_ts1 = recovery.recover_at_timestamp(ts1)?; @@ -454,7 +496,7 @@ async fn test_recovery_various_value_types() -> anyhow::Result<()> { store.sync()?; let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - let recovery = VersionedRecovery::new(vec![&journal_data])?; + let recovery = VersionedRecovery::new(vec![(&journal_data, u64::MAX)])?; let state = recovery.recover_current()?; assert_eq!(state.len(), 4); @@ -522,18 +564,7 @@ async fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { store.sync()?; // Find the compressed archive - let mut archived_files = std::fs::read_dir(temp_dir.path())? 
- .filter_map(|entry| { - let entry = entry.ok()?; - let path = entry.path(); - if path.file_name()?.to_str()?.starts_with("test.jrn.t") { - Some(path) - } else { - None - } - }) - .collect::>(); - archived_files.sort(); + let archived_files = find_archived_journals(temp_dir.path())?; let archived_path = archived_files.first().unwrap(); assert!(archived_path.exists(), "Compressed archive should exist"); @@ -541,15 +572,17 @@ async fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { let compressed_data = std::fs::read(archived_path)?; let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - // Create recovery from both journals (compressed first, then active) - let recovery = VersionedRecovery::new(vec![&compressed_data, &active_data])?; + // Decompress the archived journal manually + let decompressed_data = decompress_zlib(&compressed_data)?; - // Verify timestamp range spans both journals - let timestamp_range = recovery.timestamp_range(); - assert!(timestamp_range.is_some()); - let (min, max) = timestamp_range.unwrap(); - assert!(min <= ts1); - assert!(max >= ts3); + // Extract rotation timestamp from archived filename + let rotation_ts = extract_rotation_timestamp(archived_path)?; + + // Create recovery from both journals (decompressed first, then active) + let recovery = VersionedRecovery::new(vec![ + (&decompressed_data, rotation_ts), + (&active_data, u64::MAX), + ])?; // Recover at ts1 (should be in compressed archive) let state_ts1 = recovery.recover_at_timestamp(ts1)?; @@ -616,18 +649,7 @@ async fn test_recovery_from_multiple_compressed_archives() -> anyhow::Result<()> store.sync()?; // Collect all journal data (2 compressed + 1 active) - let mut archived_files = std::fs::read_dir(temp_dir.path())? - .filter_map(|entry| { - let entry = entry.ok()?; - let path = entry.path(); - if path.file_name()?.to_str()?.starts_with("test.jrn.t") { - Some(path) - } else { - None - } - }) - .collect::>(); - archived_files.sort(); + let archived_files = find_archived_journals(temp_dir.path())?; let archive1_path = &archived_files[0]; let archive2_path = &archived_files[1]; @@ -636,8 +658,20 @@ async fn test_recovery_from_multiple_compressed_archives() -> anyhow::Result<()> let archive2_data = std::fs::read(archive2_path)?; let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - // Create recovery from all journals - let recovery = VersionedRecovery::new(vec![&archive1_data, &archive2_data, &active_data])?; + // Decompress archived journals manually + let decompressed1 = decompress_zlib(&archive1_data)?; + let decompressed2 = decompress_zlib(&archive2_data)?; + + // Extract rotation timestamps + let archive1_ts = extract_rotation_timestamp(archive1_path)?; + let archive2_ts = extract_rotation_timestamp(archive2_path)?; + + // Create recovery from all journals (all decompressed) + let recovery = VersionedRecovery::new(vec![ + (&decompressed1, archive1_ts), + (&decompressed2, archive2_ts), + (&active_data, u64::MAX), + ])?; // Verify we can recover at any timestamp let state_ts1 = recovery.recover_at_timestamp(ts1)?; @@ -672,18 +706,7 @@ async fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> store.rotate_journal().await?; // Get compressed archive - let mut archived_files = std::fs::read_dir(temp_dir.path())? 
- .filter_map(|entry| { - let entry = entry.ok()?; - let path = entry.path(); - if path.file_name()?.to_str()?.starts_with("test.jrn.t") { - Some(path) - } else { - None - } - }) - .collect::>(); - archived_files.sort(); + let archived_files = find_archived_journals(temp_dir.path())?; let compressed_archive_path = archived_files.first().unwrap(); let compressed_data = std::fs::read(compressed_archive_path)?; @@ -699,8 +722,17 @@ async fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> uncompressed_store.sync()?; let uncompressed_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - // Recovery should handle both compressed and uncompressed - let recovery = VersionedRecovery::new(vec![&compressed_data, &uncompressed_data])?; + // Extract rotation timestamp from compressed archive + let compressed_ts = extract_rotation_timestamp(compressed_archive_path)?; + + // Decompress the compressed archive manually + let decompressed_data = decompress_zlib(&compressed_data)?; + + // Recovery now requires both journals to be decompressed + let recovery = VersionedRecovery::new(vec![ + (&decompressed_data, compressed_ts), + (&uncompressed_data, u64::MAX), + ])?; let state_final = recovery.recover_at_timestamp(ts2)?; assert_eq!(state_final.len(), 2); @@ -753,14 +785,7 @@ async fn test_recovery_at_timestamp() -> anyhow::Result<()> { let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; // Create recovery utility - let recovery = VersionedRecovery::new(vec![&journal_data])?; - - // Verify timestamp range - let timestamp_range = recovery.timestamp_range(); - assert!(timestamp_range.is_some()); - let (min, max) = timestamp_range.unwrap(); - assert!(min <= ts1); - assert!(max >= ts3); + let recovery = VersionedRecovery::new(vec![(&journal_data, u64::MAX)])?; // Recover at ts1: should have only key1=value1 let state_ts1 = recovery.recover_at_timestamp(ts1)?; @@ -846,17 +871,16 @@ async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { let archived_data = std::fs::read(&archived_path)?; let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - // Create recovery from both journals - let recovery = VersionedRecovery::new(vec![&archived_data, &active_data])?; + // Decompress the archived journal manually + let decompressed_archived = decompress_zlib(&archived_data)?; - // Verify timestamp range spans both journals - let timestamp_range = recovery.timestamp_range(); - assert!(timestamp_range.is_some()); - let (min, max) = timestamp_range.unwrap(); - assert!(min <= ts1); - assert!(max >= ts3); + // Create recovery from both journals + let recovery = VersionedRecovery::new(vec![ + (&decompressed_archived, rotation_ts), + (&active_data, u64::MAX), + ])?; - // Recover at ts1 (should be in archived journal) + // Verify we can recover at any timestamp across both journals let state_ts1 = recovery.recover_at_timestamp(ts1)?; assert_eq!(state_ts1.len(), 1); assert!(state_ts1.contains_key("key1")); @@ -877,55 +901,6 @@ async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { Ok(()) } -#[tokio::test] -async fn test_timestamp_range() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - - store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - let ts1 = store - .get_with_timestamp("key1") - .map(|tv| tv.timestamp) - .unwrap(); - - std::thread::sleep(std::time::Duration::from_millis(10)); - - store - 
.insert("key2".to_string(), Value::String("value2".to_string())) - .await?; - - std::thread::sleep(std::time::Duration::from_millis(10)); - - store - .insert("key3".to_string(), Value::String("value3".to_string())) - .await?; - let ts3 = store - .get_with_timestamp("key3") - .map(|tv| tv.timestamp) - .unwrap(); - - store.sync()?; - - let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - let recovery = VersionedRecovery::new(vec![&journal_data])?; - - let timestamp_range = recovery.timestamp_range(); - assert!(timestamp_range.is_some()); - let (min, max) = timestamp_range.unwrap(); - - // Min should be <= first timestamp, max should be >= last timestamp - assert!(min <= ts1); - assert!(max >= ts3); - - // Timestamps should be ordered - assert!(ts3 > ts1); - - Ok(()) -} - #[tokio::test] async fn test_recovery_decompression_transparent() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; @@ -945,33 +920,28 @@ async fn test_recovery_decompression_transparent() -> anyhow::Result<()> { // Create uncompressed recovery baseline let uncompressed_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - let recovery_uncompressed = VersionedRecovery::new(vec![&uncompressed_data])?; + let recovery_uncompressed = VersionedRecovery::new(vec![(&uncompressed_data, u64::MAX)])?; let state_uncompressed = recovery_uncompressed.recover_at_timestamp(ts1)?; // Rotate to compress store.rotate_journal().await?; // Read compressed archive - let mut archived_files = std::fs::read_dir(temp_dir.path())? - .filter_map(|entry| { - let entry = entry.ok()?; - let path = entry.path(); - if path.file_name()?.to_str()?.starts_with("test.jrn.t") { - Some(path) - } else { - None - } - }) - .collect::>(); - archived_files.sort(); + let archived_files = find_archived_journals(temp_dir.path())?; let compressed_path = archived_files.first().unwrap(); let compressed_data = std::fs::read(compressed_path)?; + // Extract rotation timestamp from compressed archive filename + let compressed_ts = extract_rotation_timestamp(compressed_path)?; + // Verify it's actually compressed (smaller) assert!(compressed_data.len() < uncompressed_data.len()); - // Create recovery from compressed data - let recovery_compressed = VersionedRecovery::new(vec![&compressed_data])?; + // Decompress the archived journal manually + let decompressed_data = decompress_zlib(&compressed_data)?; + + // Create recovery from decompressed data + let recovery_compressed = VersionedRecovery::new(vec![(&decompressed_data, compressed_ts)])?; let state_compressed = recovery_compressed.recover_at_timestamp(ts1)?; // Both should produce identical results @@ -1013,24 +983,19 @@ async fn test_journal_ordering_requirement() -> anyhow::Result<()> { store.sync()?; // Read both journals (archived + active) - let mut archived_files = std::fs::read_dir(temp_dir.path())? 
- .filter_map(|entry| { - let entry = entry.ok()?; - let path = entry.path(); - if path.file_name()?.to_str()?.starts_with("test.jrn.t") { - Some(path) - } else { - None - } - }) - .collect::>(); - archived_files.sort(); + let archived_files = find_archived_journals(temp_dir.path())?; let archived_data = std::fs::read(archived_files.first().unwrap())?; let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + // Extract rotation timestamp from archived filename + let archived_ts = extract_rotation_timestamp(archived_files.first().unwrap())?; + // Should succeed when journals are in correct chronological order (archived, then active) - let recovery = VersionedRecovery::new(vec![&archived_data, &active_data]); + let recovery = VersionedRecovery::new(vec![ + (&archived_data, archived_ts), + (&active_data, u64::MAX), + ]); assert!(recovery.is_ok(), "Should succeed with correct ordering"); // Verify correct ordering produces expected results diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_recovery.rs index 43b13c49..b6974d39 100644 --- a/bd-resilient-kv/src/versioned_recovery.rs +++ b/bd-resilient-kv/src/versioned_recovery.rs @@ -9,9 +9,6 @@ use crate::kv_journal::TimestampedValue; use ahash::AHashMap; use bd_bonjson::Value; use bd_bonjson::decoder::from_slice; -use flate2::read::ZlibDecoder; -use std::io::Read; -use std::path::Path; /// Helper function to read a u64 field from a BONJSON object. /// @@ -35,69 +32,24 @@ fn read_u64_field(obj: &AHashMap, key: &str) -> Option { /// A utility for recovering state at arbitrary timestamps from raw journal data. /// -/// This utility operates on raw byte slices from timestamped journals and can reconstruct -/// the key-value state at any historical timestamp by replaying journal entries. +/// This utility operates on raw uncompressed byte slices from timestamped journals and can +/// reconstruct the key-value state at any historical timestamp by replaying journal entries. /// -/// # Timestamp-Based Recovery +/// # Recovery Model /// -/// Timestamps are monotonically non-decreasing logical clocks (not pure wall time), -/// enabling snapshots that match specific event buffer timestamps. +/// Recovery works by replaying journal entries in chronological order up to the target timestamp. +/// When journals are rotated, compacted entries preserve their original timestamps, which means +/// entry timestamps may overlap across adjacent journal snapshots. Recovery handles this correctly +/// by replaying journals sequentially and applying entries in timestamp order. /// -/// ## Snapshot Bucketing and Entry Timestamp Overlaps +/// ## Optimization /// -/// Entry timestamps may overlap across adjacent snapshots because compacted entries preserve -/// their original timestamps during rotation. This design provides implementation simplicity -/// and audit trail preservation without affecting recovery correctness. +/// To recover the current state, only the last journal needs to be read since rotation writes +/// the complete compacted state with original timestamps preserved. For historical timestamp +/// recovery, the utility automatically identifies and replays only the necessary journals. 
/// -/// **Design rationale:** Preserving original timestamps is not strictly required for -/// point-in-time state reconstruction, but provides benefits at zero cost: -/// - **Implementation simplicity**: No timestamp rewriting logic needed during rotation -/// - **Semantic accuracy**: Maintains "when was this value last modified" for audit trails -/// - **Future-proof**: Preserves historical information that may become useful -/// -/// Each snapshot has: -/// - `min_timestamp`: Minimum entry timestamp in the snapshot (from actual entries) -/// - `max_timestamp`: Maximum entry timestamp in the snapshot (from actual entries) -/// - Filename timestamp: The rotation point (equals `max_timestamp` of archived journal) -/// -/// Example timeline: -/// ```text -/// Snapshot 1: store.jrn.t300.zz -/// - Entries: foo@100, bar@200, foo@300 -/// - min_timestamp: 100, max_timestamp: 300 -/// - Range: [100, 300] -/// -/// Snapshot 2: store.jrn.t500.zz -/// - Compacted entries: foo@300, bar@200 (original timestamps!) -/// - New entries: baz@400, qux@500 -/// - min_timestamp: 200, max_timestamp: 500 -/// - Range: [200, 500] — overlaps with [100, 300]! -/// ``` -/// -/// ## Recovery Bucketing Model -/// -/// To recover state for multiple logs at different timestamps efficiently: -/// -/// 1. **Bucket logs by snapshot:** Compare log timestamp against each snapshot's `[min_timestamp, -/// max_timestamp]` range -/// 2. **Sequential replay:** For each bucket, replay journals sequentially up to target timestamp -/// 3. **State reconstruction:** Overlapping timestamps are handled correctly because compacted -/// entries represent the state at rotation time -/// -/// Example: Recovering logs at timestamps [100, 250, 400, 500] -/// - Log@100: Use Snapshot 1 (100 is in range [100, 300]) -/// - Log@250: Use Snapshot 1 (250 is in range [100, 300]) -/// - Log@400: Use Snapshot 2 (400 is in range [200, 500], replay compacted state + new entries) -/// - Log@500: Use Snapshot 2 (500 is in range [200, 500]) -/// -/// ## Invariants -/// -/// - Filename timestamps strictly increase (t300 < t500) -/// - Entry timestamp ranges may overlap between adjacent snapshots -/// - Sequential replay produces correct state at any timestamp -/// -/// Supports both compressed (zlib) and uncompressed journals. Compressed journals are -/// automatically detected and decompressed transparently. +/// **Note:** Callers are responsible for decompressing journal data if needed before passing +/// it to this utility. #[derive(Debug)] pub struct VersionedRecovery { journals: Vec, @@ -106,76 +58,44 @@ pub struct VersionedRecovery { #[derive(Debug)] struct JournalInfo { data: Vec, - min_timestamp: u64, - max_timestamp: u64, + rotation_timestamp: u64, } impl VersionedRecovery { - /// Create a new recovery utility from a list of journal byte slices. + /// Create a new recovery utility from a list of uncompressed journal byte slices with rotation + /// timestamps. /// /// The journals should be provided in chronological order (oldest to newest). - /// Each journal must be a valid versioned journal (VERSION 2 format). - /// Journals may be compressed with zlib or uncompressed - decompression is automatic. + /// Each journal must be a valid uncompressed versioned journal (VERSION 2 format). + /// + /// # Arguments + /// + /// * `journals` - A vector of tuples containing (`journal_data`, `rotation_timestamp`). The + /// `rotation_timestamp` represents when this journal was archived (the snapshot boundary). 
For + /// the active journal (not yet rotated), use `u64::MAX`. /// /// # Errors /// /// Returns an error if any journal is invalid or cannot be parsed. - pub fn new(journals: Vec<&[u8]>) -> anyhow::Result { - let mut journal_infos = Vec::new(); - - for data in journals { - // Detect and decompress if needed - let decompressed = decompress_if_needed(data)?; - let (min_timestamp, max_timestamp) = extract_timestamp_range(&decompressed)?; - journal_infos.push(JournalInfo { - data: decompressed, - min_timestamp, - max_timestamp, - }); - } + /// + /// # Note + /// + /// Callers must decompress journal data before passing it to this method if the data + /// is compressed (e.g., with zlib). + pub fn new(journals: Vec<(&[u8], u64)>) -> anyhow::Result { + let journal_infos = journals + .into_iter() + .map(|(data, rotation_timestamp)| JournalInfo { + data: data.to_vec(), + rotation_timestamp, + }) + .collect(); Ok(Self { journals: journal_infos, }) } - /// Create a new recovery utility from journal file paths. - /// - /// This is an async convenience method that reads journal files from disk. - /// The journals should be provided in chronological order (oldest to newest). - /// - /// # Errors - /// - /// Returns an error if any file cannot be read or if any journal is invalid. - pub async fn from_files(journal_paths: Vec<&Path>) -> anyhow::Result { - let mut journal_data = Vec::new(); - - for path in journal_paths { - let data = tokio::fs::read(path).await?; - journal_data.push(data); - } - - // Convert Vec> to Vec<&[u8]> - let journal_slices: Vec<&[u8]> = journal_data.iter().map(Vec::as_slice).collect(); - - Self::new(journal_slices) - } - - /// Get the range of timestamps available in the recovery utility. - /// - /// Returns (`min_timestamp`, `max_timestamp`) tuple representing the earliest and latest - /// timestamps that can be recovered. - #[must_use] - pub fn timestamp_range(&self) -> Option<(u64, u64)> { - if self.journals.is_empty() { - return None; - } - - let min = self.journals.first().map(|j| j.min_timestamp)?; - let max = self.journals.last().map(|j| j.max_timestamp)?; - Some((min, max)) - } - /// Recover the key-value state at a specific timestamp. /// /// This method replays all journal entries from all provided journals up to and including @@ -212,18 +132,15 @@ impl VersionedRecovery { ) -> anyhow::Result> { let mut map = AHashMap::new(); - // Find all journals that might contain entries up to target timestamp + // Replay journals up to and including the journal that was active at target_timestamp. + // A journal with rotation_timestamp T was the active journal for all timestamps <= T. for journal in &self.journals { - // Skip journals that start after our target - if journal.min_timestamp > target_timestamp { - break; - } - - // Replay entries from this journal + // Replay entries from this journal up to target_timestamp replay_journal_to_timestamp(&journal.data, target_timestamp, &mut map)?; - // If this journal contains the target timestamp, we're done - if journal.max_timestamp >= target_timestamp { + // If this journal was rotated at or after our target timestamp, we're done. + // This journal contains all state up to target_timestamp. + if journal.rotation_timestamp >= target_timestamp { break; } } @@ -250,83 +167,6 @@ impl VersionedRecovery { } } -/// Decompress journal data if it's zlib-compressed, otherwise return as-is. -/// -/// Detection: Checks for zlib magic bytes first (RFC 1950). If not present, validates -/// as uncompressed journal by checking format version. 
-fn decompress_if_needed(data: &[u8]) -> anyhow::Result> { - const HEADER_SIZE: usize = 16; - - // Check for zlib magic bytes first (RFC 1950) - // Zlib compressed data starts with 0x78 followed by a second byte where: - // - 0x01 (no/low compression) - // - 0x5E (also valid) - // - 0x9C (default compression) - // - 0xDA (best compression) - // The second byte's lower 5 bits are the window size, and bit 5 is the FDICT flag. - // We check that bit 5 (0x20) is not set for typical zlib streams without preset dictionary. - if data.len() >= 2 && data[0] == 0x78 && (data[1] & 0x20) == 0 { - // Looks like zlib compressed data - let mut decoder = ZlibDecoder::new(data); - let mut decompressed = Vec::new(); - decoder.read_to_end(&mut decompressed)?; - return Ok(decompressed); - } - - // Otherwise, treat as uncompressed and validate it's a proper journal - if data.len() >= HEADER_SIZE { - // Read format version (first 8 bytes as u64 little-endian) - let version_bytes: [u8; 8] = data[0 .. 8] - .try_into() - .map_err(|_| anyhow::anyhow!("Failed to read version bytes"))?; - let format_version = u64::from_le_bytes(version_bytes); - - // Check for known format versions - if format_version == 1 || format_version == 2 { - return Ok(data.to_vec()); - } - - anyhow::bail!("Invalid journal format version: {format_version}"); - } - - anyhow::bail!("Data too small to be valid journal (size: {})", data.len()) -} - -/// Extract the minimum/maximum timestamps from a journal. -/// -/// Returns (`min_timestamp`, `max_timestamp`). -/// These are computed from actual entry timestamps in the journal. -fn extract_timestamp_range(buffer: &[u8]) -> anyhow::Result<(u64, u64)> { - let array = read_bonjson_payload(buffer)?; - - let mut min_timestamp = u64::MAX; - let mut max_timestamp = 0; - - if let Value::Array(entries) = array { - // Process entries to find min/max timestamps (skip metadata at index 0) - for (index, entry) in entries.iter().enumerate() { - if index == 0 { - continue; // Skip metadata - } - - if let Value::Object(obj) = entry - && let Some(t) = read_u64_field(obj, "t") - { - min_timestamp = min_timestamp.min(t); - max_timestamp = max_timestamp.max(t); - } - } - } - - // If no entries found, default to (0, 0) - if min_timestamp == u64::MAX { - min_timestamp = 0; - max_timestamp = 0; - } - - Ok((min_timestamp, max_timestamp)) -} - /// Replay journal entries up to and including the target timestamp. /// /// This function processes all journal entries with timestamp ≤ `target_timestamp`. 
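
To make the revised constructor contract concrete, here is a minimal sketch of how a caller might drive `VersionedRecovery` after this change, assuming the crate-root re-export used elsewhere in this series; the `store.jrn.t300.zz` archive name and the literal rotation timestamp `300` are placeholders taken from the filename convention described above, and `u64::MAX` marks the still-active journal as the new doc comment specifies.

```rust
// A minimal sketch of the revised calling convention, assuming the crate-root
// re-export of VersionedRecovery. The archive name (store.jrn.t300.zz) and the
// literal rotation timestamp 300 are placeholders.
fn recover_example(
  archived: &[u8], // already-decompressed contents of e.g. store.jrn.t300.zz
  active: &[u8],   // contents of the still-active store.jrn
  target_timestamp: u64,
) -> anyhow::Result<()> {
  let recovery = bd_resilient_kv::VersionedRecovery::new(vec![
    (archived, 300),    // rotation timestamp taken from the archive filename
    (active, u64::MAX), // active journal has not been rotated yet
  ])?;

  let state = recovery.recover_at_timestamp(target_timestamp)?;
  for (key, tv) in &state {
    println!("{key} = {:?} (written at {})", tv.value, tv.timestamp);
  }
  Ok(())
}
```

Note that decompression is now the caller's job: archived journals must be inflated (e.g. with `flate2`'s `ZlibDecoder`, as the removed `decompress_if_needed` did) before the bytes are handed to `new`.
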
@@ -401,6 +241,10 @@ fn read_bonjson_payload(buffer: &[u8]) -> anyhow::Result { #[allow(clippy::cast_possible_truncation)] let position = u64::from_le_bytes(position_bytes) as usize; + if position < ARRAY_BEGIN { + anyhow::bail!("Invalid position: {position}, must be at least {ARRAY_BEGIN}"); + } + if position > buffer.len() { anyhow::bail!( "Invalid position: {position}, buffer size: {}", From cf732eeaa24675c8c2f76bd60e464bb8f4740a92 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 13:29:49 -0800 Subject: [PATCH 27/66] only consider snapshots in recovery --- .../src/tests/versioned_recovery_test.rs | 382 ++++++++++-------- bd-resilient-kv/src/versioned_kv_store.rs | 12 - bd-resilient-kv/src/versioned_recovery.rs | 102 ++--- 3 files changed, 264 insertions(+), 232 deletions(-) diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index fad0a9ef..463eb727 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -65,7 +65,7 @@ fn extract_rotation_timestamp(path: &std::path::Path) -> anyhow::Result { } #[tokio::test] -async fn test_recover_current_only_needs_last_journal() -> anyhow::Result<()> { +async fn test_recover_current_only_needs_last_snapshot() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; // Create a store with multiple rotations to build up history @@ -99,49 +99,44 @@ async fn test_recover_current_only_needs_last_journal() -> anyhow::Result<()> { .await?; store.remove("key2").await?; - store.sync()?; - - // Get expected current state using the store's hashmap - let expected_state = store.as_hashmap(); + // Final rotation to create snapshot with current state + store.rotate_journal().await?; - // Read ALL journals (archived + active) - let mut all_journals = Vec::new(); + // Read ALL archived snapshots + let archived_files = find_archived_journals(temp_dir.path())?; + let mut all_snapshots = Vec::new(); - for archived_path in find_archived_journals(temp_dir.path())? 
{ - let rotation_ts = extract_rotation_timestamp(&archived_path)?; - all_journals.push((std::fs::read(archived_path)?, rotation_ts)); + for archived_path in &archived_files { + let compressed_data = std::fs::read(archived_path)?; + let decompressed_data = decompress_zlib(&compressed_data)?; + let snapshot_ts = extract_rotation_timestamp(archived_path)?; + all_snapshots.push((decompressed_data, snapshot_ts)); } - // Read active journal (the last one) - use u64::MAX for active journal - let active_journal = std::fs::read(temp_dir.path().join("test.jrn"))?; - all_journals.push((active_journal.clone(), u64::MAX)); - - // Test 1: Verify recover_current() with ALL journals gives correct state - let all_journal_refs: Vec<(&[u8], u64)> = all_journals + // Test 1: Verify recover_current() with ALL snapshots gives correct state + let all_snapshot_refs: Vec<(&[u8], u64)> = all_snapshots .iter() .map(|(data, ts)| (data.as_slice(), *ts)) .collect(); - let recovery_all = VersionedRecovery::new(all_journal_refs)?; + let recovery_all = VersionedRecovery::new(all_snapshot_refs)?; let state_all = recovery_all.recover_current()?; - // Convert to comparable format (Value only, not TimestampedValue) - let state_all_values: ahash::AHashMap = - state_all.into_iter().map(|(k, tv)| (k, tv.value)).collect(); - - assert_eq!(state_all_values, expected_state); - - // Test 2: Verify recover_current() with ONLY the last journal gives the same state + // Test 2: Verify recover_current() with ONLY the last snapshot gives the same state // This is the optimization we want to prove works! - let recovery_last = VersionedRecovery::new(vec![(&active_journal, u64::MAX)])?; + let last_snapshot = &all_snapshots[all_snapshots.len() - 1]; + let recovery_last = VersionedRecovery::new(vec![(last_snapshot.0.as_slice(), last_snapshot.1)])?; let state_last = recovery_last.recover_current()?; + // Convert to comparable format (Value only, not TimestampedValue) + let state_all_values: ahash::AHashMap = + state_all.into_iter().map(|(k, tv)| (k, tv.value)).collect(); let state_last_values: ahash::AHashMap = state_last .into_iter() .map(|(k, tv)| (k, tv.value)) .collect(); - // The last journal alone should give us the same current state - assert_eq!(state_last_values, expected_state); + // The last snapshot alone should give us the same current state + assert_eq!(state_last_values, state_all_values); // Verify the expected final state has the right keys assert!(state_last_values.contains_key("key1")); @@ -172,6 +167,7 @@ async fn test_detection_compressed_journal() -> anyhow::Result<()> { let archived_files = find_archived_journals(temp_dir.path())?; let archived_path = archived_files.first().unwrap(); let compressed_data = std::fs::read(archived_path)?; + let snapshot_ts = extract_rotation_timestamp(archived_path)?; // Verify it starts with zlib magic bytes (0x78) assert_eq!( @@ -183,7 +179,7 @@ async fn test_detection_compressed_journal() -> anyhow::Result<()> { let decompressed_data = decompress_zlib(&compressed_data)?; // Should successfully recover from decompressed data - let recovery = VersionedRecovery::new(vec![(&decompressed_data, u64::MAX)])?; + let recovery = VersionedRecovery::new(vec![(&decompressed_data, snapshot_ts)])?; let state = recovery.recover_current()?; assert_eq!(state.len(), 1); assert_eq!( @@ -198,13 +194,18 @@ async fn test_detection_compressed_journal() -> anyhow::Result<()> { async fn test_detection_invalid_journal_data() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - // Create valid journal for mixed 
test + // Create valid snapshot for mixed test let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; - store.sync()?; - let valid_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + store.rotate_journal().await?; + + let archived_files = find_archived_journals(temp_dir.path())?; + let archived_path = archived_files.first().unwrap(); + let compressed_data = std::fs::read(archived_path)?; + let valid_data = decompress_zlib(&compressed_data)?; + let snapshot_ts = extract_rotation_timestamp(archived_path)?; // Test 1: Invalid format version // Since VersionedRecovery no longer validates during construction, @@ -212,7 +213,7 @@ async fn test_detection_invalid_journal_data() -> anyhow::Result<()> { let mut invalid_version = vec![0u8; 32]; let version_bytes = 999u64.to_le_bytes(); invalid_version[0 .. 8].copy_from_slice(&version_bytes); - let recovery = VersionedRecovery::new(vec![(&invalid_version, u64::MAX)])?; + let recovery = VersionedRecovery::new(vec![(&invalid_version, snapshot_ts)])?; let result = recovery.recover_current(); assert!( result.is_err(), @@ -221,7 +222,7 @@ async fn test_detection_invalid_journal_data() -> anyhow::Result<()> { // Test 2: Data too small (smaller than header) let small_data = vec![0u8; 8]; - let recovery = VersionedRecovery::new(vec![(&small_data, u64::MAX)])?; + let recovery = VersionedRecovery::new(vec![(&small_data, snapshot_ts)])?; let result = recovery.recover_current(); assert!( result.is_err(), @@ -230,7 +231,7 @@ async fn test_detection_invalid_journal_data() -> anyhow::Result<()> { // Test 3: Empty data let empty_data = vec![]; - let recovery = VersionedRecovery::new(vec![(&empty_data, u64::MAX)])?; + let recovery = VersionedRecovery::new(vec![(&empty_data, snapshot_ts)])?; let result = recovery.recover_current(); assert!( result.is_err(), @@ -241,7 +242,7 @@ async fn test_detection_invalid_journal_data() -> anyhow::Result<()> { // If caller accidentally passes compressed data, it will fail during recovery let mut fake_zlib = vec![0x78, 0x9C]; // Valid zlib magic bytes fake_zlib.extend_from_slice(&[0xFF; 100]); // But garbage data - let recovery = VersionedRecovery::new(vec![(&fake_zlib, u64::MAX)])?; + let recovery = VersionedRecovery::new(vec![(&fake_zlib, snapshot_ts)])?; let result = recovery.recover_current(); assert!( result.is_err(), @@ -250,32 +251,38 @@ async fn test_detection_invalid_journal_data() -> anyhow::Result<()> { // Test 5: Random garbage let garbage = vec![0xAB, 0xCD, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x90]; - let recovery = VersionedRecovery::new(vec![(&garbage, u64::MAX)])?; + let recovery = VersionedRecovery::new(vec![(&garbage, snapshot_ts)])?; let result = recovery.recover_current(); assert!( result.is_err(), "Should fail when recovering with garbage data" ); - // Test 6: Mixed valid and invalid journals - // When the last journal is invalid, recover_current() should fail + // Test 6: Mixed valid and invalid snapshots + // When the last snapshot is invalid, recover_current() should fail let mut invalid_mixed = vec![0u8; 32]; let version_bytes = 999u64.to_le_bytes(); invalid_mixed[0 .. 
8].copy_from_slice(&version_bytes); - let recovery = VersionedRecovery::new(vec![(&valid_data, u64::MAX), (&invalid_mixed, u64::MAX)])?; + let recovery = VersionedRecovery::new(vec![ + (&valid_data, snapshot_ts), + (&invalid_mixed, snapshot_ts + 1000), + ])?; let result = recovery.recover_current(); assert!( result.is_err(), - "Should fail because last journal is invalid" + "Should fail because last snapshot is invalid" ); - // Test 7: Mixed invalid and valid journals - // When the last journal is valid, recover_current() should succeed - let recovery = VersionedRecovery::new(vec![(&invalid_mixed, u64::MAX), (&valid_data, u64::MAX)])?; + // Test 7: Mixed invalid and valid snapshots + // When the last snapshot is valid, recover_current() should succeed + let recovery = VersionedRecovery::new(vec![ + (&invalid_mixed, snapshot_ts), + (&valid_data, snapshot_ts + 1000), + ])?; let result = recovery.recover_current(); assert!( result.is_ok(), - "Should succeed because last journal is valid" + "Should succeed because last snapshot is valid" ); Ok(()) @@ -308,7 +315,8 @@ fn test_detection_zlib_compression_level_5() { let decompressed = decompress_zlib(&compressed).unwrap(); // Should be able to process the decompressed data - let result = VersionedRecovery::new(vec![(&decompressed, u64::MAX)]); + // Using arbitrary snapshot timestamp since this is synthetic test data + let result = VersionedRecovery::new(vec![(&decompressed, 1000)]); // May succeed or fail depending on whether the data is valid bonjson, // but should at least attempt to parse without panicking let _ = result; @@ -362,18 +370,19 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { .unwrap(); store.sync()?; - // Read all journal files + // Rotate to create final snapshot + store.rotate_journal().await?; + + // Read all snapshots (archived journals) let mut all_journals = Vec::new(); - // Read archived journals for archived_path in find_archived_journals(temp_dir.path())? 
{ let rotation_ts = extract_rotation_timestamp(&archived_path)?; - all_journals.push((std::fs::read(archived_path)?, rotation_ts)); + let compressed_data = std::fs::read(&archived_path)?; + let decompressed_data = decompress_zlib(&compressed_data)?; + all_journals.push((decompressed_data, rotation_ts)); } - // Read active journal - all_journals.push((std::fs::read(temp_dir.path().join("test.jrn"))?, u64::MAX)); - // Create recovery utility with all journals let journal_refs: Vec<(&[u8], u64)> = all_journals .iter() @@ -409,11 +418,20 @@ async fn test_recovery_empty_journal() -> anyhow::Result<()> { // Create an empty store - let store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; store.sync()?; - let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - let recovery = VersionedRecovery::new(vec![(&journal_data, u64::MAX)])?; + // Rotate to create snapshot + store.rotate_journal().await?; + + // Read the snapshot + let archived_files = find_archived_journals(temp_dir.path())?; + assert_eq!(archived_files.len(), 1); + let compressed_data = std::fs::read(&archived_files[0])?; + let decompressed_data = decompress_zlib(&compressed_data)?; + let snapshot_ts = extract_rotation_timestamp(&archived_files[0])?; + + let recovery = VersionedRecovery::new(vec![(&decompressed_data, snapshot_ts)])?; // Recovering current should return empty map let state = recovery.recover_current()?; @@ -452,8 +470,17 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { store.sync()?; - let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - let recovery = VersionedRecovery::new(vec![(&journal_data, u64::MAX)])?; + // Rotate to create snapshot + store.rotate_journal().await?; + + // Read the snapshot + let archived_files = find_archived_journals(temp_dir.path())?; + assert_eq!(archived_files.len(), 1); + let compressed_data = std::fs::read(&archived_files[0])?; + let decompressed_data = decompress_zlib(&compressed_data)?; + let snapshot_ts = extract_rotation_timestamp(&archived_files[0])?; + + let recovery = VersionedRecovery::new(vec![(&decompressed_data, snapshot_ts)])?; // Each timestamp should show the value at that time let state_ts1 = recovery.recover_at_timestamp(ts1)?; @@ -495,8 +522,17 @@ async fn test_recovery_various_value_types() -> anyhow::Result<()> { store.insert("bool".to_string(), Value::Bool(true)).await?; store.sync()?; - let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - let recovery = VersionedRecovery::new(vec![(&journal_data, u64::MAX)])?; + // Rotate to create snapshot + store.rotate_journal().await?; + + // Read the snapshot + let archived_files = find_archived_journals(temp_dir.path())?; + assert_eq!(archived_files.len(), 1); + let compressed_data = std::fs::read(&archived_files[0])?; + let decompressed_data = decompress_zlib(&compressed_data)?; + let snapshot_ts = extract_rotation_timestamp(&archived_files[0])?; + + let recovery = VersionedRecovery::new(vec![(&decompressed_data, snapshot_ts)])?; let state = recovery.recover_current()?; assert_eq!(state.len(), 4); @@ -563,28 +599,29 @@ async fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { store.sync()?; - // Find the compressed archive - let archived_files = find_archived_journals(temp_dir.path())?; - let archived_path = archived_files.first().unwrap(); - assert!(archived_path.exists(), "Compressed archive should exist"); - - // Read both journals - let compressed_data 
= std::fs::read(archived_path)?; - let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - - // Decompress the archived journal manually - let decompressed_data = decompress_zlib(&compressed_data)?; + // Rotate again to create final snapshot + store.rotate_journal().await?; - // Extract rotation timestamp from archived filename - let rotation_ts = extract_rotation_timestamp(archived_path)?; + // Read all snapshots + let archived_files = find_archived_journals(temp_dir.path())?; + assert_eq!(archived_files.len(), 2, "Should have two snapshots"); + + let mut all_snapshots = Vec::new(); + for archived_path in &archived_files { + let compressed_data = std::fs::read(archived_path)?; + let decompressed_data = decompress_zlib(&compressed_data)?; + let rotation_ts = extract_rotation_timestamp(archived_path)?; + all_snapshots.push((decompressed_data, rotation_ts)); + } - // Create recovery from both journals (decompressed first, then active) - let recovery = VersionedRecovery::new(vec![ - (&decompressed_data, rotation_ts), - (&active_data, u64::MAX), - ])?; + // Create recovery from all snapshots + let snapshot_refs: Vec<(&[u8], u64)> = all_snapshots + .iter() + .map(|(data, ts)| (data.as_slice(), *ts)) + .collect(); + let recovery = VersionedRecovery::new(snapshot_refs)?; - // Recover at ts1 (should be in compressed archive) + // Recover at ts1 (should be in first snapshot) let state_ts1 = recovery.recover_at_timestamp(ts1)?; assert_eq!(state_ts1.len(), 1); assert_eq!( @@ -592,11 +629,11 @@ async fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { Some(&Value::String("value1".to_string())) ); - // Recover at ts2 (should be in compressed archive) + // Recover at ts2 (should be in first snapshot) let state_ts2 = recovery.recover_at_timestamp(ts2)?; assert_eq!(state_ts2.len(), 2); - // Recover at ts3 (should include data from both archives and active journal) + // Recover at ts3 (should include data from both snapshots) let state_ts3 = recovery.recover_at_timestamp(ts3)?; assert_eq!(state_ts3.len(), 3); assert_eq!( @@ -648,30 +685,27 @@ async fn test_recovery_from_multiple_compressed_archives() -> anyhow::Result<()> store.sync()?; - // Collect all journal data (2 compressed + 1 active) - let archived_files = find_archived_journals(temp_dir.path())?; - - let archive1_path = &archived_files[0]; - let archive2_path = &archived_files[1]; - - let archive1_data = std::fs::read(archive1_path)?; - let archive2_data = std::fs::read(archive2_path)?; - let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - - // Decompress archived journals manually - let decompressed1 = decompress_zlib(&archive1_data)?; - let decompressed2 = decompress_zlib(&archive2_data)?; + // Rotate again to create final snapshot + store.rotate_journal().await?; - // Extract rotation timestamps - let archive1_ts = extract_rotation_timestamp(archive1_path)?; - let archive2_ts = extract_rotation_timestamp(archive2_path)?; + // Collect all snapshots (should have 3) + let archived_files = find_archived_journals(temp_dir.path())?; + assert_eq!(archived_files.len(), 3, "Should have three snapshots"); + + let mut all_snapshots = Vec::new(); + for archived_path in &archived_files { + let compressed_data = std::fs::read(archived_path)?; + let decompressed_data = decompress_zlib(&compressed_data)?; + let rotation_ts = extract_rotation_timestamp(archived_path)?; + all_snapshots.push((decompressed_data, rotation_ts)); + } - // Create recovery from all journals (all decompressed) - let recovery = 
VersionedRecovery::new(vec![ - (&decompressed1, archive1_ts), - (&decompressed2, archive2_ts), - (&active_data, u64::MAX), - ])?; + // Create recovery from all snapshots + let snapshot_refs: Vec<(&[u8], u64)> = all_snapshots + .iter() + .map(|(data, ts)| (data.as_slice(), *ts)) + .collect(); + let recovery = VersionedRecovery::new(snapshot_refs)?; // Verify we can recover at any timestamp let state_ts1 = recovery.recover_at_timestamp(ts1)?; @@ -705,11 +739,6 @@ async fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> store.sync()?; store.rotate_journal().await?; - // Get compressed archive - let archived_files = find_archived_journals(temp_dir.path())?; - let compressed_archive_path = archived_files.first().unwrap(); - let compressed_data = std::fs::read(compressed_archive_path)?; - // Create uncompressed journal data manually let mut uncompressed_store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; uncompressed_store @@ -720,19 +749,28 @@ async fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> .map(|tv| tv.timestamp) .unwrap(); uncompressed_store.sync()?; - let uncompressed_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - // Extract rotation timestamp from compressed archive - let compressed_ts = extract_rotation_timestamp(compressed_archive_path)?; + // Rotate to create second snapshot + uncompressed_store.rotate_journal().await?; - // Decompress the compressed archive manually - let decompressed_data = decompress_zlib(&compressed_data)?; + // Get all snapshots + let archived_files = find_archived_journals(temp_dir.path())?; + assert_eq!(archived_files.len(), 2, "Should have two snapshots"); + + let mut all_snapshots = Vec::new(); + for archived_path in &archived_files { + let compressed_data = std::fs::read(archived_path)?; + let decompressed_data = decompress_zlib(&compressed_data)?; + let rotation_ts = extract_rotation_timestamp(archived_path)?; + all_snapshots.push((decompressed_data, rotation_ts)); + } - // Recovery now requires both journals to be decompressed - let recovery = VersionedRecovery::new(vec![ - (&decompressed_data, compressed_ts), - (&uncompressed_data, u64::MAX), - ])?; + // Create recovery from all snapshots + let snapshot_refs: Vec<(&[u8], u64)> = all_snapshots + .iter() + .map(|(data, ts)| (data.as_slice(), *ts)) + .collect(); + let recovery = VersionedRecovery::new(snapshot_refs)?; let state_final = recovery.recover_at_timestamp(ts2)?; assert_eq!(state_final.len(), 2); @@ -781,11 +819,18 @@ async fn test_recovery_at_timestamp() -> anyhow::Result<()> { store.sync()?; - // Read the journal data - let journal_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + // Rotate to create snapshot + store.rotate_journal().await?; + + // Read the snapshot + let archived_files = find_archived_journals(temp_dir.path())?; + assert_eq!(archived_files.len(), 1); + let compressed_data = std::fs::read(&archived_files[0])?; + let decompressed_data = decompress_zlib(&compressed_data)?; + let snapshot_ts = extract_rotation_timestamp(&archived_files[0])?; // Create recovery utility - let recovery = VersionedRecovery::new(vec![(&journal_data, u64::MAX)])?; + let recovery = VersionedRecovery::new(vec![(&decompressed_data, snapshot_ts)])?; // Recover at ts1: should have only key1=value1 let state_ts1 = recovery.recover_at_timestamp(ts1)?; @@ -848,7 +893,6 @@ async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { .unwrap(); // Rotate journal - let rotation_ts = ts2; 
store.rotate_journal().await?; std::thread::sleep(std::time::Duration::from_millis(10)); @@ -864,34 +908,40 @@ async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { store.sync()?; - // Read both journals - let archived_path = temp_dir - .path() - .join(format!("test.jrn.t{}.zz", rotation_ts)); - let archived_data = std::fs::read(&archived_path)?; - let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + // Rotate again to create final snapshot + store.rotate_journal().await?; - // Decompress the archived journal manually - let decompressed_archived = decompress_zlib(&archived_data)?; + // Read all snapshots + let archived_files = find_archived_journals(temp_dir.path())?; + assert_eq!(archived_files.len(), 2, "Should have two snapshots"); + + let mut all_snapshots = Vec::new(); + for archived_path in &archived_files { + let compressed_data = std::fs::read(archived_path)?; + let decompressed_data = decompress_zlib(&compressed_data)?; + let rotation_ts = extract_rotation_timestamp(archived_path)?; + all_snapshots.push((decompressed_data, rotation_ts)); + } - // Create recovery from both journals - let recovery = VersionedRecovery::new(vec![ - (&decompressed_archived, rotation_ts), - (&active_data, u64::MAX), - ])?; + // Create recovery from all snapshots + let snapshot_refs: Vec<(&[u8], u64)> = all_snapshots + .iter() + .map(|(data, ts)| (data.as_slice(), *ts)) + .collect(); + let recovery = VersionedRecovery::new(snapshot_refs)?; - // Verify we can recover at any timestamp across both journals + // Verify we can recover at any timestamp across all snapshots let state_ts1 = recovery.recover_at_timestamp(ts1)?; assert_eq!(state_ts1.len(), 1); assert!(state_ts1.contains_key("key1")); - // Recover at ts2 (should be in archived journal) + // Recover at ts2 (should be in first snapshot) let state_ts2 = recovery.recover_at_timestamp(ts2)?; assert_eq!(state_ts2.len(), 2); assert!(state_ts2.contains_key("key1")); assert!(state_ts2.contains_key("key2")); - // Recover at ts3 (should include all data) + // Recover at ts3 (should include all data from both snapshots) let state_ts3 = recovery.recover_at_timestamp(ts3)?; assert_eq!(state_ts3.len(), 3); assert!(state_ts3.contains_key("key1")); @@ -918,40 +968,24 @@ async fn test_recovery_decompression_transparent() -> anyhow::Result<()> { .unwrap(); store.sync()?; - // Create uncompressed recovery baseline - let uncompressed_data = std::fs::read(temp_dir.path().join("test.jrn"))?; - let recovery_uncompressed = VersionedRecovery::new(vec![(&uncompressed_data, u64::MAX)])?; - let state_uncompressed = recovery_uncompressed.recover_at_timestamp(ts1)?; - - // Rotate to compress + // Rotate to create first snapshot store.rotate_journal().await?; - // Read compressed archive + // Read the first snapshot let archived_files = find_archived_journals(temp_dir.path())?; - let compressed_path = archived_files.first().unwrap(); - let compressed_data = std::fs::read(compressed_path)?; - - // Extract rotation timestamp from compressed archive filename - let compressed_ts = extract_rotation_timestamp(compressed_path)?; - - // Verify it's actually compressed (smaller) - assert!(compressed_data.len() < uncompressed_data.len()); + let first_snapshot_path = archived_files.first().unwrap(); + let compressed_data = std::fs::read(first_snapshot_path)?; + let snapshot_ts = extract_rotation_timestamp(first_snapshot_path)?; - // Decompress the archived journal manually + // Decompress the snapshot data (VersionedRecovery requires uncompressed data) 
let decompressed_data = decompress_zlib(&compressed_data)?; + let recovery = VersionedRecovery::new(vec![(&decompressed_data, snapshot_ts)])?; + let state = recovery.recover_at_timestamp(ts1)?; - // Create recovery from decompressed data - let recovery_compressed = VersionedRecovery::new(vec![(&decompressed_data, compressed_ts)])?; - let state_compressed = recovery_compressed.recover_at_timestamp(ts1)?; - - // Both should produce identical results - assert_eq!(state_uncompressed.len(), state_compressed.len()); - assert_eq!( - state_uncompressed.get("data").map(|tv| &tv.value), - state_compressed.get("data").map(|tv| &tv.value) - ); + // Verify recovery works correctly with decompressed data + assert_eq!(state.len(), 1); assert_eq!( - state_uncompressed.get("data").map(|tv| &tv.value), + state.get("data").map(|tv| &tv.value), Some(&Value::String(compressible)) ); @@ -982,19 +1016,25 @@ async fn test_journal_ordering_requirement() -> anyhow::Result<()> { .await?; store.sync()?; - // Read both journals (archived + active) + // Rotate again to create second snapshot + store.rotate_journal().await?; + + // Read both snapshots let archived_files = find_archived_journals(temp_dir.path())?; + assert_eq!(archived_files.len(), 2, "Should have 2 archived snapshots"); - let archived_data = std::fs::read(archived_files.first().unwrap())?; - let active_data = std::fs::read(temp_dir.path().join("test.jrn"))?; + let first_snapshot_data = std::fs::read(&archived_files[0])?; + let first_snapshot_ts = extract_rotation_timestamp(&archived_files[0])?; + let decompressed_first = decompress_zlib(&first_snapshot_data)?; - // Extract rotation timestamp from archived filename - let archived_ts = extract_rotation_timestamp(archived_files.first().unwrap())?; + let second_snapshot_data = std::fs::read(&archived_files[1])?; + let second_snapshot_ts = extract_rotation_timestamp(&archived_files[1])?; + let decompressed_second = decompress_zlib(&second_snapshot_data)?; - // Should succeed when journals are in correct chronological order (archived, then active) + // Should succeed when journals are in correct chronological order (oldest first) let recovery = VersionedRecovery::new(vec![ - (&archived_data, archived_ts), - (&active_data, u64::MAX), + (&decompressed_first, first_snapshot_ts), + (&decompressed_second, second_snapshot_ts), ]); assert!(recovery.is_ok(), "Should succeed with correct ordering"); diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index a7bc971b..37ea8102 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -76,18 +76,6 @@ async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result /// /// For detailed information about timestamp semantics, recovery bucketing, and invariants, /// see the `VersionedRecovery` documentation. 
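
The stale `# Example` block removed just below showed a synchronous API; a rough async equivalent, mirroring the usage in this series' tests, might look like the following sketch. The directory path, store name, and 4096-byte size are placeholders, and a Tokio runtime is assumed.

```rust
// A rough async replacement for the doc example removed just below, based on
// the usage in this series' tests. Path, store name, and buffer size are
// placeholders; a Tokio runtime is assumed.
use bd_bonjson::Value;
use bd_resilient_kv::VersionedKVStore;

async fn kv_store_example() -> anyhow::Result<()> {
  let mut store = VersionedKVStore::new(std::path::Path::new("/tmp/kv"), "mystore", 4096, None)?;

  // Writes are async and each carries a timestamp.
  store
    .insert("key1".to_string(), Value::String("value1".to_string()))
    .await?;
  store.remove("key1").await?;

  // Flush pending writes, then rotate to archive the journal as a snapshot.
  store.sync()?;
  store.rotate_journal().await?;
  Ok(())
}
```
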
-/// -/// # Example -/// ```ignore -/// use bd_resilient_kv::VersionedKVStore; -/// use bd_bonjson::Value; -/// -/// let mut store = VersionedKVStore::new("/path/to/dir", "mystore", 1024 * 1024, None)?; -/// -/// // Insert with timestamp tracking -/// let t1 = store.insert("key1".to_string(), Value::from(42))?; -/// let t2 = store.insert("key2".to_string(), Value::from("hello"))?; -/// ``` pub struct VersionedKVStore { journal: MemMappedVersionedKVJournal, cached_map: AHashMap, diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_recovery.rs index b6974d39..e45df5a4 100644 --- a/bd-resilient-kv/src/versioned_recovery.rs +++ b/bd-resilient-kv/src/versioned_recovery.rs @@ -30,75 +30,77 @@ fn read_u64_field(obj: &AHashMap, key: &str) -> Option { } } -/// A utility for recovering state at arbitrary timestamps from raw journal data. +/// A utility for recovering state at arbitrary timestamps from journal snapshots. /// -/// This utility operates on raw uncompressed byte slices from timestamped journals and can -/// reconstruct the key-value state at any historical timestamp by replaying journal entries. +/// This utility operates on raw uncompressed byte slices from archived journal snapshots +/// (created during rotation) and can reconstruct the key-value state at any historical +/// timestamp by replaying journal entries. /// /// # Recovery Model /// -/// Recovery works by replaying journal entries in chronological order up to the target timestamp. -/// When journals are rotated, compacted entries preserve their original timestamps, which means -/// entry timestamps may overlap across adjacent journal snapshots. Recovery handles this correctly -/// by replaying journals sequentially and applying entries in timestamp order. +/// Recovery works exclusively with journal snapshots - complete archived journals created +/// during rotation. Each snapshot contains the full compacted state at the time of rotation, +/// with all entries preserving their original timestamps. +/// +/// Recovery replays snapshot entries in chronological order up to the target timestamp. +/// Since entry timestamps may overlap across adjacent snapshots, recovery handles this by +/// replaying snapshots sequentially and applying entries in timestamp order. /// /// ## Optimization /// -/// To recover the current state, only the last journal needs to be read since rotation writes -/// the complete compacted state with original timestamps preserved. For historical timestamp -/// recovery, the utility automatically identifies and replays only the necessary journals. +/// To recover the current state, only the last snapshot needs to be read since each snapshot +/// contains the complete compacted state at rotation time. For historical timestamp recovery, +/// the utility automatically identifies and replays only the necessary snapshots. /// -/// **Note:** Callers are responsible for decompressing journal data if needed before passing +/// **Note:** Callers are responsible for decompressing snapshot data if needed before passing /// it to this utility. #[derive(Debug)] pub struct VersionedRecovery { - journals: Vec, + snapshots: Vec, } #[derive(Debug)] -struct JournalInfo { +struct SnapshotInfo { data: Vec, - rotation_timestamp: u64, + snapshot_timestamp: u64, } impl VersionedRecovery { - /// Create a new recovery utility from a list of uncompressed journal byte slices with rotation - /// timestamps. + /// Create a new recovery utility from a list of uncompressed snapshot byte slices. 
/// - /// The journals should be provided in chronological order (oldest to newest). - /// Each journal must be a valid uncompressed versioned journal (VERSION 2 format). + /// The snapshots should be provided in chronological order (oldest to newest). + /// Each snapshot must be a valid uncompressed versioned journal (VERSION 2 format). /// /// # Arguments /// - /// * `journals` - A vector of tuples containing (`journal_data`, `rotation_timestamp`). The - /// `rotation_timestamp` represents when this journal was archived (the snapshot boundary). For - /// the active journal (not yet rotated), use `u64::MAX`. + /// * `snapshots` - A vector of tuples containing (`snapshot_data`, `snapshot_timestamp`). The + /// `snapshot_timestamp` represents when this snapshot was created (archived during rotation). /// /// # Errors /// - /// Returns an error if any journal is invalid or cannot be parsed. + /// Returns an error if any snapshot is invalid or cannot be parsed. /// /// # Note /// - /// Callers must decompress journal data before passing it to this method if the data + /// Callers must decompress snapshot data before passing it to this method if the data /// is compressed (e.g., with zlib). - pub fn new(journals: Vec<(&[u8], u64)>) -> anyhow::Result { - let journal_infos = journals + pub fn new(snapshots: Vec<(&[u8], u64)>) -> anyhow::Result { + let snapshot_infos = snapshots .into_iter() - .map(|(data, rotation_timestamp)| JournalInfo { + .map(|(data, snapshot_timestamp)| SnapshotInfo { data: data.to_vec(), - rotation_timestamp, + snapshot_timestamp, }) .collect(); Ok(Self { - journals: journal_infos, + snapshots: snapshot_infos, }) } /// Recover the key-value state at a specific timestamp. /// - /// This method replays all journal entries from all provided journals up to and including + /// This method replays all snapshot entries from all provided snapshots up to and including /// the target timestamp, reconstructing the exact state at that point in time. /// /// ## Important: "Up to and including" semantics @@ -124,23 +126,23 @@ impl VersionedRecovery { /// # Errors /// /// Returns an error if: - /// - The target timestamp is not found in any journal - /// - Journal data is corrupted or invalid + /// - The target timestamp is not found in any snapshot + /// - Snapshot data is corrupted or invalid pub fn recover_at_timestamp( &self, target_timestamp: u64, ) -> anyhow::Result> { let mut map = AHashMap::new(); - // Replay journals up to and including the journal that was active at target_timestamp. - // A journal with rotation_timestamp T was the active journal for all timestamps <= T. - for journal in &self.journals { - // Replay entries from this journal up to target_timestamp - replay_journal_to_timestamp(&journal.data, target_timestamp, &mut map)?; + // Replay snapshots up to and including the snapshot that was created at or after + // target_timestamp. A snapshot with snapshot_timestamp T contains all state up to time T. + for snapshot in &self.snapshots { + // Replay entries from this snapshot up to target_timestamp + replay_journal_to_timestamp(&snapshot.data, target_timestamp, &mut map)?; - // If this journal was rotated at or after our target timestamp, we're done. - // This journal contains all state up to target_timestamp. - if journal.rotation_timestamp >= target_timestamp { + // If this snapshot was created at or after our target timestamp, we're done. + // This snapshot contains all state up to target_timestamp. 
+ if snapshot.snapshot_timestamp >= target_timestamp { break; } } @@ -148,28 +150,30 @@ impl VersionedRecovery { Ok(map) } - /// Get the current state (at the latest timestamp). + /// Get the current state from the latest snapshot. + /// + /// Since each snapshot contains the complete compacted state at rotation time, + /// only the last snapshot needs to be read to get the current state. /// /// # Errors /// - /// Returns an error if journal data is corrupted or invalid. + /// Returns an error if snapshot data is corrupted or invalid. pub fn recover_current(&self) -> anyhow::Result> { let mut map = AHashMap::new(); - // Optimization: Only read the last journal since journal rotation writes - // the complete state at the snapshot timestamp, so the last journal contains - // all current state. - if let Some(last_journal) = self.journals.last() { - replay_journal_to_timestamp(&last_journal.data, u64::MAX, &mut map)?; + // Optimization: Only read the last snapshot since rotation writes the complete + // compacted state, so the last snapshot contains all current state. + if let Some(last_snapshot) = self.snapshots.last() { + replay_journal_to_timestamp(&last_snapshot.data, u64::MAX, &mut map)?; } Ok(map) } } -/// Replay journal entries up to and including the target timestamp. +/// Replay snapshot entries up to and including the target timestamp. /// -/// This function processes all journal entries with timestamp ≤ `target_timestamp`. +/// This function processes all entries with timestamp ≤ `target_timestamp`. /// The "up to and including" behavior is essential because timestamps are monotonically /// non-decreasing (not strictly increasing): if the system clock doesn't advance between /// writes, multiple entries may share the same timestamp. All such entries must be @@ -225,7 +229,7 @@ fn replay_journal_to_timestamp( Ok(()) } -/// Read the bonjson payload from a journal buffer. +/// Read the bonjson payload from a snapshot buffer. 
fn read_bonjson_payload(buffer: &[u8]) -> anyhow::Result { const HEADER_SIZE: usize = 16; const ARRAY_BEGIN: usize = 16; From 747ea7f9ba9c199c5f76cdc7744315117800c838 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 13:57:41 -0800 Subject: [PATCH 28/66] remove tests we don't need --- .../src/tests/versioned_recovery_test.rs | 585 ------------------ bd-resilient-kv/src/versioned_recovery.rs | 108 ++-- 2 files changed, 56 insertions(+), 637 deletions(-) diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index 463eb727..b2f2bc75 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -64,265 +64,6 @@ fn extract_rotation_timestamp(path: &std::path::Path) -> anyhow::Result { Ok(timestamp) } -#[tokio::test] -async fn test_recover_current_only_needs_last_snapshot() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - // Create a store with multiple rotations to build up history - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - - // Add initial data - store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - store - .insert("key2".to_string(), Value::String("value2".to_string())) - .await?; - - // First rotation - store.rotate_journal().await?; - - // Update key1 and add key3 - store - .insert("key1".to_string(), Value::String("updated1".to_string())) - .await?; - store - .insert("key3".to_string(), Value::String("value3".to_string())) - .await?; - - // Second rotation - store.rotate_journal().await?; - - // Add more data and delete key2 - store - .insert("key4".to_string(), Value::String("value4".to_string())) - .await?; - store.remove("key2").await?; - - // Final rotation to create snapshot with current state - store.rotate_journal().await?; - - // Read ALL archived snapshots - let archived_files = find_archived_journals(temp_dir.path())?; - let mut all_snapshots = Vec::new(); - - for archived_path in &archived_files { - let compressed_data = std::fs::read(archived_path)?; - let decompressed_data = decompress_zlib(&compressed_data)?; - let snapshot_ts = extract_rotation_timestamp(archived_path)?; - all_snapshots.push((decompressed_data, snapshot_ts)); - } - - // Test 1: Verify recover_current() with ALL snapshots gives correct state - let all_snapshot_refs: Vec<(&[u8], u64)> = all_snapshots - .iter() - .map(|(data, ts)| (data.as_slice(), *ts)) - .collect(); - let recovery_all = VersionedRecovery::new(all_snapshot_refs)?; - let state_all = recovery_all.recover_current()?; - - // Test 2: Verify recover_current() with ONLY the last snapshot gives the same state - // This is the optimization we want to prove works! 
- let last_snapshot = &all_snapshots[all_snapshots.len() - 1]; - let recovery_last = VersionedRecovery::new(vec![(last_snapshot.0.as_slice(), last_snapshot.1)])?; - let state_last = recovery_last.recover_current()?; - - // Convert to comparable format (Value only, not TimestampedValue) - let state_all_values: ahash::AHashMap = - state_all.into_iter().map(|(k, tv)| (k, tv.value)).collect(); - let state_last_values: ahash::AHashMap = state_last - .into_iter() - .map(|(k, tv)| (k, tv.value)) - .collect(); - - // The last snapshot alone should give us the same current state - assert_eq!(state_last_values, state_all_values); - - // Verify the expected final state has the right keys - assert!(state_last_values.contains_key("key1")); - assert!(!state_last_values.contains_key("key2")); // deleted - assert!(state_last_values.contains_key("key3")); - assert!(state_last_values.contains_key("key4")); - assert_eq!( - state_last_values.get("key1"), - Some(&Value::String("updated1".to_string())) - ); - - Ok(()) -} - -#[tokio::test] -async fn test_detection_compressed_journal() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - // Create and rotate to get compressed archive - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - store.rotate_journal().await?; - - // Find the archived file - let archived_files = find_archived_journals(temp_dir.path())?; - let archived_path = archived_files.first().unwrap(); - let compressed_data = std::fs::read(archived_path)?; - let snapshot_ts = extract_rotation_timestamp(archived_path)?; - - // Verify it starts with zlib magic bytes (0x78) - assert_eq!( - compressed_data[0], 0x78, - "Compressed data should start with zlib magic byte" - ); - - // Decompress manually since VersionedRecovery no longer handles compression - let decompressed_data = decompress_zlib(&compressed_data)?; - - // Should successfully recover from decompressed data - let recovery = VersionedRecovery::new(vec![(&decompressed_data, snapshot_ts)])?; - let state = recovery.recover_current()?; - assert_eq!(state.len(), 1); - assert_eq!( - state.get("key1").map(|tv| &tv.value), - Some(&Value::String("value1".to_string())) - ); - - Ok(()) -} - -#[tokio::test] -async fn test_detection_invalid_journal_data() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - // Create valid snapshot for mixed test - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - store.rotate_journal().await?; - - let archived_files = find_archived_journals(temp_dir.path())?; - let archived_path = archived_files.first().unwrap(); - let compressed_data = std::fs::read(archived_path)?; - let valid_data = decompress_zlib(&compressed_data)?; - let snapshot_ts = extract_rotation_timestamp(archived_path)?; - - // Test 1: Invalid format version - // Since VersionedRecovery no longer validates during construction, - // errors will occur when trying to recover data - let mut invalid_version = vec![0u8; 32]; - let version_bytes = 999u64.to_le_bytes(); - invalid_version[0 .. 
8].copy_from_slice(&version_bytes); - let recovery = VersionedRecovery::new(vec![(&invalid_version, snapshot_ts)])?; - let result = recovery.recover_current(); - assert!( - result.is_err(), - "Should fail when recovering with invalid version" - ); - - // Test 2: Data too small (smaller than header) - let small_data = vec![0u8; 8]; - let recovery = VersionedRecovery::new(vec![(&small_data, snapshot_ts)])?; - let result = recovery.recover_current(); - assert!( - result.is_err(), - "Should fail when recovering with data too small" - ); - - // Test 3: Empty data - let empty_data = vec![]; - let recovery = VersionedRecovery::new(vec![(&empty_data, snapshot_ts)])?; - let result = recovery.recover_current(); - assert!( - result.is_err(), - "Should fail when recovering with empty data" - ); - - // Test 4: Corrupted zlib data (caller should decompress before passing) - // If caller accidentally passes compressed data, it will fail during recovery - let mut fake_zlib = vec![0x78, 0x9C]; // Valid zlib magic bytes - fake_zlib.extend_from_slice(&[0xFF; 100]); // But garbage data - let recovery = VersionedRecovery::new(vec![(&fake_zlib, snapshot_ts)])?; - let result = recovery.recover_current(); - assert!( - result.is_err(), - "Should fail when recovering with compressed data" - ); - - // Test 5: Random garbage - let garbage = vec![0xAB, 0xCD, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x90]; - let recovery = VersionedRecovery::new(vec![(&garbage, snapshot_ts)])?; - let result = recovery.recover_current(); - assert!( - result.is_err(), - "Should fail when recovering with garbage data" - ); - - // Test 6: Mixed valid and invalid snapshots - // When the last snapshot is invalid, recover_current() should fail - let mut invalid_mixed = vec![0u8; 32]; - let version_bytes = 999u64.to_le_bytes(); - invalid_mixed[0 .. 8].copy_from_slice(&version_bytes); - let recovery = VersionedRecovery::new(vec![ - (&valid_data, snapshot_ts), - (&invalid_mixed, snapshot_ts + 1000), - ])?; - let result = recovery.recover_current(); - assert!( - result.is_err(), - "Should fail because last snapshot is invalid" - ); - - // Test 7: Mixed invalid and valid snapshots - // When the last snapshot is valid, recover_current() should succeed - let recovery = VersionedRecovery::new(vec![ - (&invalid_mixed, snapshot_ts), - (&valid_data, snapshot_ts + 1000), - ])?; - let result = recovery.recover_current(); - assert!( - result.is_ok(), - "Should succeed because last snapshot is valid" - ); - - Ok(()) -} - -#[test] -fn test_detection_zlib_compression_level_5() { - use flate2::Compression; - use flate2::write::ZlibEncoder; - use std::io::Write; - - // Create some uncompressed journal-like data - let mut uncompressed = vec![0u8; 64]; - // Version 2 - uncompressed[0 .. 8].copy_from_slice(&2u64.to_le_bytes()); - // Position at end - uncompressed[8 .. 16].copy_from_slice(&64u64.to_le_bytes()); - // Some data - uncompressed[16 .. 
32].copy_from_slice(b"[{\"base_version\""); - - // Test compression level 5 (what we use in production) - let mut encoder = ZlibEncoder::new(Vec::new(), Compression::new(5)); - encoder.write_all(&uncompressed).unwrap(); - let compressed = encoder.finish().unwrap(); - - // Verify it starts with 0x78 (zlib magic byte) - assert_eq!(compressed[0], 0x78); - - // Decompress manually since VersionedRecovery no longer handles compression - let decompressed = decompress_zlib(&compressed).unwrap(); - - // Should be able to process the decompressed data - // Using arbitrary snapshot timestamp since this is synthetic test data - let result = VersionedRecovery::new(vec![(&decompressed, 1000)]); - // May succeed or fail depending on whether the data is valid bonjson, - // but should at least attempt to parse without panicking - let _ = result; -} - - #[tokio::test] async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; @@ -556,230 +297,6 @@ async fn test_recovery_various_value_types() -> anyhow::Result<()> { Ok(()) } -#[tokio::test] -async fn test_recovery_from_compressed_archive() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - // Create a store and write some data - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - let ts1 = store - .get_with_timestamp("key1") - .map(|tv| tv.timestamp) - .unwrap(); - - std::thread::sleep(std::time::Duration::from_millis(10)); - - store - .insert("key2".to_string(), Value::String("value2".to_string())) - .await?; - let ts2 = store - .get_with_timestamp("key2") - .map(|tv| tv.timestamp) - .unwrap(); - - store.sync()?; - - // Rotate to create compressed archive - store.rotate_journal().await?; - - // Add more data to active journal - std::thread::sleep(std::time::Duration::from_millis(10)); - - store - .insert("key3".to_string(), Value::String("value3".to_string())) - .await?; - let ts3 = store - .get_with_timestamp("key3") - .map(|tv| tv.timestamp) - .unwrap(); - - store.sync()?; - - // Rotate again to create final snapshot - store.rotate_journal().await?; - - // Read all snapshots - let archived_files = find_archived_journals(temp_dir.path())?; - assert_eq!(archived_files.len(), 2, "Should have two snapshots"); - - let mut all_snapshots = Vec::new(); - for archived_path in &archived_files { - let compressed_data = std::fs::read(archived_path)?; - let decompressed_data = decompress_zlib(&compressed_data)?; - let rotation_ts = extract_rotation_timestamp(archived_path)?; - all_snapshots.push((decompressed_data, rotation_ts)); - } - - // Create recovery from all snapshots - let snapshot_refs: Vec<(&[u8], u64)> = all_snapshots - .iter() - .map(|(data, ts)| (data.as_slice(), *ts)) - .collect(); - let recovery = VersionedRecovery::new(snapshot_refs)?; - - // Recover at ts1 (should be in first snapshot) - let state_ts1 = recovery.recover_at_timestamp(ts1)?; - assert_eq!(state_ts1.len(), 1); - assert_eq!( - state_ts1.get("key1").map(|tv| &tv.value), - Some(&Value::String("value1".to_string())) - ); - - // Recover at ts2 (should be in first snapshot) - let state_ts2 = recovery.recover_at_timestamp(ts2)?; - assert_eq!(state_ts2.len(), 2); - - // Recover at ts3 (should include data from both snapshots) - let state_ts3 = recovery.recover_at_timestamp(ts3)?; - assert_eq!(state_ts3.len(), 3); - assert_eq!( - state_ts3.get("key3").map(|tv| &tv.value), - Some(&Value::String("value3".to_string())) - 
); - - Ok(()) -} - -#[tokio::test] -async fn test_recovery_from_multiple_compressed_archives() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - // Create a store and perform multiple rotations - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - let ts1 = store - .get_with_timestamp("key1") - .map(|tv| tv.timestamp) - .unwrap(); - - store.rotate_journal().await?; - - std::thread::sleep(std::time::Duration::from_millis(10)); - - store - .insert("key2".to_string(), Value::String("value2".to_string())) - .await?; - let ts2 = store - .get_with_timestamp("key2") - .map(|tv| tv.timestamp) - .unwrap(); - - store.rotate_journal().await?; - - std::thread::sleep(std::time::Duration::from_millis(10)); - - store - .insert("key3".to_string(), Value::String("value3".to_string())) - .await?; - let ts3 = store - .get_with_timestamp("key3") - .map(|tv| tv.timestamp) - .unwrap(); - - store.sync()?; - - // Rotate again to create final snapshot - store.rotate_journal().await?; - - // Collect all snapshots (should have 3) - let archived_files = find_archived_journals(temp_dir.path())?; - assert_eq!(archived_files.len(), 3, "Should have three snapshots"); - - let mut all_snapshots = Vec::new(); - for archived_path in &archived_files { - let compressed_data = std::fs::read(archived_path)?; - let decompressed_data = decompress_zlib(&compressed_data)?; - let rotation_ts = extract_rotation_timestamp(archived_path)?; - all_snapshots.push((decompressed_data, rotation_ts)); - } - - // Create recovery from all snapshots - let snapshot_refs: Vec<(&[u8], u64)> = all_snapshots - .iter() - .map(|(data, ts)| (data.as_slice(), *ts)) - .collect(); - let recovery = VersionedRecovery::new(snapshot_refs)?; - - // Verify we can recover at any timestamp - let state_ts1 = recovery.recover_at_timestamp(ts1)?; - assert_eq!(state_ts1.len(), 1); - assert!(state_ts1.contains_key("key1")); - - let state_ts2 = recovery.recover_at_timestamp(ts2)?; - assert_eq!(state_ts2.len(), 2); - assert!(state_ts2.contains_key("key1")); - assert!(state_ts2.contains_key("key2")); - - let state_ts3 = recovery.recover_at_timestamp(ts3)?; - assert_eq!(state_ts3.len(), 3); - assert!(state_ts3.contains_key("key1")); - assert!(state_ts3.contains_key("key2")); - assert!(state_ts3.contains_key("key3")); - - Ok(()) -} - -#[tokio::test] -async fn test_recovery_mixed_compressed_and_uncompressed() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - // Create initial store and archive (will be compressed) - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - let _ts1 = store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - store.sync()?; - store.rotate_journal().await?; - - // Create uncompressed journal data manually - let mut uncompressed_store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - uncompressed_store - .insert("key2".to_string(), Value::String("value2".to_string())) - .await?; - let ts2 = uncompressed_store - .get_with_timestamp("key2") - .map(|tv| tv.timestamp) - .unwrap(); - uncompressed_store.sync()?; - - // Rotate to create second snapshot - uncompressed_store.rotate_journal().await?; - - // Get all snapshots - let archived_files = find_archived_journals(temp_dir.path())?; - assert_eq!(archived_files.len(), 2, "Should have two snapshots"); - - let mut all_snapshots = Vec::new(); - for archived_path in &archived_files { - let 
compressed_data = std::fs::read(archived_path)?; - let decompressed_data = decompress_zlib(&compressed_data)?; - let rotation_ts = extract_rotation_timestamp(archived_path)?; - all_snapshots.push((decompressed_data, rotation_ts)); - } - - // Create recovery from all snapshots - let snapshot_refs: Vec<(&[u8], u64)> = all_snapshots - .iter() - .map(|(data, ts)| (data.as_slice(), *ts)) - .collect(); - let recovery = VersionedRecovery::new(snapshot_refs)?; - - let state_final = recovery.recover_at_timestamp(ts2)?; - assert_eq!(state_final.len(), 2); - assert!(state_final.contains_key("key1")); - assert!(state_final.contains_key("key2")); - - Ok(()) -} - #[tokio::test] async fn test_recovery_at_timestamp() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; @@ -950,105 +467,3 @@ async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { Ok(()) } - -#[tokio::test] -async fn test_recovery_decompression_transparent() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - // Create store with compressible data - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - let compressible = "A".repeat(500); - store - .insert("data".to_string(), Value::String(compressible.clone())) - .await?; - let ts1 = store - .get_with_timestamp("data") - .map(|tv| tv.timestamp) - .unwrap(); - store.sync()?; - - // Rotate to create first snapshot - store.rotate_journal().await?; - - // Read the first snapshot - let archived_files = find_archived_journals(temp_dir.path())?; - let first_snapshot_path = archived_files.first().unwrap(); - let compressed_data = std::fs::read(first_snapshot_path)?; - let snapshot_ts = extract_rotation_timestamp(first_snapshot_path)?; - - // Decompress the snapshot data (VersionedRecovery requires uncompressed data) - let decompressed_data = decompress_zlib(&compressed_data)?; - let recovery = VersionedRecovery::new(vec![(&decompressed_data, snapshot_ts)])?; - let state = recovery.recover_at_timestamp(ts1)?; - - // Verify recovery works correctly with decompressed data - assert_eq!(state.len(), 1); - assert_eq!( - state.get("data").map(|tv| &tv.value), - Some(&Value::String(compressible)) - ); - - Ok(()) -} - -#[tokio::test] -async fn test_journal_ordering_requirement() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - // Create a store and perform rotation to get proper sequential journals - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - - // Add initial data - store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - store.sync()?; - - // Rotate to create first archived journal - store.rotate_journal().await?; - - std::thread::sleep(std::time::Duration::from_millis(50)); - - // Add more data after rotation - store - .insert("key2".to_string(), Value::String("value2".to_string())) - .await?; - store.sync()?; - - // Rotate again to create second snapshot - store.rotate_journal().await?; - - // Read both snapshots - let archived_files = find_archived_journals(temp_dir.path())?; - assert_eq!(archived_files.len(), 2, "Should have 2 archived snapshots"); - - let first_snapshot_data = std::fs::read(&archived_files[0])?; - let first_snapshot_ts = extract_rotation_timestamp(&archived_files[0])?; - let decompressed_first = decompress_zlib(&first_snapshot_data)?; - - let second_snapshot_data = std::fs::read(&archived_files[1])?; - let second_snapshot_ts = extract_rotation_timestamp(&archived_files[1])?; - let decompressed_second = decompress_zlib(&second_snapshot_data)?; - - // 
Should succeed when journals are in correct chronological order (oldest first) - let recovery = VersionedRecovery::new(vec![ - (&decompressed_first, first_snapshot_ts), - (&decompressed_second, second_snapshot_ts), - ]); - assert!(recovery.is_ok(), "Should succeed with correct ordering"); - - // Verify correct ordering produces expected results - let state = recovery?.recover_current()?; - assert_eq!(state.len(), 2); - assert!(state.contains_key("key1")); - assert!(state.contains_key("key2")); - - // Note: Journals with reversed order may not produce correct results - // because recovery replays journals sequentially. Users must provide - // journals in chronological order (oldest to newest). - // The removal of base_timestamp metadata field doesn't change this requirement - - // chronological order is determined by filename timestamps (e.g., store.jrn.t300.zz) - - Ok(()) -} diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_recovery.rs index e45df5a4..97ff8728 100644 --- a/bd-resilient-kv/src/versioned_recovery.rs +++ b/bd-resilient-kv/src/versioned_recovery.rs @@ -10,26 +10,6 @@ use ahash::AHashMap; use bd_bonjson::Value; use bd_bonjson::decoder::from_slice; -/// Helper function to read a u64 field from a BONJSON object. -/// -/// BONJSON's decoder automatically converts unsigned values that fit in i64 to signed values -/// during decoding (see bd-bonjson/src/decoder.rs:227-234). This means that even though we -/// write `Value::Unsigned(version)`, the decoder returns `Value::Signed(version as i64)`. -/// -/// TODO(snowp): Consider changing BONJSON's decoder to preserve the original unsigned type -/// to avoid this normalization behavior and eliminate the need for this helper. -fn read_u64_field(obj: &AHashMap, key: &str) -> Option { - match obj.get(key) { - Some(Value::Unsigned(v)) => Some(*v), - Some(Value::Signed(v)) if *v >= 0 => - { - #[allow(clippy::cast_sign_loss)] - Some(*v as u64) - }, - _ => None, - } -} - /// A utility for recovering state at arbitrary timestamps from journal snapshots. 
/// /// This utility operates on raw uncompressed byte slices from archived journal snapshots @@ -188,41 +168,45 @@ fn replay_journal_to_timestamp( ) -> anyhow::Result<()> { let array = read_bonjson_payload(buffer)?; - if let Value::Array(entries) = array { - for (index, entry) in entries.iter().enumerate() { - // Skip metadata (first entry) - if index == 0 { - continue; - } + let Value::Array(entries) = &array else { + return Ok(()); + }; - if let Value::Object(obj) = entry { - // Extract timestamp (skip entries without timestamp) - let Some(entry_timestamp) = read_u64_field(obj, "t") else { - continue; - }; + for (index, entry) in entries.iter().enumerate() { + // Skip metadata (first entry) + if index == 0 { + continue; + } - // Only apply entries up to target timestamp - if entry_timestamp > target_timestamp { - break; - } + let Value::Object(obj) = entry else { + continue; + }; - // Extract key and operation - if let Some(Value::String(key)) = obj.get("k") - && let Some(operation) = obj.get("o") - { - if operation.is_null() { - map.remove(key); - } else { - map.insert( - key.clone(), - TimestampedValue { - value: operation.clone(), - timestamp: entry_timestamp, - }, - ); - } - } - } + // Extract timestamp (skip entries without timestamp) + let Some(entry_timestamp) = read_u64_field(obj, "t") else { + continue; + }; + + // Only apply entries up to target timestamp + if entry_timestamp > target_timestamp { + break; + } + + let (Some(Value::String(key)), Some(operation)) = (obj.get("k"), obj.get("o")) else { + continue; + }; + + // Extract key and operation + if operation.is_null() { + map.remove(key); + } else { + map.insert( + key.clone(), + TimestampedValue { + value: operation.clone(), + timestamp: entry_timestamp, + }, + ); } } @@ -264,3 +248,23 @@ fn read_bonjson_payload(buffer: &[u8]) -> anyhow::Result { Err(e) => anyhow::bail!("Failed to decode buffer: {e:?}"), } } + +/// Helper function to read a u64 field from a BONJSON object. +/// +/// BONJSON's decoder automatically converts unsigned values that fit in i64 to signed values +/// during decoding (see bd-bonjson/src/decoder.rs:227-234). This means that even though we +/// write `Value::Unsigned(version)`, the decoder returns `Value::Signed(version as i64)`. +/// +/// TODO(snowp): Consider changing BONJSON's decoder to preserve the original unsigned type +/// to avoid this normalization behavior and eliminate the need for this helper. 
+fn read_u64_field(obj: &AHashMap, key: &str) -> Option { + match obj.get(key) { + Some(Value::Unsigned(v)) => Some(*v), + Some(Value::Signed(v)) if *v >= 0 => + { + #[allow(clippy::cast_sign_loss)] + Some(*v as u64) + }, + _ => None, + } +} From 6d7ae28f7ca437c8e274f2ecf6fb7bbc1eb37157 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 19:47:40 -0800 Subject: [PATCH 29/66] docs updates --- bd-resilient-kv/AGENTS.md | 11 ++-- bd-resilient-kv/README.md | 40 +++++---------- bd-resilient-kv/VERSIONED_FORMAT.md | 79 ++++++++++++++--------------- 3 files changed, 56 insertions(+), 74 deletions(-) diff --git a/bd-resilient-kv/AGENTS.md b/bd-resilient-kv/AGENTS.md index 4df8a008..fdfe4ff2 100644 --- a/bd-resilient-kv/AGENTS.md +++ b/bd-resilient-kv/AGENTS.md @@ -73,9 +73,8 @@ The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJ **Rotation Strategy**: - Automatic rotation when journal size exceeds high water mark (triggered during async write operations) - Current state is compacted into a new journal as versioned entries -- Old journal is archived with `.v{version}.zz` suffix +- Old journal is archived with `.t{timestamp}.zz` suffix - Archived journals are automatically compressed using zlib (RFC 1950, level 5) asynchronously -- Application controls upload/cleanup of archived journals **Compression**: - All archived journals are automatically compressed during rotation using async I/O @@ -249,8 +248,8 @@ fn set_multiple(&mut self, entries: &[(String, Value)]) -> anyhow::Result<()> { - **Action**: Handle as standard I/O errors 4. **Compression/Archive Errors (VersionedKVStore)** - - **When**: Rotation callback receives archived journal path but compression fails - - **Result**: Application-level error in rotation callback + - **When**: Asynchronous compression of archived journal fails + - **Result**: Error during rotation's async compression phase - **Action**: Retry compression, handle cleanup appropriately ### Impossible Failure Modes (Architectural Guarantees) @@ -352,7 +351,7 @@ When modifying or refactoring code in the kv_journal system (or any Rust codebas - **Always update documentation and comments** to reflect current functionality - Pay special attention to trait documentation, method comments, and module-level explanations - Update CLAUDE.md or similar architectural documentation when making significant changes -- Ensure code comments explain the "why" behind complex logic, especially around callback mechanisms and compaction strategies +- Ensure code comments explain the "why" behind complex logic, especially around compaction strategies and retry mechanisms ### Code Quality Checks After making changes, run these commands in order: @@ -369,7 +368,7 @@ After making changes, run these commands in order: ### Testing - Run the full test suite: `cargo test -p bd-resilient-kv --lib` -- Pay special attention to tests that verify callback behavior and automatic switching +- Pay special attention to tests that verify automatic switching and retry logic - When adding new functionality, include comprehensive tests covering edge cases ### Git Workflow diff --git a/bd-resilient-kv/README.md b/bd-resilient-kv/README.md index f310ac43..b76d3fcc 100644 --- a/bd-resilient-kv/README.md +++ b/bd-resilient-kv/README.md @@ -232,9 +232,8 @@ async fn main() -> anyhow::Result<()> { - **Automatic Rotation**: When the journal exceeds the high water mark, it automatically: - Creates a new journal with the current state as versioned entries (compaction) - Preserves 
original timestamps from the initial writes - - Archives the old journal with `.v{version}.zz` suffix + - Archives the old journal with `.t{timestamp}.zz` suffix - Compresses the archived journal using zlib (RFC 1950, level 5) asynchronously - - Invokes the rotation callback (if provided) for upload/cleanup - **Automatic Compression**: Archived journals are automatically compressed to save disk space - Active journals remain uncompressed for write performance - Typically achieves >50% size reduction for text-based data @@ -326,18 +325,7 @@ pub fn current_version(&self) -> u64 **Internal Timestamp Tracking**: The store internally tracks timestamps for all writes and preserves them during journal rotation. These timestamps are used for recovery and point-in-time operations but are not exposed in the primary API. For advanced use cases requiring timestamp access, the `get_with_timestamp()` method is available. -#### Type Aliases - -```rust -pub type RotationCallback = Box; -``` - -**Note**: The callback receives three parameters: -- `old_journal_path`: Path to the archived journal that was rotated out -- `new_journal_path`: Path to the new active journal -- `rotation_version`: The version at which rotation occurred - -## Architecture +#### Core Methods ### Storage Models @@ -362,18 +350,17 @@ The versioned store uses a different architecture optimized for version tracking 2. **Version Tracking**: Every entry includes a monotonically increasing version number 3. **Automatic Rotation**: When the journal reaches the high water mark: - Current state is serialized as versioned entries into a new journal - - Old journal is archived with `.v{version}` suffix (e.g., `store.jrn.v123`) - - Optional callback is invoked for remote upload/cleanup -4. **Point-in-Time Recovery**: Journal can be replayed up to any previous version + - Old journal is archived with `.t{timestamp}` suffix (e.g., `store.jrn.t1699564800000000000`) +4. 
**Point-in-Time Recovery**: Journal can be replayed up to any previous timestamp **Rotation Strategy**: ``` Before rotation: - my_store.jrn (1MB, versions 1-1000) + my_store.jrn (1MB, multiple timestamped entries) After rotation: - my_store.jrn (compacted, starts at version 1001) - my_store.jrn.v1000.zz (archived, compressed, readonly) + my_store.jrn (compacted, new entries with fresh timestamps) + my_store.jrn.t1699564800000000000.zz (archived, compressed, readonly) ``` **Compression**: @@ -471,16 +458,15 @@ my_store.jrnb # Journal B The versioned store manages a single journal with archived versions: - **Active Journal**: Current journal file (e.g., `my_store.jrn`) -- **Archived Journals**: Previous versions with `.v{version}` suffix +- **Archived Journals**: Previous versions with `.t{timestamp}` suffix - **Automatic Archival**: Old journals are preserved during rotation -- **Callback Integration**: Application controls upload/cleanup of archived journals Example file structure after multiple rotations: ``` -my_store.jrn # Active journal (current, uncompressed) -my_store.jrn.v1000.zz # Archived at version 1000 (compressed) -my_store.jrn.v2500.zz # Archived at version 2500 (compressed) -my_store.jrn.v4000.zz # Archived at version 4000 (compressed) +my_store.jrn # Active journal (current, uncompressed) +my_store.jrn.t1699564800000000000.zz # Archived (compressed) +my_store.jrn.t1699651200000000000.zz # Archived (compressed) +my_store.jrn.t1699737600000000000.zz # Archived (compressed) ``` ## Thread Safety @@ -524,7 +510,7 @@ async fn main() -> anyhow::Result<()> { // After rotation, archived journals are automatically compressed: // - my_store.jrn (active, uncompressed) - // - my_store.jrn.v10000.zz (archived, compressed with zlib asynchronously) + // - my_store.jrn.t1699564800000000000.zz (archived, compressed with zlib asynchronously) Ok(()) } diff --git a/bd-resilient-kv/VERSIONED_FORMAT.md b/bd-resilient-kv/VERSIONED_FORMAT.md index 14bd87c8..13540bc6 100644 --- a/bd-resilient-kv/VERSIONED_FORMAT.md +++ b/bd-resilient-kv/VERSIONED_FORMAT.md @@ -2,11 +2,11 @@ ## Overview -This document describes the versioned journal format (VERSION 2) that enables audit logs and remote backup by tracking both version numbers and timestamps for each write operation. +This document describes the versioned journal format (VERSION 2) that enables audit logs and remote backup by using timestamps as version identifiers for each write operation. ## Goals -1. **Version and Timestamp Tracking**: Each write operation records a monotonically non-decreasing version number and timestamp +1. **Timestamp-Based Versioning**: Each write operation records a monotonically non-decreasing timestamp (in nanoseconds since UNIX epoch) that serves as both a version identifier and a logical clock 2. **Journal Rotation**: Periodic compaction with self-contained state in each journal 3. **Remote Backup**: Archived journals can be uploaded to remote storage @@ -23,8 +23,8 @@ Unlike traditional journal systems that use separate snapshot files, this design ### 1. Active Journal (`my_store.jrn`) The current active journal receiving new writes. Active journals are **not compressed** for performance reasons. -### 2. Archived Journals (`my_store.jrn.v00020000.zz`, `my_store.jrn.v00030000.zz`, etc.) -Previous journals, archived during rotation. Each contains complete state at rotation version plus subsequent incremental writes. The version number in the filename indicates the rotation/snapshot version. +### 2. 
Archived Journals (`my_store.jrn.t1699564900000000000.zz`, etc.) +Previous journals, archived during rotation. Each contains complete state at rotation timestamp plus subsequent incremental writes. The timestamp in the filename indicates the rotation/snapshot timestamp. **Archived journals are automatically compressed using zlib** (indicated by the `.zz` extension) to reduce storage space and bandwidth requirements for remote backup. Compression is mandatory and occurs automatically during rotation. @@ -54,7 +54,6 @@ Previous journals, archived during rotation. Each contains complete state at rot **Versioned Journal Entry**: ```json { - "v": , "t": , "k": "", "o": @@ -62,8 +61,7 @@ Previous journals, archived during rotation. Each contains complete state at rot ``` Fields: -- `v` (version): Monotonically non-decreasing write version number -- `t` (timestamp): When the write occurred (ns since UNIX epoch), monotonically non-decreasing +- `t` (timestamp): Monotonically non-decreasing timestamp (ns since UNIX epoch) that serves as both the write time and version identifier - `k` (key): The key being written - `o` (operation): The value (for SET) or null (for DELETE) @@ -76,52 +74,51 @@ Timestamps are monotonically non-decreasing, not strictly increasing. If the sys When first created: ```json {"initialized": 1699564800000000000, "format_version": 2} -{"v": 2, "t": 1699564801000000000, "k": "key1", "o": "value1"} -{"v": 3, "t": 1699564802000000000, "k": "key2", "o": "value2"} +{"t": 1699564801000000000, "k": "key1", "o": "value1"} +{"t": 1699564802000000000, "k": "key2", "o": "value2"} ... ``` ### Rotated Journal -After rotation at version 30000, the new journal contains: +After rotation at timestamp 1699564900000000000, the new journal contains: ```json {"initialized": 1699564900000000000, "format_version": 2} -{"v": 30000, "t": 1699564800123456789, "k": "key1", "o": "value1"} // Compacted state (original timestamp) -{"v": 30000, "t": 1699564850987654321, "k": "key2", "o": "value2"} // Compacted state (original timestamp) -{"v": 30000, "t": 1699564875111222333, "k": "key3", "o": "value3"} // Compacted state (original timestamp) -{"v": 30001, "t": 1699564901000000000, "k": "key4", "o": "value4"} // New write -{"v": 30002, "t": 1699564902000000000, "k": "key1", "o": "updated1"} // New write +{"t": 1699564800123456789, "k": "key1", "o": "value1"} // Compacted state (original timestamp preserved) +{"t": 1699564850987654321, "k": "key2", "o": "value2"} // Compacted state (original timestamp preserved) +{"t": 1699564875111222333, "k": "key3", "o": "value3"} // Compacted state (original timestamp preserved) +{"t": 1699564901000000000, "k": "key4", "o": "value4"} // New write after rotation +{"t": 1699564902000000000, "k": "key1", "o": "updated1"} // New write after rotation ... ``` Key observations: -- All compacted state entries have the same version (30000) - **Timestamps are preserved**: Each compacted entry retains its original write timestamp (not the rotation time) - These are regular journal entries, not a special format -- Incremental writes continue with version 30001+ +- New writes continue with later timestamps - Each rotated journal is self-contained and can be read independently ## Rotation Process -When high water mark is reached at version N: +When high water mark is reached: -1. **Create New Journal**: Initialize fresh journal file (e.g., `my_store.jrn.tmp`) -2. **Write Compacted State**: Write all current key-value pairs as versioned entries at version N +1. 
**Determine Rotation Timestamp**: Calculate max timestamp T from all current entries +2. **Create New Journal**: Initialize fresh journal file (e.g., `my_store.jrn.tmp`) +3. **Write Compacted State**: Write all current key-value pairs as versioned entries - **Timestamp Preservation**: Each entry retains its original write timestamp, not the rotation timestamp - This preserves historical accuracy and allows proper temporal analysis of the data -3. **Archive Old Journal**: Rename `my_store.jrn` → `my_store.jrn.old` (temporary) -4. **Activate New Journal**: Rename `my_store.jrn.tmp` → `my_store.jrn` -5. **Compress Archive**: Compress `my_store.jrn.old` → `my_store.jrn.v{N}.zz` using zlib -6. **Delete Temporary**: Remove uncompressed `my_store.jrn.old` -7. **Callback**: Notify application for upload/cleanup of compressed archived journal +4. **Archive Old Journal**: Rename `my_store.jrn` → `my_store.jrn.old` (temporary) +5. **Activate New Journal**: Rename `my_store.jrn.tmp` → `my_store.jrn` +6. **Compress Archive**: Compress `my_store.jrn.old` → `my_store.jrn.t{T}.zz` using zlib +7. **Delete Temporary**: Remove uncompressed `my_store.jrn.old` Example: ``` -Before rotation at v30000: - my_store.jrn # Active, contains v20000-v30000 +Before rotation at t=1699564900000000000: + my_store.jrn # Active journal After rotation: - my_store.jrn # Active, contains compacted state at v30000 - my_store.jrn.v30000.zz # Compressed archive, contains v20000-v30000 + my_store.jrn # Active, contains compacted state + my_store.jrn.t1699564900000000000.zz # Compressed archive ``` ### Compression @@ -146,7 +143,7 @@ Rotation **cannot fail** due to insufficient buffer space. This is an architectu **What Can Fail:** - I/O errors (disk full, permissions, etc.) -- Compression errors in the callback phase (application-level) +- Compression errors during async compression phase **What Cannot Fail:** - Writing compacted state to new journal buffer (guaranteed to fit) @@ -162,15 +159,15 @@ While `VersionedKVStore` does not support point-in-time recovery through its API - **Audit Logging**: Review what changes were made and when - **Offline Analysis**: Process archived journals to understand historical patterns - **Remote Backup**: Upload archived journals to remote storage for disaster recovery -- **Compliance**: Maintain immutable records of all changes with version tracking +- **Compliance**: Maintain immutable records of all changes with timestamp tracking -The version numbers in each entry allow you to understand the exact sequence of operations and build custom tooling for analyzing historical data. +The timestamps in each entry allow you to understand the exact sequence of operations and build custom tooling for analyzing historical data. **Timestamp Accuracy**: All entries preserve their original write timestamps, even after rotation. This means you can accurately track when each write originally occurred, making the journals suitable for temporal analysis, compliance auditing, and debugging time-sensitive issues. ### Point-in-Time Recovery with VersionedRecovery -While `VersionedKVStore` is designed for active operation and does not support point-in-time recovery through its API, the `VersionedRecovery` utility provides a way to reconstruct state at arbitrary historical versions from raw journal bytes. 
+While `VersionedKVStore` is designed for active operation and does not support point-in-time recovery through its API, the `VersionedRecovery` utility provides a way to reconstruct state at arbitrary historical timestamps from raw journal bytes. #### Overview @@ -183,20 +180,20 @@ While `VersionedKVStore` is designed for active operation and does not support p #### Use Cases -- **Server-Side Analysis**: Reconstruct state at specific versions for debugging or investigation +- **Server-Side Analysis**: Reconstruct state at specific timestamps for debugging or investigation - **Audit Tooling**: Build custom audit systems that analyze historical changes - **Cross-Rotation Recovery**: Recover state spanning multiple archived journals - **Compliance**: Extract state at specific points in time for regulatory requirements -- **Testing**: Validate that state at historical versions matches expectations +- **Testing**: Validate that state at historical timestamps matches expectations #### Implementation Details - **Async File Loading**: Constructor uses async I/O to load journal files efficiently - **Automatic Decompression**: Transparently decompresses `.zz` archives when loading - **Chronological Order**: Journals should be provided oldest to newest -- **Efficient Replay**: Automatically skips journals outside the target version range +- **Efficient Replay**: Automatically skips journals outside the target timestamp range - **Cross-Rotation**: Seamlessly handles recovery across multiple archived journals -- **Version Tracking**: Replays all entries up to and including the target version +- **Timestamp Tracking**: Replays all entries up to and including the target timestamp ## Storage Efficiency @@ -217,9 +214,9 @@ While `VersionedKVStore` is designed for active operation and does not support p ## Implementation Notes -1. **Version Counter Persistence**: Stored in metadata, initialized from journal on restart -2. **Atomicity**: Version increments are atomic with writes -3. **Monotonicity**: Versions are monotonically non-decreasing (multiple entries may share the same version during rotation) +1. **Timestamp as Version**: Timestamps serve as version identifiers - no separate version counter needed +2. **Atomicity**: Timestamp assignment is atomic with writes +3. **Monotonicity**: Timestamps are monotonically non-decreasing (clock clamping if system clock goes backward) 4. **Concurrency**: Not thread-safe by design (same as current implementation) -5. **Format Field Names**: Use short names (`v`, `t`, `k`, `o`) to minimize storage overhead +5. **Format Field Names**: Use short names (`t`, `k`, `o`) to minimize storage overhead 6. **Self-Contained Journals**: Each rotated journal can be read independently without dependencies From 109f740a2a221d8c4fd84f1160cba0e8498f8def Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 20:13:08 -0800 Subject: [PATCH 30/66] cleanup docs --- bd-resilient-kv/VERSIONED_FORMAT.md | 277 ++++++++++++++++++---------- 1 file changed, 182 insertions(+), 95 deletions(-) diff --git a/bd-resilient-kv/VERSIONED_FORMAT.md b/bd-resilient-kv/VERSIONED_FORMAT.md index 13540bc6..eadaa1b6 100644 --- a/bd-resilient-kv/VERSIONED_FORMAT.md +++ b/bd-resilient-kv/VERSIONED_FORMAT.md @@ -2,21 +2,12 @@ ## Overview -This document describes the versioned journal format (VERSION 2) that enables audit logs and remote backup by using timestamps as version identifiers for each write operation. 
+This document describes the versioned journal format (VERSION 2) that enables point-in-time state recovery by using timestamps as version identifiers for each write operation. ## Goals -1. **Timestamp-Based Versioning**: Each write operation records a monotonically non-decreasing timestamp (in nanoseconds since UNIX epoch) that serves as both a version identifier and a logical clock +1. **Timestamp-Based Versioning**: Each write operation records a monotonically non-decreasing timestamp (in nanoseconds since UNIX epoch) that serves as both a version identifier and a logical clock. This allows correlating entries with with time-based data. 2. **Journal Rotation**: Periodic compaction with self-contained state in each journal -3. **Remote Backup**: Archived journals can be uploaded to remote storage - -## Design Philosophy - -Unlike traditional journal systems that use separate snapshot files, this design uses a **unified format** where: -- Each journal is self-contained with complete state embedded as regular entries -- No special "snapshot entry" format needed -- First N entries in a rotated journal are just regular versioned entries (all at same version) -- Simpler file structure and uniform entry format throughout ## File Types @@ -24,39 +15,85 @@ Unlike traditional journal systems that use separate snapshot files, this design The current active journal receiving new writes. Active journals are **not compressed** for performance reasons. ### 2. Archived Journals (`my_store.jrn.t1699564900000000000.zz`, etc.) -Previous journals, archived during rotation. Each contains complete state at rotation timestamp plus subsequent incremental writes. The timestamp in the filename indicates the rotation/snapshot timestamp. +Previous journals, archived during rotation. Each contains complete state at its creation time plus subsequent incremental writes. The timestamp in the filename indicates the rotation/snapshot timestamp. **Archived journals are automatically compressed using zlib** (indicated by the `.zz` extension) to reduce storage space and bandwidth requirements for remote backup. Compression is mandatory and occurs automatically during rotation. ## Format Specification -### Journal Format (VERSION 2) +### Binary Structure + +The byte-level layout of a VERSION 2 journal file: ``` -| Position | Data | Type | Size | -|----------|--------------------------|----------------|---------| -| 0 | Format Version | u64 | 8 bytes | -| 8 | Position | u64 | 8 bytes | -| 16 | Type Code: Array Start | u8 | 1 byte | -| 17 | Metadata Object | BONJSON Object | varies | -| ... | Versioned Journal Entry | BONJSON Object | varies | -| ... 
| Versioned Journal Entry | BONJSON Object | varies | +┌─────────────────────────────────────────────────────────────────────────┐ +│ JOURNAL FILE HEADER │ +├──────────────────┬──────────────────┬───────────────────────────────────┤ +│ Format Version │ Position │ Array Start Type Code │ +│ (u64) │ (u64) │ (u8) │ +│ 8 bytes │ 8 bytes │ 1 byte │ +└──────────────────┴──────────────────┴───────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────────────┐ +│ BONJSON METADATA OBJECT │ +│ (First entry in the array) │ +├─────────────────────────────────────────────────────────────────────────┤ +│ { │ +│ "initialized": 1699564800000000000, // u64 timestamp (ns) │ +│ "format_version": 2 // Format identifier │ +│ } │ +└─────────────────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────────────┐ +│ VERSIONED JOURNAL ENTRY │ +│ (BONJSON Object) │ +├─────────────────────────────────────────────────────────────────────────┤ +│ { │ +│ "t": 1699564801000000000, // Timestamp in ns (u64) │ +│ "k": "key1", // Key (string) │ +│ "o": "value1" // Value or null (any type) │ +│ } │ +└─────────────────────────────────────────────────────────────────────────┘ ``` -**Metadata Object** (first entry in array): + +### Header Structure (17 bytes total) + +1. **Format Version** (offset 0, 8 bytes) + - Type: `u64` little-endian + - Value: `2` for VERSION 2 format + - Purpose: Allows future format evolution + +2. **Position** (offset 8, 8 bytes) + - Type: `u64` little-endian + - Value: Current write position in the buffer + - Purpose: Tracks where next entry will be written + +3. **Array Start Type Code** (offset 16, 1 byte) + - Type: `u8` + - Value: BONJSON type code indicating array start + - Purpose: Begins the BONJSON array containing all entries + +### Metadata Object (Variable size) + +The first entry in the array is always a metadata object: + ```json { - "initialized": , - "format_version": 2 + "initialized": , // Creation timestamp (nanoseconds since epoch) + "format_version": 2 // Must be 2 for this format } ``` -**Versioned Journal Entry**: +### Versioned Journal Entry Schema (Variable size) + +Each subsequent entry follows this uniform schema: + ```json { - "t": , - "k": "", - "o": + "t": , // Timestamp in nanoseconds (monotonically non-decreasing, serves as version) + "k": "", // Key being modified + "o": // Value for SET, null for DELETE } ``` @@ -65,9 +102,23 @@ Fields: - `k` (key): The key being written - `o` (operation): The value (for SET) or null (for DELETE) +**Type Flexibility**: The `"o"` field can contain any BONJSON-compatible type: +- Primitives (strings, numbers, booleans) +- Complex objects +- Arrays +- `null` (indicates DELETE operation) + **Timestamp Semantics:** Timestamps are monotonically non-decreasing, not strictly increasing. If the system clock doesn't advance between writes, multiple entries may share the same timestamp. This is expected behavior and ensures proper ordering without clock skew. 
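To make the monotonicity rule concrete, here is a minimal Rust sketch of how a writer could assign entry timestamps so they never go backward. This is illustrative only, not the crate's actual implementation; `next_entry_timestamp` is a hypothetical helper, and the clamping behavior is the assumption being demonstrated.

```rust
use std::time::{SystemTime, UNIX_EPOCH};

/// Illustrative sketch: return a timestamp (ns since UNIX epoch) that is never
/// smaller than `last_timestamp`. If the system clock moves backward or does not
/// advance between writes, the previous value is reused, so consecutive entries
/// may legitimately share the same timestamp, as described above.
fn next_entry_timestamp(last_timestamp: u64) -> u64 {
    let now = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .ok()
        .and_then(|d| u64::try_from(d.as_nanos()).ok())
        .unwrap_or(last_timestamp);
    now.max(last_timestamp)
}
```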
+**Size Considerations:** +- **Header**: Fixed 17 bytes +- **Metadata**: ~80-100 bytes (depending on timestamp magnitude) +- **Per Entry**: Varies based on key and value size + - Minimum: ~50 bytes (short key, small value) + - Typical: 100-500 bytes + - Maximum: Limited by buffer size + ## Journal Structure ### Initial Journal @@ -93,6 +144,7 @@ After rotation at timestamp 1699564900000000000, the new journal contains: Key observations: - **Timestamps are preserved**: Each compacted entry retains its original write timestamp (not the rotation time) + - This ensures that not only is the state at any given time recoverably from a given snapshot, we'll also be able to recover how long the current state values have been active for without looking at the previous snapshot. - These are regular journal entries, not a special format - New writes continue with later timestamps - Each rotated journal is self-contained and can be read independently @@ -101,11 +153,9 @@ Key observations: When high water mark is reached: -1. **Determine Rotation Timestamp**: Calculate max timestamp T from all current entries +1. **Determine Rotation Timestamp**: Calculate max timestamp T from the most recent entry 2. **Create New Journal**: Initialize fresh journal file (e.g., `my_store.jrn.tmp`) -3. **Write Compacted State**: Write all current key-value pairs as versioned entries - - **Timestamp Preservation**: Each entry retains its original write timestamp, not the rotation timestamp - - This preserves historical accuracy and allows proper temporal analysis of the data +3. **Write Compacted State**: Write all current key-value pairs as versioned entries using their original update timestamp 4. **Archive Old Journal**: Rename `my_store.jrn` → `my_store.jrn.old` (temporary) 5. **Activate New Journal**: Rename `my_store.jrn.tmp` → `my_store.jrn` 6. 
**Compress Archive**: Compress `my_store.jrn.old` → `my_store.jrn.t{T}.zz` using zlib @@ -121,49 +171,129 @@ After rotation: my_store.jrn.t1699564900000000000.zz # Compressed archive ``` +### Rotation Timeline Visualization + +``` +TIME + │ + ├─ t0: Normal Operation + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn │ + │ │ ├─ {"t": 1699564795000000000, ...}│ + │ │ ├─ {"t": 1699564796000000000, ...}│ + │ │ ├─ {"t": 1699564797000000000, ...}│ + │ │ ├─ {"t": 1699564798000000000, ...}│ + │ │ └─ {"t": 1699564799000000000, ...}│ + │ └────────────────────────────────────┘ + │ + ├─ t1: High Water Mark Reached + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn │ + │ │ └─ {"t": 1699564800000000000, ...}│ ← TRIGGER + │ └────────────────────────────────────┘ + │ max_timestamp = 1699564800000000000 + │ + ├─ t2: Create New Journal (Step 1) + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn │ (old, still active) + │ └────────────────────────────────────┘ + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn.tmp │ (new, being written) + │ │ └─ [header + metadata] │ + │ └────────────────────────────────────┘ + │ + ├─ t3: Write Compacted State (Step 2) + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn │ (old, still active) + │ └────────────────────────────────────┘ + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn.tmp │ (new, being written) + │ │ ├─ {"t": 1699564750000000000, "k": "key1", ...}│ ← Original timestamps + │ │ ├─ {"t": 1699564780000000000, "k": "key2", ...}│ ← Original timestamps + │ │ └─ {"t": 1699564799000000000, "k": "key3", ...}│ ← Original timestamps + │ └────────────────────────────────────┘ + │ + ├─ t4: Archive Old Journal (Step 3) + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn.old │ (renamed, temporary) + │ └────────────────────────────────────┘ + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn.tmp │ (new, ready) + │ └────────────────────────────────────┘ + │ + ├─ t5: Activate New Journal (Step 4) + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn.old │ (archived, temporary) + │ └────────────────────────────────────┘ + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn │ ← NOW ACTIVE! + │ │ (contains compacted state) │ + │ └────────────────────────────────────┘ + │ + ├─ t6: Compress Archive (Step 5 - Async) + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn │ (active, accepting writes) + │ │ └─ {"t": 1699564801000000000, ...}│ ← New writes + │ └────────────────────────────────────┘ + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn.old │ (being compressed...) 
+ │ └────────────────────────────────────┘ + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn.t1699564800000000000.zz│ (compressed output) + │ └────────────────────────────────────┘ + │ + ├─ t7: Delete Temporary (Step 6) + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn │ (active) + │ └────────────────────────────────────┘ + │ ┌────────────────────────────────────┐ + │ │ my_store.jrn.t1699564800000000000.zz│ (compressed archive) + │ └────────────────────────────────────┘ + │ + └─ t8: Continue Normal Operation + ┌────────────────────────────────────┐ + │ my_store.jrn │ + │ ├─ {"t": 1699564801000000000, ...}│ + │ ├─ {"t": 1699564802000000000, ...}│ + │ └─ {"t": 1699564803000000000, ...}│ + └────────────────────────────────────┘ + ┌────────────────────────────────────┐ + │ my_store.jrn.t1699564800000000000.zz│ (ready for upload) + └────────────────────────────────────┘ +``` + ### Compression Archived journals are automatically compressed using zlib (compression level 5) during rotation: - **Format**: Standard zlib format (RFC 1950) - **Extension**: `.zz` indicates zlib compression -- **Transparency**: `VersionedRecovery` automatically decompresses archives when reading - **Benefits**: Reduced storage space and bandwidth for remote backups -- **Performance**: Compression level 5 provides good balance between speed and compression ratio -### Rotation Failure Modes +### Rotation Failure Modes and Recovery -**Impossible Failure: Buffer Overflow During Rotation** +| Failure Point | State | Recovery | +|---------------|-------|----------| +| Before Step 3 | my_store.jrn + my_store.jrn.tmp exist | Delete .tmp, retry | +| After Step 3, before Step 4 | my_store.jrn.old exists, no active journal | Rename .old back to .jrn | +| After Step 4 | New journal active | Continue normally, cleanup may be incomplete | +| During Step 5-6 | Compression fails | .old file may remain, but new journal is valid | -Rotation **cannot fail** due to insufficient buffer space. This is an architectural guarantee: - -- **Why**: Rotation creates a new journal with the same buffer size as the original journal -- **Compaction Property**: The compacted state only includes current key-value pairs (removes redundant/old versions) -- **Mathematical Guarantee**: Compacted state size ≤ current journal size -- **Conclusion**: If data fits in the journal during normal operation, it will always fit during rotation **What Can Fail:** - I/O errors (disk full, permissions, etc.) - Compression errors during async compression phase -**What Cannot Fail:** -- Writing compacted state to new journal buffer (guaranteed to fit) - ## Recovery and Audit ### Current State Recovery Simply read the active journal (`my_store.jrn`) and replay all entries to reconstruct the current state. ### Audit and Analysis -While `VersionedKVStore` does not support point-in-time recovery through its API, archived journals contain complete historical data that can be used for: - -- **Audit Logging**: Review what changes were made and when -- **Offline Analysis**: Process archived journals to understand historical patterns -- **Remote Backup**: Upload archived journals to remote storage for disaster recovery -- **Compliance**: Maintain immutable records of all changes with timestamp tracking +While `VersionedKVStore` does not support point-in-time recovery through its API, archived journals contain complete historical data. 
The timestamps in each entry allow you to understand the exact sequence of operations and build custom tooling for analyzing historical data. -**Timestamp Accuracy**: All entries preserve their original write timestamps, even after rotation. This means you can accurately track when each write originally occurred, making the journals suitable for temporal analysis, compliance auditing, and debugging time-sensitive issues. +**Timestamp Accuracy**: All entries preserve their original write timestamps, even after rotation. This means you can accurately track when each write originally occurred. ### Point-in-Time Recovery with VersionedRecovery @@ -177,46 +307,3 @@ While `VersionedKVStore` is designed for active operation and does not support p - Can process multiple journals for cross-rotation recovery - Designed for offline analysis, server-side tooling, and audit systems - Completely independent from `VersionedKVStore` - -#### Use Cases - -- **Server-Side Analysis**: Reconstruct state at specific timestamps for debugging or investigation -- **Audit Tooling**: Build custom audit systems that analyze historical changes -- **Cross-Rotation Recovery**: Recover state spanning multiple archived journals -- **Compliance**: Extract state at specific points in time for regulatory requirements -- **Testing**: Validate that state at historical timestamps matches expectations - -#### Implementation Details - -- **Async File Loading**: Constructor uses async I/O to load journal files efficiently -- **Automatic Decompression**: Transparently decompresses `.zz` archives when loading -- **Chronological Order**: Journals should be provided oldest to newest -- **Efficient Replay**: Automatically skips journals outside the target timestamp range -- **Cross-Rotation**: Seamlessly handles recovery across multiple archived journals -- **Timestamp Tracking**: Replays all entries up to and including the target timestamp - -## Storage Efficiency - -**Space Requirements:** -- Active journal: Compacted state + recent writes since rotation -- Archived journals: Full history for their version ranges - -**Benefits of Unified Format:** -- Simpler file management (no separate snapshot + journal pairs) -- Each archived journal is self-contained -- Uniform entry format reduces code complexity -- Easy to understand and debug - -**Cleanup Strategy:** -- Keep N most recent archived journals for recovery -- Upload archived journals to remote storage -- Delete old archived journals after successful upload - -## Implementation Notes - -1. **Timestamp as Version**: Timestamps serve as version identifiers - no separate version counter needed -2. **Atomicity**: Timestamp assignment is atomic with writes -3. **Monotonicity**: Timestamps are monotonically non-decreasing (clock clamping if system clock goes backward) -4. **Concurrency**: Not thread-safe by design (same as current implementation) -5. **Format Field Names**: Use short names (`t`, `k`, `o`) to minimize storage overhead -6. 
**Self-Contained Journals**: Each rotated journal can be read independently without dependencies From f5c826de484dfbd6f79fbf67544e17b234d49355 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 20:26:06 -0800 Subject: [PATCH 31/66] better docs --- bd-resilient-kv/VERSIONED_FORMAT.md | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/bd-resilient-kv/VERSIONED_FORMAT.md b/bd-resilient-kv/VERSIONED_FORMAT.md index eadaa1b6..3d7d9951 100644 --- a/bd-resilient-kv/VERSIONED_FORMAT.md +++ b/bd-resilient-kv/VERSIONED_FORMAT.md @@ -6,8 +6,17 @@ This document describes the versioned journal format (VERSION 2) that enables po ## Goals -1. **Timestamp-Based Versioning**: Each write operation records a monotonically non-decreasing timestamp (in nanoseconds since UNIX epoch) that serves as both a version identifier and a logical clock. This allows correlating entries with with time-based data. -2. **Journal Rotation**: Periodic compaction with self-contained state in each journal +1. Enable recovery of key-value store state at any historical point in time +2. Preserve accurate write timestamps for audit and historical analysis +3. Support (near) indefinite retention of historical data without unbounded growth of active storage + +## Design Overview + +The versioned journal format uses timestamps as version identifiers for each write operation. Each entry in the journal records the timestamp, key, and value (or deletion marker) for every operation. This allows the store to reconstruct state at any point in time by replaying entries up to a target timestamp. + +To prevent unbounded growth, the system uses journal rotation: when the active journal reaches a size threshold, it is rotated out and replaced with a new journal containing only the current compacted state. The old journal is archived and compressed. Each archived journal preserves the original write timestamps of all entries, enabling point-in-time recovery across rotation boundaries. + +The format is built on top of BONJSON, a binary JSON format that provides efficient serialization while maintaining flexibility for different value types. ## File Types @@ -59,20 +68,11 @@ The byte-level layout of a VERSION 2 journal file: ### Header Structure (17 bytes total) -1. **Format Version** (offset 0, 8 bytes) - - Type: `u64` little-endian - - Value: `2` for VERSION 2 format - - Purpose: Allows future format evolution - -2. **Position** (offset 8, 8 bytes) - - Type: `u64` little-endian - - Value: Current write position in the buffer - - Purpose: Tracks where next entry will be written - -3. 
**Array Start Type Code** (offset 16, 1 byte) - - Type: `u8` - - Value: BONJSON type code indicating array start - - Purpose: Begins the BONJSON array containing all entries +| Field | Offset | Size | Type | Value | Purpose | +|-------|--------|------|------|-------|---------| +| Format Version | 0 | 8 bytes | u64 (little-endian) | `2` | Allows future format evolution | +| Position | 8 | 8 bytes | u64 (little-endian) | Current write position | Tracks where next entry will be written | +| Array Start Type Code | 16 | 1 byte | u8 | BONJSON array start code | Begins the BONJSON array containing all entries | ### Metadata Object (Variable size) From 520a607513041d69ab648e6a2c69d5f3028bd470 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 20:41:20 -0800 Subject: [PATCH 32/66] use microseconds --- bd-resilient-kv/VERSIONED_FORMAT.md | 38 ++++++++++----------- bd-resilient-kv/src/kv_journal/versioned.rs | 13 ++++--- bd-resilient-kv/src/versioned_kv_store.rs | 2 +- bd-resilient-kv/src/versioned_recovery.rs | 2 +- 4 files changed, 30 insertions(+), 25 deletions(-) diff --git a/bd-resilient-kv/VERSIONED_FORMAT.md b/bd-resilient-kv/VERSIONED_FORMAT.md index 3d7d9951..3e22e657 100644 --- a/bd-resilient-kv/VERSIONED_FORMAT.md +++ b/bd-resilient-kv/VERSIONED_FORMAT.md @@ -23,7 +23,7 @@ The format is built on top of BONJSON, a binary JSON format that provides effici ### 1. Active Journal (`my_store.jrn`) The current active journal receiving new writes. Active journals are **not compressed** for performance reasons. -### 2. Archived Journals (`my_store.jrn.t1699564900000000000.zz`, etc.) +### 2. Archived Journals (`my_store.jrn.t1699564900000000.zz`, etc.) Previous journals, archived during rotation. Each contains complete state at its creation time plus subsequent incremental writes. The timestamp in the filename indicates the rotation/snapshot timestamp. **Archived journals are automatically compressed using zlib** (indicated by the `.zz` extension) to reduce storage space and bandwidth requirements for remote backup. Compression is mandatory and occurs automatically during rotation. 
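With rotation timestamps now encoded in microseconds in the archive name, tooling that scans a directory of archives needs to recover that value from the filename. Below is a minimal sketch under the `.t{timestamp}.zz` naming convention described above; `parse_rotation_timestamp` is a hypothetical helper for illustration, not a function exported by the crate.

```rust
/// Illustrative sketch: pull the rotation timestamp (µs since UNIX epoch) out of
/// an archived journal name such as `my_store.jrn.t1699564900000000.zz`.
fn parse_rotation_timestamp(file_name: &str) -> Option<u64> {
    let stem = file_name.strip_suffix(".zz")?; // drop the compression extension
    let (_, digits) = stem.rsplit_once(".t")?; // digits after the final ".t"
    digits.parse().ok()
}

fn main() {
    assert_eq!(
        parse_rotation_timestamp("my_store.jrn.t1699564900000000.zz"),
        Some(1_699_564_900_000_000)
    );
}
```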
@@ -48,7 +48,7 @@ The byte-level layout of a VERSION 2 journal file: │ (First entry in the array) │ ├─────────────────────────────────────────────────────────────────────────┤ │ { │ -│ "initialized": 1699564800000000000, // u64 timestamp (ns) │ +│ "initialized": 1699564800000000, // u64 timestamp (us) │ │ "format_version": 2 // Format identifier │ │ } │ └─────────────────────────────────────────────────────────────────────────┘ @@ -58,7 +58,7 @@ The byte-level layout of a VERSION 2 journal file: │ (BONJSON Object) │ ├─────────────────────────────────────────────────────────────────────────┤ │ { │ -│ "t": 1699564801000000000, // Timestamp in ns (u64) │ +│ "t": 1699564801000000, // Timestamp in us (u64) │ │ "k": "key1", // Key (string) │ │ "o": "value1" // Value or null (any type) │ │ } │ @@ -80,7 +80,7 @@ The first entry in the array is always a metadata object: ```json { - "initialized": , // Creation timestamp (nanoseconds since epoch) + "initialized": , // Creation timestamp (microseconds since epoch) "format_version": 2 // Must be 2 for this format } ``` @@ -91,14 +91,14 @@ Each subsequent entry follows this uniform schema: ```json { - "t": , // Timestamp in nanoseconds (monotonically non-decreasing, serves as version) + "t": , // Timestamp in microseconds (monotonically non-decreasing, serves as version) "k": "", // Key being modified "o": // Value for SET, null for DELETE } ``` Fields: -- `t` (timestamp): Monotonically non-decreasing timestamp (ns since UNIX epoch) that serves as both the write time and version identifier +- `t` (timestamp): Monotonically non-decreasing timestamp (microseconds since UNIX epoch) that serves as both the write time and version identifier - `k` (key): The key being written - `o` (operation): The value (for SET) or null (for DELETE) @@ -113,10 +113,10 @@ Timestamps are monotonically non-decreasing, not strictly increasing. If the sys **Size Considerations:** - **Header**: Fixed 17 bytes -- **Metadata**: ~80-100 bytes (depending on timestamp magnitude) +- **Metadata**: ~50-70 bytes (depending on timestamp magnitude) - **Per Entry**: Varies based on key and value size - - Minimum: ~50 bytes (short key, small value) - - Typical: 100-500 bytes + - Minimum: ~30 bytes (short key, small value) + - Typical: 70-470 bytes - Maximum: Limited by buffer size ## Journal Structure @@ -124,21 +124,21 @@ Timestamps are monotonically non-decreasing, not strictly increasing. If the sys ### Initial Journal When first created: ```json -{"initialized": 1699564800000000000, "format_version": 2} -{"t": 1699564801000000000, "k": "key1", "o": "value1"} -{"t": 1699564802000000000, "k": "key2", "o": "value2"} +{"initialized": 1699564800000000, "format_version": 2} +{"t": 1699564801000000, "k": "key1", "o": "value1"} +{"t": 1699564802000000, "k": "key2", "o": "value2"} ... 
``` ### Rotated Journal -After rotation at timestamp 1699564900000000000, the new journal contains: +After rotation at timestamp 1699564900000000, the new journal contains: ```json -{"initialized": 1699564900000000000, "format_version": 2} -{"t": 1699564800123456789, "k": "key1", "o": "value1"} // Compacted state (original timestamp preserved) -{"t": 1699564850987654321, "k": "key2", "o": "value2"} // Compacted state (original timestamp preserved) -{"t": 1699564875111222333, "k": "key3", "o": "value3"} // Compacted state (original timestamp preserved) -{"t": 1699564901000000000, "k": "key4", "o": "value4"} // New write after rotation -{"t": 1699564902000000000, "k": "key1", "o": "updated1"} // New write after rotation +{"initialized": 1699564900000000, "format_version": 2} +{"t": 1699564800123456, "k": "key1", "o": "value1"} // Compacted state (original timestamp preserved) +{"t": 1699564850987654, "k": "key2", "o": "value2"} // Compacted state (original timestamp preserved) +{"t": 1699564875111222, "k": "key3", "o": "value3"} // Compacted state (original timestamp preserved) +{"t": 1699564901000000, "k": "key4", "o": "value4"} // New write after rotation +{"t": 1699564902000000, "k": "key1", "o": "updated1"} // New write after rotation ... ``` diff --git a/bd-resilient-kv/src/kv_journal/versioned.rs b/bd-resilient-kv/src/kv_journal/versioned.rs index 8e4f1506..6709019b 100644 --- a/bd-resilient-kv/src/kv_journal/versioned.rs +++ b/bd-resilient-kv/src/kv_journal/versioned.rs @@ -19,14 +19,14 @@ use std::time::{SystemTime, UNIX_EPOCH}; pub struct TimestampedValue { /// The value stored in the key-value store. pub value: Value, - /// The timestamp (in nanoseconds since UNIX epoch) when this value was last written. + /// The timestamp (in microseconds since UNIX epoch) when this value was last written. pub timestamp: u64, } /// Timestamped implementation of a key-value journaling system that uses timestamps /// as the version identifier for point-in-time recovery. /// -/// Each write operation is assigned a monotonically non-decreasing timestamp (in nanoseconds +/// Each write operation is assigned a monotonically non-decreasing timestamp (in microseconds /// since UNIX epoch), enabling exact state reconstruction at any historical timestamp. /// The monotonicity is enforced by clamping: if the system clock goes backwards, we reuse /// the same timestamp value to maintain ordering guarantees. When timestamps collide, @@ -91,12 +91,17 @@ fn read_u64_field(obj: &AHashMap, key: &str) -> Option { } } -/// Get current timestamp in nanoseconds since UNIX epoch. +/// Get current timestamp in microseconds since UNIX epoch. 
fn current_timestamp() -> anyhow::Result { SystemTime::now() .duration_since(UNIX_EPOCH) .map_err(|_| InvariantError::Invariant.into()) - .and_then(|d| u64::try_from(d.as_nanos()).map_err(|_| InvariantError::Invariant.into())) + .map(|d| { + #[allow(clippy::cast_possible_truncation)] + { + d.as_micros() as u64 + } + }) } diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index 37ea8102..160c1118 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -45,7 +45,7 @@ async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result /// /// # Timestamp Semantics /// -/// Timestamps are monotonically increasing logical clocks (nanoseconds since UNIX epoch): +/// Timestamps are monotonically increasing logical clocks (microseconds since UNIX epoch): /// - Each write gets a timestamp >= all previous writes /// - If system clock goes backward, timestamps are clamped to maintain ordering /// - Multiple operations may share the same timestamp if system clock hasn't advanced diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_recovery.rs index 97ff8728..4b8bf654 100644 --- a/bd-resilient-kv/src/versioned_recovery.rs +++ b/bd-resilient-kv/src/versioned_recovery.rs @@ -96,7 +96,7 @@ impl VersionedRecovery { /// /// # Arguments /// - /// * `target_timestamp` - The timestamp (in nanoseconds since UNIX epoch) to recover state at + /// * `target_timestamp` - The timestamp (in microseconds since UNIX epoch) to recover state at /// /// # Returns /// From 7f16d98d16a2a1d5ad62152f4bdf324fd89881dd Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 20:42:40 -0800 Subject: [PATCH 33/66] revert readme --- bd-resilient-kv/README.md | 235 +------------------------------------- 1 file changed, 5 insertions(+), 230 deletions(-) diff --git a/bd-resilient-kv/README.md b/bd-resilient-kv/README.md index b76d3fcc..f9bf82f3 100644 --- a/bd-resilient-kv/README.md +++ b/bd-resilient-kv/README.md @@ -15,7 +15,6 @@ A crash-resilient key-value store library for Rust with automatic persistence, c - **🔄 Self-Managing**: Automatic high water mark detection and buffer switching - **🎯 Simple API**: HashMap-like interface that's easy to use - **🏗️ JSON-like Values**: Built on `bd-bonjson` for flexible value types -- **📊 Version Tracking**: Optional versioned store with point-in-time recovery and automatic journal rotation ## Quick Start @@ -27,17 +26,6 @@ bd-resilient-kv = { path = "path/to/bd-resilient-kv" } bd-bonjson = { path = "path/to/bd-bonjson" } ``` -### Choosing Between KVStore and VersionedKVStore - -**KVStore**: Use for general key-value storage with automatic compaction -- Best for: Configuration storage, caches, general-purpose persistence -- Features: Double-buffered journaling, automatic compaction, high performance - -**VersionedKVStore**: Use when you need version tracking -- Best for: Audit logs, state history, remote backup -- Features: Every write operation returns a version number, automatic rotation -- See: [VERSIONED_FORMAT.md](./VERSIONED_FORMAT.md) for detailed format documentation - ### Basic Usage ```rust @@ -186,67 +174,9 @@ fn main() -> anyhow::Result<()> { } ``` -## Versioned Key-Value Store - -For applications that require version tracking, audit logs, or point-in-time recovery, use `VersionedKVStore`: - -```rust -use bd_resilient_kv::VersionedKVStore; -use bd_bonjson::Value; - -#[tokio::main] -async fn main() -> anyhow::Result<()> { - // 
Create a versioned store with automatic rotation at 1MB - let mut store = VersionedKVStore::new( - ".", // Directory path - "versioned_store", // Journal name - 1024 * 1024, // Rotate when journal reaches 1MB - None // Optional high water mark ratio - )?; - - // Write operations are async and return version numbers - let v1 = store.insert("config".to_string(), Value::String("v1".to_string())).await?; - println!("Inserted at version: {}", v1); - - let v2 = store.insert("config".to_string(), Value::String("v2".to_string())).await?; - println!("Updated at version: {}", v2); - - // Read current state (O(1) from cache) - assert_eq!(store.get("config"), Some(&Value::String("v2".to_string()))); - - // Removing a key is also async and returns a version - let v3 = store.remove("config").await?; - if let Some(version) = v3 { - println!("Removed at version: {}", version); - } - - Ok(()) -} -``` - -### Key Features of VersionedKVStore - -- **Async API**: Write operations (`insert()`, `remove()`, `rotate_journal()`) are async and require a Tokio runtime -- **Version Tracking**: Every `insert()` and `remove()` returns a monotonically increasing version number -- **Timestamp Preservation**: Write timestamps are internally tracked and preserved during journal rotation for recovery purposes -- **Automatic Rotation**: When the journal exceeds the high water mark, it automatically: - - Creates a new journal with the current state as versioned entries (compaction) - - Preserves original timestamps from the initial writes - - Archives the old journal with `.t{timestamp}.zz` suffix - - Compresses the archived journal using zlib (RFC 1950, level 5) asynchronously -- **Automatic Compression**: Archived journals are automatically compressed to save disk space - - Active journals remain uncompressed for write performance - - Typically achieves >50% size reduction for text-based data - - Transparent decompression during recovery operations - - Compression is performed asynchronously using streaming I/O -- **O(1) Reads**: In-memory cache provides constant-time access to current state -- **Persistent**: Uses memory-mapped journals for crash-resilient storage - -See [VERSIONED_FORMAT.md](./VERSIONED_FORMAT.md) for detailed format documentation and recovery scenarios. - ## API Reference -### KVStore (Standard Key-Value Store) +### KVStore The main interface for the key-value store. @@ -282,58 +212,11 @@ pub fn remove(&mut self, key: &str) -> anyhow::Result> pub fn clear(&mut self) -> anyhow::Result<()> ``` -### VersionedKVStore (Version-Tracked Key-Value Store) +## Architecture -A higher-level store that tracks versions for every write operation and supports point-in-time recovery. Write operations are async and require a Tokio runtime. +### Double-Buffered Journaling -#### Constructor - -```rust -pub fn new>( - dir_path: P, - name: &str, - buffer_size: usize, - high_water_mark_ratio: Option -) -> anyhow::Result -``` - -- `dir_path`: Directory path where the journal will be stored -- `name`: Base name for the journal (e.g., "store" will create "store.jrn") -- `buffer_size`: Size in bytes for the journal buffer -- `high_water_mark_ratio`: Optional ratio (0.0 to 1.0) for high water mark. 
Default: 0.8 - -#### Core Methods - -```rust -// Read operations (O(1) from cache, synchronous) -pub fn get(&self, key: &str) -> Option<&Value> -pub fn contains_key(&self, key: &str) -> bool -pub fn len(&self) -> usize -pub fn is_empty(&self) -> bool -pub fn as_hashmap(&self) -> HashMap - -// Write operations (async, return version numbers) -pub async fn insert(&mut self, key: String, value: Value) -> anyhow::Result -pub async fn remove(&mut self, key: &str) -> anyhow::Result> - -// Manual rotation (async) -pub async fn rotate_journal(&mut self) -> anyhow::Result<()> - -// Version information (synchronous) -pub fn current_version(&self) -> u64 -``` - -**Internal Timestamp Tracking**: The store internally tracks timestamps for all writes and preserves them during journal rotation. These timestamps are used for recovery and point-in-time operations but are not exposed in the primary API. For advanced use cases requiring timestamp access, the `get_with_timestamp()` method is available. - -#### Core Methods - -### Storage Models - -The library provides two storage architectures: - -#### 1. Double-Buffered Journaling (KVStore) - -The standard store uses a double-buffered approach with two journal files: +The store uses a double-buffered approach with two journal files: 1. **Active Journal**: Receives new writes 2. **Inactive Journal**: Standby for compression @@ -342,33 +225,6 @@ The standard store uses a double-buffered approach with two journal files: - Switches the inactive journal to become the new active journal - Resets the old active journal for future use -#### 2. Versioned Single-Journal (VersionedKVStore) - -The versioned store uses a different architecture optimized for version tracking: - -1. **Single Active Journal**: All writes go to one journal file -2. **Version Tracking**: Every entry includes a monotonically increasing version number -3. **Automatic Rotation**: When the journal reaches the high water mark: - - Current state is serialized as versioned entries into a new journal - - Old journal is archived with `.t{timestamp}` suffix (e.g., `store.jrn.t1699564800000000000`) -4. 
**Point-in-Time Recovery**: Journal can be replayed up to any previous timestamp - -**Rotation Strategy**: -``` -Before rotation: - my_store.jrn (1MB, multiple timestamped entries) - -After rotation: - my_store.jrn (compacted, new entries with fresh timestamps) - my_store.jrn.t1699564800000000000.zz (archived, compressed, readonly) -``` - -**Compression**: -- Archived journals are automatically compressed using zlib (RFC 1950, level 5) -- Active journals remain uncompressed for optimal write performance -- Decompression is handled transparently during recovery -- File extension `.zz` indicates compressed archives - ### Memory-Mapped I/O - Uses `memmap2` for efficient file operations @@ -377,21 +233,13 @@ After rotation: ### Caching Strategy -Both `KVStore` and `VersionedKVStore` use the same caching approach: - - Maintains an in-memory `HashMap` cache of all key-value pairs - Cache is always kept in sync with the persistent state - Provides O(1) read performance - Write operations update both cache and journal -**VersionedKVStore Additions**: -- Maintains current version counter -- Can reconstruct state at any historical version by replaying journal entries - ## Performance Characteristics -### KVStore (Standard) - | Operation | Time Complexity | Notes | |------------------|-----------------|---------------------------------| | `get()` | O(1) | Reads from in-memory cache | @@ -402,19 +250,6 @@ Both `KVStore` and `VersionedKVStore` use the same caching approach: | `as_hashmap()` | O(1) | Returns reference to cache | | `clear()` | O(1) | Efficient journal clearing | -### VersionedKVStore (With Version Tracking) - -| Operation | Time Complexity | Notes | -|--------------------|-----------------|-------------------------------------| -| `get()` | O(1) | Reads from in-memory cache | -| `insert()` | O(1) amortized | Async journal write + cache + version | -| `remove()` | O(1) amortized | Async journal write + cache + version | -| `contains_key()` | O(1) | Cache lookup | -| `len()` | O(1) | Cache size | -| `as_hashmap()` | O(n) | Creates temporary map of values | -| `rotate_journal()` | O(n) | Async - serializes current state to new journal | -| `current_version()`| O(1) | Returns version counter | - ## Error Handling All write operations return `anyhow::Result` for comprehensive error handling, while read operations return values directly from the cache: @@ -437,8 +272,6 @@ fn main() -> anyhow::Result<()> { ## File Management -### KVStore Files - The library automatically manages journal files: - **Creation**: Files are created if they don't exist @@ -453,25 +286,9 @@ my_store.jrna # Journal A my_store.jrnb # Journal B ``` -### VersionedKVStore Files - -The versioned store manages a single journal with archived versions: - -- **Active Journal**: Current journal file (e.g., `my_store.jrn`) -- **Archived Journals**: Previous versions with `.t{timestamp}` suffix -- **Automatic Archival**: Old journals are preserved during rotation - -Example file structure after multiple rotations: -``` -my_store.jrn # Active journal (current, uncompressed) -my_store.jrn.t1699564800000000000.zz # Archived (compressed) -my_store.jrn.t1699651200000000000.zz # Archived (compressed) -my_store.jrn.t1699737600000000000.zz # Archived (compressed) -``` - ## Thread Safety -Both `KVStore` and `VersionedKVStore` are **not** thread-safe by design for maximum performance. For concurrent access, wrap them in appropriate synchronization primitives: +`KVStore` is **not** thread-safe by design for maximum performance. 
For concurrent access, wrap it in appropriate synchronization primitives: ```rust use std::sync::{Arc, Mutex}; @@ -486,48 +303,6 @@ let store = Arc::new(Mutex::new( ## Advanced Usage -### Archived Journal Compression - -**VersionedKVStore** automatically compresses archived journals asynchronously to save disk space: - -```rust -use bd_resilient_kv::VersionedKVStore; -use bd_bonjson::Value; - -#[tokio::main] -async fn main() -> anyhow::Result<()> { - let mut store = VersionedKVStore::new( - ".", // Directory path - "my_store", // Journal name - 512 * 1024, // 512KB rotation threshold - None - )?; - - // Write data that will trigger rotation - for i in 0..10000 { - store.insert(format!("key_{}", i), Value::Integer(i as i64)).await?; - } - - // After rotation, archived journals are automatically compressed: - // - my_store.jrn (active, uncompressed) - // - my_store.jrn.t1699564800000000000.zz (archived, compressed with zlib asynchronously) - - Ok(()) -} -``` - -**Compression Details**: -- **Format**: zlib (RFC 1950) with compression level 5 -- **Performance**: Balanced speed/compression ratio, performed asynchronously with streaming I/O -- **Transparency**: Recovery automatically detects and decompresses archived journals -- **Naming**: `.zz` extension indicates compressed archives -- **Typical Savings**: >50% size reduction for text-based data - -**Active vs Archived**: -- Active journals remain **uncompressed** for maximum write performance -- Only archived journals are compressed during rotation (asynchronously) -- No configuration needed - compression is automatic - ### Custom Buffer Sizes Choose buffer sizes based on your use case: From 5311501d10124704324cfa1a97e9be0e6c042d70 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 20:44:10 -0800 Subject: [PATCH 34/66] more cleanup --- bd-resilient-kv/src/versioned_recovery.rs | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_recovery.rs index 4b8bf654..70d038ec 100644 --- a/bd-resilient-kv/src/versioned_recovery.rs +++ b/bd-resilient-kv/src/versioned_recovery.rs @@ -21,19 +21,6 @@ use bd_bonjson::decoder::from_slice; /// Recovery works exclusively with journal snapshots - complete archived journals created /// during rotation. Each snapshot contains the full compacted state at the time of rotation, /// with all entries preserving their original timestamps. -/// -/// Recovery replays snapshot entries in chronological order up to the target timestamp. -/// Since entry timestamps may overlap across adjacent snapshots, recovery handles this by -/// replaying snapshots sequentially and applying entries in timestamp order. -/// -/// ## Optimization -/// -/// To recover the current state, only the last snapshot needs to be read since each snapshot -/// contains the complete compacted state at rotation time. For historical timestamp recovery, -/// the utility automatically identifies and replays only the necessary snapshots. -/// -/// **Note:** Callers are responsible for decompressing snapshot data if needed before passing -/// it to this utility. 
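The paragraphs removed above describe the replay rule that recovery still follows: decode the snapshot entries, apply them in timestamp order, and stop once the target timestamp is passed. As a minimal, self-contained sketch of that rule (illustrative only, not the `VersionedRecovery` implementation; it assumes the snapshots have already been decompressed and decoded into plain `(timestamp, key, value)` tuples, with `None` standing in for a delete):

```rust
use std::collections::HashMap;

/// One decoded journal entry: (timestamp_us, key, value); `None` means the key was removed.
type Entry = (u64, String, Option<String>);

/// Rebuild the state visible at `target_ts` by replaying entries in timestamp order.
/// Entries newer than `target_ts` are ignored, mirroring the recovery rule above.
fn replay_until(mut entries: Vec<Entry>, target_ts: u64) -> HashMap<String, String> {
    // Entries from adjacent snapshots may interleave, so sort by timestamp first.
    entries.sort_by_key(|(ts, _, _)| *ts);

    let mut state = HashMap::new();
    for (ts, key, value) in entries {
        if ts > target_ts {
            break; // everything past the target timestamp is excluded
        }
        match value {
            Some(v) => {
                state.insert(key, v);
            },
            None => {
                state.remove(&key);
            },
        }
    }
    state
}

fn main() {
    let entries = vec![
        (100, "a".to_string(), Some("1".to_string())),
        (200, "b".to_string(), Some("2".to_string())),
        (300, "a".to_string(), None), // "a" deleted at t=300
    ];
    // Recovering at t=250 still sees both keys; at t=300 "a" is gone.
    assert_eq!(replay_until(entries.clone(), 250).len(), 2);
    assert_eq!(replay_until(entries, 300).len(), 1);
}
```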
#[derive(Debug)] pub struct VersionedRecovery { snapshots: Vec, From 421cd8beee249ac5453daa2e2ccf4a91433c0311 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 20:46:46 -0800 Subject: [PATCH 35/66] remove --- bd-resilient-kv/src/versioned_kv_store.rs | 33 +---------------------- 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index 160c1118..ebace364 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -36,24 +36,6 @@ async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result /// A persistent key-value store with timestamp tracking. /// /// `VersionedKVStore` provides HashMap-like semantics backed by a timestamped journal that -/// assigns a monotonically increasing timestamp to each write operation. This enables: -/// - Audit logs with timestamp tracking for every write (timestamps serve as logical clocks) -/// - Point-in-time recovery at any historical timestamp -/// - Correlation with external timestamped event streams -/// - Automatic journal rotation when high water mark is reached -/// - Optional callbacks for post-rotation operations (e.g., remote backup) -/// -/// # Timestamp Semantics -/// -/// Timestamps are monotonically increasing logical clocks (microseconds since UNIX epoch): -/// - Each write gets a timestamp >= all previous writes -/// - If system clock goes backward, timestamps are clamped to maintain ordering -/// - Multiple operations may share the same timestamp if system clock hasn't advanced -/// - Enables natural correlation with timestamped event buffers for upload -/// -/// For performance optimization, `VersionedKVStore` maintains an in-memory cache of the -/// current key-value data to provide O(1) read operations and avoid expensive journal -/// decoding on every access. /// /// # Rotation Strategy /// @@ -61,21 +43,8 @@ async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result /// The rotation process creates a snapshot of the current state while preserving timestamp /// semantics for accurate point-in-time recovery. /// -/// ## Rotation Process -/// 1. Computes `rotation_timestamp` = max timestamp of all current entries -/// 2. Archives old journal as `.jrn.t.zz` (compressed) -/// 3. Creates new journal with compacted state -/// 4. Writes compacted state with **original timestamps preserved** -/// 5. Continues normal operations in the new journal -/// -/// ## Timestamp Semantics Across Snapshots -/// -/// Compacted entries in the new journal preserve their original timestamps, which means entry -/// timestamps may overlap across adjacent snapshots. The filename timestamp (`t300`, `t500`) -/// represents the rotation point (snapshot boundary), not the minimum timestamp of entries. -/// /// For detailed information about timestamp semantics, recovery bucketing, and invariants, -/// see the `VersionedRecovery` documentation. +/// see the `VERSIONED_FORMAT.md` documentation. 
pub struct VersionedKVStore { journal: MemMappedVersionedKVJournal, cached_map: AHashMap, From 6e58b884d0d315dee22ea3f36199456db44fbc42 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 21:04:45 -0800 Subject: [PATCH 36/66] clean up tests --- bd-resilient-kv/src/tests/mod.rs | 11 + .../src/tests/versioned_kv_store_test.rs | 419 +++--------------- .../src/tests/versioned_recovery_test.rs | 13 +- 3 files changed, 73 insertions(+), 370 deletions(-) diff --git a/bd-resilient-kv/src/tests/mod.rs b/bd-resilient-kv/src/tests/mod.rs index 7d838890..af549ca9 100644 --- a/bd-resilient-kv/src/tests/mod.rs +++ b/bd-resilient-kv/src/tests/mod.rs @@ -35,3 +35,14 @@ pub mod kv_test; pub mod memmapped_test; pub mod versioned_kv_store_test; pub mod versioned_recovery_test; + +/// Helper function to decompress zlib-compressed data. +pub fn decompress_zlib(data: &[u8]) -> anyhow::Result> { + use flate2::read::ZlibDecoder; + use std::io::Read; + + let mut decoder = ZlibDecoder::new(data); + let mut decompressed = Vec::new(); + decoder.read_to_end(&mut decompressed)?; + Ok(decompressed) +} diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index 0f98f0b2..9d363de6 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -8,11 +8,13 @@ #![allow(clippy::unwrap_used)] use crate::VersionedKVStore; +use crate::kv_journal::TimestampedValue; +use crate::tests::decompress_zlib; use bd_bonjson::Value; use tempfile::TempDir; #[test] -fn test_versioned_store_new() -> anyhow::Result<()> { +fn empty_store() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; let store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; @@ -21,85 +23,13 @@ fn test_versioned_store_new() -> anyhow::Result<()> { assert!(store.is_empty()); assert_eq!(store.len(), 0); - Ok(()) -} - -#[tokio::test] -async fn test_timestamp_collision_on_clamping() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - - // Insert first value - this establishes a timestamp - store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - let ts1 = store - .get_with_timestamp("key1") - .map(|tv| tv.timestamp) - .unwrap(); - - // Perform rapid successive writes - these might share timestamps if system clock hasn't advanced - store - .insert("key2".to_string(), Value::String("value2".to_string())) - .await?; - store - .insert("key3".to_string(), Value::String("value3".to_string())) - .await?; - store - .insert("key4".to_string(), Value::String("value4".to_string())) - .await?; - - let ts2 = store - .get_with_timestamp("key2") - .map(|tv| tv.timestamp) - .unwrap(); - let ts3 = store - .get_with_timestamp("key3") - .map(|tv| tv.timestamp) - .unwrap(); - let ts4 = store - .get_with_timestamp("key4") - .map(|tv| tv.timestamp) - .unwrap(); - - // Verify monotonicity: timestamps should never decrease - assert!( - ts2 >= ts1, - "Timestamps should be monotonically non-decreasing" - ); - assert!( - ts3 >= ts2, - "Timestamps should be monotonically non-decreasing" - ); - assert!( - ts4 >= ts3, - "Timestamps should be monotonically non-decreasing" - ); - - // Document that timestamps CAN be equal (this is the key difference from the old +1 behavior) - // When system clock doesn't advance or goes backwards, we reuse the same timestamp - // This is acceptable because version numbers provide total ordering 
- - // Count unique timestamps - with rapid operations, we might have collisions - let timestamps = [ts1, ts2, ts3, ts4]; - let unique_count = timestamps - .iter() - .collect::>() - .len(); - - // We should have at least 1 unique timestamp (all could be the same in extreme cases) - assert!( - unique_count >= 1 && unique_count <= 4, - "Should have 1-4 unique timestamps, got {}", - unique_count - ); + assert!(temp_dir.path().join("test.jrn").exists()); Ok(()) } - #[tokio::test] -async fn test_versioned_store_remove() -> anyhow::Result<()> { +async fn basic_crud() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; @@ -129,6 +59,14 @@ async fn test_versioned_store_remove() -> anyhow::Result<()> { let removed = store.remove("nonexistent").await?; assert!(removed.is_none()); + // Read back existing key + let val = store.get("key2"); + assert_eq!(val, Some(&Value::String("value2".to_string()))); + + // Read non-existent key + let val = store.get("key1"); + assert_eq!(val, None); + Ok(()) } @@ -138,24 +76,35 @@ async fn test_persistence_and_reload() -> anyhow::Result<()> { // Create store and write some data - { + let (ts1, ts2) = { let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - let _ts1 = store + let ts1 = store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; - let _ts2 = store.insert("key2".to_string(), Value::Signed(42)).await?; + let ts2 = store.insert("key2".to_string(), Value::Signed(42)).await?; store.sync()?; - } + + (ts1, ts2) + }; // Reopen and verify data persisted { let store = VersionedKVStore::open_existing(temp_dir.path(), "test", 4096, None)?; assert_eq!(store.len(), 2); assert_eq!( - store.get("key1"), - Some(&Value::String("value1".to_string())) + store.get_with_timestamp("key1"), + Some(&TimestampedValue { + value: Value::String("value1".to_string()), + timestamp: ts1, + }) + ); + assert_eq!( + store.get_with_timestamp("key2"), + Some(&TimestampedValue { + value: Value::Signed(42), + timestamp: ts2, + }) ); - assert_eq!(store.get("key2"), Some(&Value::Signed(42))); } Ok(()) @@ -233,6 +182,18 @@ async fn test_manual_rotation() -> anyhow::Result<()> { Some(&Value::String("value3".to_string())) ); + // Decompress the archive and load it as a Store to verify that it contains the old state. + let snapshot_store = make_store_from_snapshot_file(&temp_dir, &archived_path)?; + assert_eq!( + snapshot_store.get("key1"), + Some(&Value::String("value1".to_string())) + ); + assert_eq!( + snapshot_store.get("key2"), + Some(&Value::String("value2".to_string())) + ); + assert_eq!(snapshot_store.len(), 2); + Ok(()) } @@ -294,45 +255,6 @@ async fn test_empty_store_operations() -> anyhow::Result<()> { Ok(()) } -#[tokio::test] -async fn test_version_monotonicity() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - - let mut last_timestamp = 0u64; - - // Perform various operations and ensure timestamp always increases - for i in 0 .. 20 { - let op_timestamp = if i % 3 == 0 { - store - .insert(format!("key{}", i), Value::Signed(i as i64)) - .await? - } else if i % 3 == 1 { - store - .insert( - format!("key{}", i / 3), - Value::String(format!("updated{}", i)), - ) - .await? - } else { - store - .remove(&format!("key{}", i / 3)) - .await? 
- .unwrap_or(last_timestamp) - }; - - assert!( - op_timestamp >= last_timestamp, - "Timestamp should be monotonically non-decreasing" - ); - last_timestamp = op_timestamp; - } - - Ok(()) -} - #[tokio::test] async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; @@ -342,24 +264,16 @@ async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { let mut store = VersionedKVStore::new(temp_dir.path(), "test", 2048, Some(0.5))?; // Insert some keys and capture their timestamps - store + let ts1 = store .insert("key1".to_string(), Value::String("value1".to_string())) .await?; - let ts1 = store - .get_with_timestamp("key1") - .map(|tv| tv.timestamp) - .unwrap(); // Small sleep to ensure different timestamps std::thread::sleep(std::time::Duration::from_millis(10)); - store + let ts2 = store .insert("key2".to_string(), Value::String("value2".to_string())) .await?; - let ts2 = store - .get_with_timestamp("key2") - .map(|tv| tv.timestamp) - .unwrap(); // Verify timestamps are different assert_ne!(ts1, ts2, "Timestamps should be different"); @@ -400,233 +314,7 @@ async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { } #[tokio::test] -async fn test_timestamp_monotonicity() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - - // Track timestamps across multiple writes - let mut timestamps = Vec::new(); - - // Perform multiple writes and collect their timestamps - for i in 0 .. 20 { - store - .insert(format!("key{}", i), Value::Signed(i as i64)) - .await?; - - let ts = store - .get_with_timestamp(&format!("key{}", i)) - .map(|tv| tv.timestamp) - .unwrap(); - - timestamps.push(ts); - } - - // Verify all timestamps are monotonically increasing - for i in 1 .. 
timestamps.len() { - assert!( - timestamps[i] >= timestamps[i - 1], - "Timestamp at index {} ({}) should be >= timestamp at index {} ({})", - i, - timestamps[i], - i - 1, - timestamps[i - 1] - ); - } - - // Verify that timestamps are actually different (at least some of them) - // This ensures we're not just assigning the same timestamp to everything - let unique_timestamps: std::collections::HashSet<_> = timestamps.iter().collect(); - assert!( - unique_timestamps.len() > 1, - "Expected multiple unique timestamps, got only {}", - unique_timestamps.len() - ); - - Ok(()) -} - -#[tokio::test] -async fn test_timestamp_monotonicity_across_rotation() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - - // Write before rotation - store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - let ts1 = store - .get_with_timestamp("key1") - .map(|tv| tv.timestamp) - .unwrap(); - - std::thread::sleep(std::time::Duration::from_millis(10)); - - store - .insert("key2".to_string(), Value::String("value2".to_string())) - .await?; - let ts2 = store - .get_with_timestamp("key2") - .map(|tv| tv.timestamp) - .unwrap(); - - // Rotate journal - store.rotate_journal().await?; - - std::thread::sleep(std::time::Duration::from_millis(10)); - - // Write after rotation - store - .insert("key3".to_string(), Value::String("value3".to_string())) - .await?; - let ts3 = store - .get_with_timestamp("key3") - .map(|tv| tv.timestamp) - .unwrap(); - - std::thread::sleep(std::time::Duration::from_millis(10)); - - store - .insert("key4".to_string(), Value::String("value4".to_string())) - .await?; - let ts4 = store - .get_with_timestamp("key4") - .map(|tv| tv.timestamp) - .unwrap(); - - // Verify monotonicity across rotation boundary - assert!(ts2 >= ts1, "ts2 should be >= ts1"); - assert!(ts3 >= ts2, "ts3 should be >= ts2 (across rotation)"); - assert!(ts4 >= ts3, "ts4 should be >= ts3"); - - // Verify ordering - let timestamps = [ts1, ts2, ts3, ts4]; - for i in 1 .. 
timestamps.len() { - assert!( - timestamps[i] >= timestamps[i - 1], - "Timestamp monotonicity violated at index {}: {} < {}", - i, - timestamps[i], - timestamps[i - 1] - ); - } - - Ok(()) -} - -#[tokio::test] -async fn test_compression_during_rotation() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - - // Insert some data - let data = "x".repeat(1000); // Large value to make compression effective - store - .insert("key1".to_string(), Value::String(data.clone())) - .await?; - store - .insert("key2".to_string(), Value::String(data.clone())) - .await?; - store - .insert("key3".to_string(), Value::String(data)) - .await?; - - // Get size of uncompressed journal before rotation - let uncompressed_size = std::fs::metadata(temp_dir.path().join("test.jrn"))?.len(); - - // Get max timestamp before rotation (this will be used in the archive name) - let rotation_timestamp = store - .get_with_timestamp("key3") - .map(|tv| tv.timestamp) - .unwrap(); - - // Trigger rotation - store.rotate_journal().await?; - - // Verify compressed archive exists - let archived_path = temp_dir - .path() - .join(format!("test.jrn.t{}.zz", rotation_timestamp)); - assert!( - archived_path.exists(), - "Compressed archive should exist at {:?}", - archived_path - ); - - // Verify compressed size is smaller than original - let compressed_size = std::fs::metadata(&archived_path)?.len(); - assert!( - compressed_size < uncompressed_size, - "Compressed size ({}) should be smaller than uncompressed ({})", - compressed_size, - uncompressed_size - ); - - // Verify uncompressed temporary file was deleted - let temp_archive_path = temp_dir.path().join("test.jrn.old"); - assert!( - !temp_archive_path.exists(), - "Temporary uncompressed archive should be deleted" - ); - - // Verify active journal still works - store - .insert("key4".to_string(), Value::String("value4".to_string())) - .await?; - assert_eq!(store.len(), 4); - - Ok(()) -} - -#[tokio::test] -async fn test_compression_ratio() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 8192, None)?; - - // Insert highly compressible data - let compressible_data = "A".repeat(500); - for i in 0 .. 
10 { - store - .insert( - format!("key{}", i), - Value::String(compressible_data.clone()), - ) - .await?; - } - - let uncompressed_size = std::fs::metadata(temp_dir.path().join("test.jrn"))?.len(); - let rotation_timestamp = store - .get_with_timestamp(&format!("key{}", 9)) - .map(|tv| tv.timestamp) - .unwrap(); - - store.rotate_journal().await?; - - let archived_path = temp_dir - .path() - .join(format!("test.jrn.t{}.zz", rotation_timestamp)); - let compressed_size = std::fs::metadata(&archived_path)?.len(); - - // With highly compressible data, we should get significant compression - // Expecting at least 50% compression ratio for repeated characters - #[allow(clippy::cast_precision_loss)] - let compression_ratio = (compressed_size as f64) / (uncompressed_size as f64); - assert!( - compression_ratio < 0.5, - "Compression ratio should be better than 50% for repeated data, got {:.2}%", - compression_ratio * 100.0 - ); - - Ok(()) -} - -#[tokio::test] -async fn test_multiple_rotations_with_compression() -> anyhow::Result<()> { +async fn test_multiple_rotations() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; @@ -659,3 +347,18 @@ async fn test_multiple_rotations_with_compression() -> anyhow::Result<()> { Ok(()) } + +fn make_store_from_snapshot_file( + temp_dir: &TempDir, + snapshot_path: &std::path::Path, +) -> anyhow::Result { + // Decompress the snapshot and journal files into the temp directory + // so we can open them as a store. + let data = std::fs::read(snapshot_path)?; + let decompressed_snapshot = decompress_zlib(&data)?; + std::fs::write(temp_dir.path().join("snapshot.jrn"), decompressed_snapshot)?; + + let store = VersionedKVStore::open_existing(temp_dir.path(), "snapshot", 4096, None)?; + + Ok(store) +} diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index b2f2bc75..5d711a87 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -9,22 +9,11 @@ #![allow(clippy::case_sensitive_file_extension_comparisons)] use crate::VersionedKVStore; +use crate::tests::decompress_zlib; use crate::versioned_recovery::VersionedRecovery; use bd_bonjson::Value; use tempfile::TempDir; -/// Helper function to decompress zlib-compressed data. -/// The `VersionedRecovery` no longer handles compression, so tests must decompress manually. -fn decompress_zlib(data: &[u8]) -> anyhow::Result> { - use flate2::read::ZlibDecoder; - use std::io::Read; - - let mut decoder = ZlibDecoder::new(data); - let mut decompressed = Vec::new(); - decoder.read_to_end(&mut decompressed)?; - Ok(decompressed) -} - /// Helper function to find archived journal files in a directory. /// Returns sorted paths to all `.zz` compressed journal archives. 
fn find_archived_journals(dir: &std::path::Path) -> anyhow::Result> { From 73ed8c3d031320ed4732d81c2d1f94d603fc3531 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Thu, 6 Nov 2025 21:37:17 -0800 Subject: [PATCH 37/66] remove unused --- .../src/tests/versioned_kv_store_test.rs | 4 +-- bd-resilient-kv/src/versioned_kv_store.rs | 30 +------------------ 2 files changed, 3 insertions(+), 31 deletions(-) diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index 9d363de6..057896b1 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -214,7 +214,7 @@ async fn test_rotation_preserves_state() -> anyhow::Result<()> { .insert("key4".to_string(), Value::Float(3.14159)) .await?; - let pre_rotation_state = store.as_hashmap(); + let pre_rotation_state = store.as_hashmap().clone(); let pre_rotation_ts = store .get_with_timestamp("key4") .map(|tv| tv.timestamp) @@ -225,7 +225,7 @@ async fn test_rotation_preserves_state() -> anyhow::Result<()> { // Verify state is preserved exactly let post_rotation_state = store.as_hashmap(); - assert_eq!(pre_rotation_state, post_rotation_state); + assert_eq!(pre_rotation_state, *post_rotation_state); assert_eq!(store.len(), 4); // Verify we can continue writing diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index ebace364..55449ac2 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -243,26 +243,10 @@ impl VersionedKVStore { /// /// This operation is O(1) as it reads from the in-memory cache. #[must_use] - pub fn as_hashmap_with_timestamps(&self) -> &AHashMap { + pub fn as_hashmap(&self) -> &AHashMap { &self.cached_map } - /// Get a reference to the current hash map (values only, without timestamps). - /// - /// Note: This method creates a temporary hashmap. For better performance, - /// consider using `get()` for individual lookups or `as_hashmap_with_timestamps()` - /// if you need the full map with timestamps. - /// - /// This operation is O(n) where n is the number of keys. - #[must_use] - pub fn as_hashmap(&self) -> AHashMap { - self - .cached_map - .iter() - .map(|(k, tv)| (k.clone(), tv.value.clone())) - .collect() - } - /// Synchronize changes to disk. /// /// This is a blocking operation that performs synchronous I/O. In async contexts, @@ -274,18 +258,6 @@ impl VersionedKVStore { self.journal.sync() } - /// Get the current buffer usage ratio (0.0 to 1.0). - #[must_use] - pub fn buffer_usage_ratio(&self) -> f32 { - self.journal.buffer_usage_ratio() - } - - /// Check if the high water mark has been triggered. - #[must_use] - pub fn is_high_water_mark_triggered(&self) -> bool { - self.journal.is_high_water_mark_triggered() - } - /// Manually trigger journal rotation. /// /// This will create a new journal with the current state compacted and archive the old journal. 
From 6e5eb56ba5ff77f99dc16735a11c05577692fdb5 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 06:36:40 -0800 Subject: [PATCH 38/66] use generation --- .../src/tests/versioned_kv_store_test.rs | 2 +- bd-resilient-kv/src/versioned_kv_store.rs | 163 ++++++++++++------ 2 files changed, 114 insertions(+), 51 deletions(-) diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index 057896b1..3f8a0939 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -23,7 +23,7 @@ fn empty_store() -> anyhow::Result<()> { assert!(store.is_empty()); assert_eq!(store.len(), 0); - assert!(temp_dir.path().join("test.jrn").exists()); + assert!(temp_dir.path().join("test.jrn.0").exists()); Ok(()) } diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_store.rs index 55449ac2..50824427 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_store.rs @@ -10,6 +10,47 @@ use ahash::AHashMap; use bd_bonjson::Value; use std::path::{Path, PathBuf}; +/// Find the active journal file by searching for the highest generation number. +/// +/// Returns the path to the journal and its generation number, or None if no journal exists. +/// Supports both legacy journals (`name.jrn`) and generation-based journals (`name.jrn.N`). +fn find_active_journal(dir: &Path, name: &str) -> anyhow::Result> { + // First check for legacy journal format (name.jrn without generation) + let legacy_path = dir.join(format!("{name}.jrn")); + if legacy_path.exists() { + // Migrate legacy journal to generation 0 + let gen_path = dir.join(format!("{name}.jrn.0")); + if !gen_path.exists() { + std::fs::rename(&legacy_path, &gen_path)?; + return Ok(Some((gen_path, 0))); + } + } + + // Search for generation-based journals + let pattern = format!("{name}.jrn."); + + let mut max_gen = None; + for entry in std::fs::read_dir(dir)? { + let entry = entry?; + let filename = entry.file_name(); + let filename_str = filename.to_string_lossy(); + + if let Some(suffix) = filename_str.strip_prefix(&pattern) { + // Parse generation number (before any .zz or other extensions) + if let Some(gen_str) = suffix.split('.').next() + && let Ok(generation) = gen_str.parse::() + { + max_gen = Some(max_gen.map_or(generation, |current: u64| current.max(generation))); + } + } + } + + Ok(max_gen.map(|generation| { + let path = dir.join(format!("{name}.jrn.{generation}")); + (path, generation) + })) +} + /// Compress an archived journal using zlib. /// /// This function compresses the source file to the destination using zlib compression. @@ -52,20 +93,22 @@ pub struct VersionedKVStore { journal_name: String, buffer_size: usize, high_water_mark_ratio: Option, + current_generation: u64, } impl VersionedKVStore { /// Create a new `VersionedKVStore` with the specified directory, name, and buffer size. /// - /// The journal file will be named `.jrn` within the specified directory. - /// If the file already exists, it will be loaded with its existing contents. + /// The journal file will be named `.jrn.N` where N is the generation number. + /// If a journal already exists, it will be loaded with its existing contents. + /// Legacy journals (`.jrn`) are automatically migrated to generation 0. /// If the specified size is larger than an existing file, it will be resized while preserving /// data. 
If the specified size is smaller and the existing data doesn't fit, a fresh journal /// will be created. /// /// # Arguments /// * `dir_path` - Directory path where the journal will be stored - /// * `name` - Base name for the journal (e.g., "store" will create "store.jrn") + /// * `name` - Base name for the journal (e.g., "store" will create "store.jrn.0") /// * `buffer_size` - Size in bytes for the journal buffer /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 /// @@ -78,7 +121,12 @@ impl VersionedKVStore { high_water_mark_ratio: Option, ) -> anyhow::Result { let dir = dir_path.as_ref(); - let journal_path = dir.join(format!("{name}.jrn")); + + // Find or create journal with generation tracking + let (journal_path, generation) = find_active_journal(dir, name)?.unwrap_or_else(|| { + let path = dir.join(format!("{name}.jrn.0")); + (path, 0) + }); let journal = if journal_path.exists() { // Try to open existing journal @@ -101,6 +149,7 @@ impl VersionedKVStore { journal_name: name.to_string(), buffer_size, high_water_mark_ratio, + current_generation: generation, }) } @@ -111,7 +160,7 @@ impl VersionedKVStore { /// /// # Arguments /// * `dir_path` - Directory path where the journal is stored - /// * `name` - Base name of the journal (e.g., "store" for "store.jrn") + /// * `name` - Base name of the journal (e.g., "store" for "store.jrn.N") /// * `buffer_size` - Size in bytes for the journal buffer /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 /// @@ -128,7 +177,10 @@ impl VersionedKVStore { high_water_mark_ratio: Option, ) -> anyhow::Result { let dir = dir_path.as_ref(); - let journal_path = dir.join(format!("{name}.jrn")); + + // Find existing journal (fail if not found) + let (journal_path, generation) = find_active_journal(dir, name)? + .ok_or_else(|| anyhow::anyhow!("No journal file found for '{name}'"))?; let journal = MemMappedVersionedKVJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio)?; @@ -141,14 +193,10 @@ impl VersionedKVStore { journal_name: name.to_string(), buffer_size, high_water_mark_ratio, + current_generation: generation, }) } - /// Get the path to the active journal file. - fn journal_path(&self) -> PathBuf { - self.dir_path.join(format!("{}.jrn", self.journal_name)) - } - /// Get a value by key. /// /// This operation is O(1) as it reads from the in-memory cache. @@ -268,49 +316,60 @@ impl VersionedKVStore { /// # Errors /// Returns an error if rotation fails. pub async fn rotate_journal(&mut self) -> anyhow::Result<()> { - // Get the maximum timestamp from current state for rotation tracking - let rotation_timestamp = self - .cached_map - .values() - .map(|tv| tv.timestamp) - .max() - .unwrap_or(0); + // Increment generation counter for new journal + let next_generation = self.current_generation + 1; + let new_journal_path = self + .dir_path + .join(format!("{}.jrn.{next_generation}", self.journal_name)); - // Generate archived journal path with rotation timestamp (compressed) - let archived_path = self.generate_archived_path(rotation_timestamp); + // Create new journal with compacted state + let new_journal = self.create_rotated_journal(&new_journal_path).await?; - // Create new journal with rotated state - let new_journal = self.create_rotated_journal().await?; - - // Replace old journal with new one + // Replace in-memory journal with new one (critical section - but no file ops!) 
+ // The old journal file remains at the previous generation number let old_journal = std::mem::replace(&mut self.journal, new_journal); + let old_generation = self.current_generation; + self.current_generation = next_generation; - // Move old journal to temporary location - drop(old_journal); // Release mmap before moving file - let journal_path = self.journal_path(); - let temp_uncompressed = self.dir_path.join(format!("{}.jrn.old", self.journal_name)); - tokio::fs::rename(&journal_path, &temp_uncompressed).await?; - - // Rename new journal to base path - let temp_path = self.dir_path.join(format!("{}.jrn.tmp", self.journal_name)); - tokio::fs::rename(&temp_path, &journal_path).await?; + // Drop the old journal to release the mmap + drop(old_journal); - // Compress the archived journal - compress_archived_journal(&temp_uncompressed, &archived_path).await?; - - // Remove uncompressed version - tokio::fs::remove_file(&temp_uncompressed).await?; + // Best-effort cleanup: compress and archive the old journal + let old_journal_path = self + .dir_path + .join(format!("{}.jrn.{old_generation}", self.journal_name)); + self.cleanup_archived_journal(&old_journal_path).await; Ok(()) } - /// Generate the archived journal path for a given rotation timestamp. - /// Archived journals use the .zz extension to indicate zlib compression. - fn generate_archived_path(&self, rotation_timestamp: u64) -> PathBuf { - self.dir_path.join(format!( + /// Clean up after successful rotation (best effort, non-critical). + /// + /// This compresses and removes the old journal. Failures are logged but not propagated. + async fn cleanup_archived_journal(&self, old_journal_path: &Path) { + // Generate archived path with timestamp + let rotation_timestamp = self + .cached_map + .values() + .map(|tv| tv.timestamp) + .max() + .unwrap_or(0); + + let archived_path = self.dir_path.join(format!( "{}.jrn.t{}.zz", self.journal_name, rotation_timestamp - )) + )); + + // Try to compress the old journal + match compress_archived_journal(old_journal_path, &archived_path).await { + Ok(()) => { + // Compression succeeded, remove uncompressed version + let _ = tokio::fs::remove_file(old_journal_path).await; + }, + Err(_e) => { + // Compression failed - keep the uncompressed version as a fallback + }, + } } /// Create a new rotated journal with compacted state. @@ -319,10 +378,10 @@ impl VersionedKVStore { /// journal with the same buffer size and compaction only removes redundant updates (old /// versions of keys), the compacted state is always ≤ the current journal size. If data fits /// during normal operation, it will always fit during rotation. 
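The size argument in the doc comment above boils down to latest-write-wins compaction: each key contributes at most one entry to the rotated journal, so the compacted state can never be larger than the journal it came from. A standalone sketch of that reduction (illustrative only; the real rotation path writes the compacted entries into a fresh memory-mapped journal rather than a `HashMap`):

```rust
use std::collections::HashMap;

/// A journal write: key plus the value recorded for it (None = delete).
type Write = (String, Option<i64>);

/// Latest-write-wins reduction, which is what compaction amounts to conceptually.
fn compact(writes: &[Write]) -> HashMap<String, i64> {
    let mut state = HashMap::new();
    for (key, value) in writes {
        match value {
            Some(v) => {
                state.insert(key.clone(), *v);
            },
            None => {
                state.remove(key);
            },
        }
    }
    state
}

fn main() {
    let writes = vec![
        ("a".to_string(), Some(1)),
        ("b".to_string(), Some(2)),
        ("a".to_string(), Some(3)), // supersedes the first write to "a"
        ("b".to_string(), None),    // delete drops "b" entirely
    ];
    let compacted = compact(&writes);
    // Compaction only ever drops superseded writes, which is why a rotated
    // journal with the same buffer size always has room for the compacted state.
    assert!(compacted.len() <= writes.len());
    assert_eq!(compacted.get("a"), Some(&3));
    assert_eq!(compacted.get("b"), None);
}
```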
- async fn create_rotated_journal(&self) -> anyhow::Result { - // Create temporary journal file - let temp_path = self.dir_path.join(format!("{}.jrn.tmp", self.journal_name)); - + async fn create_rotated_journal( + &self, + journal_path: &Path, + ) -> anyhow::Result { // Create in-memory buffer for new journal let mut buffer = vec![0u8; self.buffer_size]; @@ -333,10 +392,14 @@ impl VersionedKVStore { self.high_water_mark_ratio, )?; - // Write buffer to temporary file - tokio::fs::write(&temp_path, &buffer).await?; + // Write buffer to the new journal path + tokio::fs::write(journal_path, &buffer).await?; // Open as memory-mapped journal - MemMappedVersionedKVJournal::from_file(&temp_path, self.buffer_size, self.high_water_mark_ratio) + MemMappedVersionedKVJournal::from_file( + journal_path, + self.buffer_size, + self.high_water_mark_ratio, + ) } } From 27cdc5dfdc6d58544a4c87d65e1f841b63c9fbb8 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 11:36:49 -0800 Subject: [PATCH 39/66] use proto --- AGENTS.md | 7 +- Cargo.lock | 4 + api | 2 +- bd-proto/build.rs | 14 + bd-proto/src/protos/mod.rs | 1 + bd-proto/src/protos/state/mod.rs | 1 + bd-proto/src/protos/state/payload.rs | 550 +++++++++++++++++ bd-proto/src/protos/workflow/workflow.rs | 58 +- bd-resilient-kv/Cargo.toml | 4 + bd-resilient-kv/src/kv_journal/mod.rs | 4 - bd-resilient-kv/src/kv_journal/versioned.rs | 562 ------------------ bd-resilient-kv/src/lib.rs | 16 +- bd-resilient-kv/src/tests/mod.rs | 11 + .../src/tests/versioned_kv_store_test.rs | 91 ++- .../src/tests/versioned_recovery_test.rs | 121 ++-- .../src/versioned_kv_journal/file_manager.rs | 57 ++ .../src/versioned_kv_journal/framing.rs | 212 +++++++ .../src/versioned_kv_journal/framing_test.rs | 208 +++++++ .../memmapped_versioned.rs | 59 +- .../src/versioned_kv_journal/mod.rs | 28 + .../recovery.rs} | 129 ++-- .../store.rs} | 158 ++--- .../src/versioned_kv_journal/versioned.rs | 365 ++++++++++++ bd-test-helpers/src/workflow/mod.rs | 7 +- bd-workflows/src/config.rs | 15 +- bd-workflows/src/engine_test.rs | 2 +- 26 files changed, 1674 insertions(+), 1012 deletions(-) create mode 100644 bd-proto/src/protos/state/mod.rs create mode 100644 bd-proto/src/protos/state/payload.rs delete mode 100644 bd-resilient-kv/src/kv_journal/versioned.rs create mode 100644 bd-resilient-kv/src/versioned_kv_journal/file_manager.rs create mode 100644 bd-resilient-kv/src/versioned_kv_journal/framing.rs create mode 100644 bd-resilient-kv/src/versioned_kv_journal/framing_test.rs rename bd-resilient-kv/src/{kv_journal => versioned_kv_journal}/memmapped_versioned.rs (77%) create mode 100644 bd-resilient-kv/src/versioned_kv_journal/mod.rs rename bd-resilient-kv/src/{versioned_recovery.rs => versioned_kv_journal/recovery.rs} (72%) rename bd-resilient-kv/src/{versioned_kv_store.rs => versioned_kv_journal/store.rs} (69%) create mode 100644 bd-resilient-kv/src/versioned_kv_journal/versioned.rs diff --git a/AGENTS.md b/AGENTS.md index d03ecfee..6905513f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -19,6 +19,11 @@ - Make sure to run `cargo +nightly fmt` after making changes to apply default formatting rules. - Use pattern matching with if-let and match expressions for error handling +## Documentation Guidelines +- Avoid redundant documentation for the sake of convention. For example + - Don't include an Errors section if the only errors are generic failures. + - Don't include an Arguments section if the arguments are obvious based on the function signature. + ## Test File Conventions 1. 
Test files should be placed adjacent to the implementation file they're testing 2. Test files should be named with a `_test.rs` suffix (e.g., `network_quality_test.rs`) @@ -42,4 +47,4 @@ - For automatic fixing of some linting issues, use the `--fix` flag: `SKIP_PROTO_GEN=1 cargo clippy --workspace --bins --examples --tests --fix -- --no-deps` - Fix any remaining warnings before committing code -- Running clippy is especially important after code generation or modification to catch any potential issues \ No newline at end of file +- Running clippy is especially important after code generation or modification to catch any potential issues diff --git a/Cargo.lock b/Cargo.lock index 5acb1e4f..f6be6377 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1106,11 +1106,15 @@ dependencies = [ "anyhow", "bd-bonjson", "bd-client-common", + "bd-proto", "bd-workspace-hack", "bytes", + "crc32fast", "flate2", "memmap2", + "protobuf 4.0.0-alpha.0", "tempfile", + "thiserror 2.0.17", "tokio", ] diff --git a/api b/api index 5cd578ae..9f59c7f4 160000 --- a/api +++ b/api @@ -1 +1 @@ -Subproject commit 5cd578ae43f9b6119c209555eefcd93d587b4509 +Subproject commit 9f59c7f4d855a1aaec85a4647524353262f6cd58 diff --git a/bd-proto/build.rs b/bd-proto/build.rs index 55b7af83..1626e0a0 100644 --- a/bd-proto/build.rs +++ b/bd-proto/build.rs @@ -117,6 +117,20 @@ fn main() { .out_dir("src/protos/logging/") .capture_stderr() .run_from_script(); + std::fs::create_dir_all("src/protos/state").unwrap(); + protobuf_codegen::Codegen::new() + .protoc() + .customize( + Customize::default() + .gen_mod_rs(false) + .oneofs_non_exhaustive(false) + .file_header(GENERATED_HEADER.to_string()), + ) + .includes(["../api/thirdparty", "../api/src"]) + .inputs(["../api/src/bitdrift_public/protobuf/state/v1/payload.proto"]) + .out_dir("src/protos/state/") + .capture_stderr() + .run_from_script(); std::fs::create_dir_all("src/protos/log_matcher").unwrap(); protobuf_codegen::Codegen::new() .protoc() diff --git a/bd-proto/src/protos/mod.rs b/bd-proto/src/protos/mod.rs index abec4e3d..4ea21e92 100644 --- a/bd-proto/src/protos/mod.rs +++ b/bd-proto/src/protos/mod.rs @@ -17,4 +17,5 @@ pub mod logging; pub mod mme; pub mod prometheus; pub mod pulse; +pub mod state; pub mod workflow; diff --git a/bd-proto/src/protos/state/mod.rs b/bd-proto/src/protos/state/mod.rs new file mode 100644 index 00000000..fbb091fe --- /dev/null +++ b/bd-proto/src/protos/state/mod.rs @@ -0,0 +1 @@ +pub mod payload; diff --git a/bd-proto/src/protos/state/payload.rs b/bd-proto/src/protos/state/payload.rs new file mode 100644 index 00000000..31ae479b --- /dev/null +++ b/bd-proto/src/protos/state/payload.rs @@ -0,0 +1,550 @@ +// proto - bitdrift's client/server API definitions +// Copyright Bitdrift, Inc. All rights reserved. +// +// Use of this source code and APIs are governed by a source available license that can be found in +// the LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + +// This file is generated by rust-protobuf 4.0.0-alpha.0. Do not edit +// .proto file is parsed by protoc 33.0 +// @generated + +// https://github.com/rust-lang/rust-clippy/issues/702 +#![allow(unknown_lints)] +#![allow(clippy::all)] + +#![allow(unused_attributes)] +#![cfg_attr(rustfmt, rustfmt::skip)] + +#![allow(dead_code)] +#![allow(missing_docs)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(non_upper_case_globals)] +#![allow(trivial_casts)] +#![allow(unused_results)] +#![allow(unused_mut)] + +//! 
Generated file from `bitdrift_public/protobuf/state/v1/payload.proto` + +/// Generated files are compatible only with the same version +/// of protobuf runtime. +const _PROTOBUF_VERSION_CHECK: () = ::protobuf::VERSION_4_0_0_ALPHA_0; + +// @@protoc_insertion_point(message:bitdrift_public.protobuf.state.v1.StateValue) +#[derive(PartialEq,Clone,Default,Debug)] +pub struct StateValue { + // message oneof groups + pub value_type: ::std::option::Option, + // special fields + // @@protoc_insertion_point(special_field:bitdrift_public.protobuf.state.v1.StateValue.special_fields) + pub special_fields: ::protobuf::SpecialFields, +} + +impl<'a> ::std::default::Default for &'a StateValue { + fn default() -> &'a StateValue { + ::default_instance() + } +} + +impl StateValue { + pub fn new() -> StateValue { + ::std::default::Default::default() + } + + // string string_value = 1; + + pub fn string_value(&self) -> &str { + match self.value_type { + ::std::option::Option::Some(state_value::Value_type::StringValue(ref v)) => v, + _ => "", + } + } + + pub fn clear_string_value(&mut self) { + self.value_type = ::std::option::Option::None; + } + + pub fn has_string_value(&self) -> bool { + match self.value_type { + ::std::option::Option::Some(state_value::Value_type::StringValue(..)) => true, + _ => false, + } + } + + // Param is passed by value, moved + pub fn set_string_value(&mut self, v: ::std::string::String) { + self.value_type = ::std::option::Option::Some(state_value::Value_type::StringValue(v)) + } + + // Mutable pointer to the field. + pub fn mut_string_value(&mut self) -> &mut ::std::string::String { + if let ::std::option::Option::Some(state_value::Value_type::StringValue(_)) = self.value_type { + } else { + self.value_type = ::std::option::Option::Some(state_value::Value_type::StringValue(::std::string::String::new())); + } + match self.value_type { + ::std::option::Option::Some(state_value::Value_type::StringValue(ref mut v)) => v, + _ => panic!(), + } + } + + // Take field + pub fn take_string_value(&mut self) -> ::std::string::String { + if self.has_string_value() { + match self.value_type.take() { + ::std::option::Option::Some(state_value::Value_type::StringValue(v)) => v, + _ => panic!(), + } + } else { + ::std::string::String::new() + } + } + + // int64 int_value = 2; + + pub fn int_value(&self) -> i64 { + match self.value_type { + ::std::option::Option::Some(state_value::Value_type::IntValue(v)) => v, + _ => 0, + } + } + + pub fn clear_int_value(&mut self) { + self.value_type = ::std::option::Option::None; + } + + pub fn has_int_value(&self) -> bool { + match self.value_type { + ::std::option::Option::Some(state_value::Value_type::IntValue(..)) => true, + _ => false, + } + } + + // Param is passed by value, moved + pub fn set_int_value(&mut self, v: i64) { + self.value_type = ::std::option::Option::Some(state_value::Value_type::IntValue(v)) + } + + // double double_value = 3; + + pub fn double_value(&self) -> f64 { + match self.value_type { + ::std::option::Option::Some(state_value::Value_type::DoubleValue(v)) => v, + _ => 0., + } + } + + pub fn clear_double_value(&mut self) { + self.value_type = ::std::option::Option::None; + } + + pub fn has_double_value(&self) -> bool { + match self.value_type { + ::std::option::Option::Some(state_value::Value_type::DoubleValue(..)) => true, + _ => false, + } + } + + // Param is passed by value, moved + pub fn set_double_value(&mut self, v: f64) { + self.value_type = ::std::option::Option::Some(state_value::Value_type::DoubleValue(v)) + } + + // bool 
bool_value = 4; + + pub fn bool_value(&self) -> bool { + match self.value_type { + ::std::option::Option::Some(state_value::Value_type::BoolValue(v)) => v, + _ => false, + } + } + + pub fn clear_bool_value(&mut self) { + self.value_type = ::std::option::Option::None; + } + + pub fn has_bool_value(&self) -> bool { + match self.value_type { + ::std::option::Option::Some(state_value::Value_type::BoolValue(..)) => true, + _ => false, + } + } + + // Param is passed by value, moved + pub fn set_bool_value(&mut self, v: bool) { + self.value_type = ::std::option::Option::Some(state_value::Value_type::BoolValue(v)) + } + + fn generated_message_descriptor_data() -> ::protobuf::reflect::GeneratedMessageDescriptorData { + let mut fields = ::std::vec::Vec::with_capacity(4); + let mut oneofs = ::std::vec::Vec::with_capacity(1); + fields.push(::protobuf::reflect::rt::v2::make_oneof_deref_has_get_set_simpler_accessor::<_, _>( + "string_value", + StateValue::has_string_value, + StateValue::string_value, + StateValue::set_string_value, + )); + fields.push(::protobuf::reflect::rt::v2::make_oneof_copy_has_get_set_simpler_accessors::<_, _>( + "int_value", + StateValue::has_int_value, + StateValue::int_value, + StateValue::set_int_value, + )); + fields.push(::protobuf::reflect::rt::v2::make_oneof_copy_has_get_set_simpler_accessors::<_, _>( + "double_value", + StateValue::has_double_value, + StateValue::double_value, + StateValue::set_double_value, + )); + fields.push(::protobuf::reflect::rt::v2::make_oneof_copy_has_get_set_simpler_accessors::<_, _>( + "bool_value", + StateValue::has_bool_value, + StateValue::bool_value, + StateValue::set_bool_value, + )); + oneofs.push(state_value::Value_type::generated_oneof_descriptor_data()); + ::protobuf::reflect::GeneratedMessageDescriptorData::new_2::( + "StateValue", + fields, + oneofs, + ) + } +} + +impl ::protobuf::Message for StateValue { + const NAME: &'static str = "StateValue"; + + fn is_initialized(&self) -> bool { + true + } + + fn merge_from(&mut self, is: &mut ::protobuf::CodedInputStream<'_>) -> ::protobuf::Result<()> { + while let Some(tag) = is.read_raw_tag_or_eof()? 
{ + match tag { + 10 => { + self.value_type = ::std::option::Option::Some(state_value::Value_type::StringValue(is.read_string()?)); + }, + 16 => { + self.value_type = ::std::option::Option::Some(state_value::Value_type::IntValue(is.read_int64()?)); + }, + 25 => { + self.value_type = ::std::option::Option::Some(state_value::Value_type::DoubleValue(is.read_double()?)); + }, + 32 => { + self.value_type = ::std::option::Option::Some(state_value::Value_type::BoolValue(is.read_bool()?)); + }, + tag => { + ::protobuf::rt::read_unknown_or_skip_group(tag, is, self.special_fields.mut_unknown_fields())?; + }, + }; + } + ::std::result::Result::Ok(()) + } + + // Compute sizes of nested messages + #[allow(unused_variables)] + fn compute_size(&self) -> u64 { + let mut my_size = 0; + if let ::std::option::Option::Some(ref v) = self.value_type { + match v { + &state_value::Value_type::StringValue(ref v) => { + my_size += ::protobuf::rt::string_size(1, &v); + }, + &state_value::Value_type::IntValue(v) => { + my_size += ::protobuf::rt::int64_size(2, v); + }, + &state_value::Value_type::DoubleValue(v) => { + my_size += 1 + 8; + }, + &state_value::Value_type::BoolValue(v) => { + my_size += 1 + 1; + }, + }; + } + my_size += ::protobuf::rt::unknown_fields_size(self.special_fields.unknown_fields()); + self.special_fields.cached_size().set(my_size as u32); + my_size + } + + fn write_to_with_cached_sizes(&self, os: &mut ::protobuf::CodedOutputStream<'_>) -> ::protobuf::Result<()> { + if let ::std::option::Option::Some(ref v) = self.value_type { + match v { + &state_value::Value_type::StringValue(ref v) => { + os.write_string(1, v)?; + }, + &state_value::Value_type::IntValue(v) => { + os.write_int64(2, v)?; + }, + &state_value::Value_type::DoubleValue(v) => { + os.write_double(3, v)?; + }, + &state_value::Value_type::BoolValue(v) => { + os.write_bool(4, v)?; + }, + }; + } + os.write_unknown_fields(self.special_fields.unknown_fields())?; + ::std::result::Result::Ok(()) + } + + fn special_fields(&self) -> &::protobuf::SpecialFields { + &self.special_fields + } + + fn mut_special_fields(&mut self) -> &mut ::protobuf::SpecialFields { + &mut self.special_fields + } + + fn new() -> StateValue { + StateValue::new() + } + + fn clear(&mut self) { + self.value_type = ::std::option::Option::None; + self.value_type = ::std::option::Option::None; + self.value_type = ::std::option::Option::None; + self.value_type = ::std::option::Option::None; + self.special_fields.clear(); + } + + fn default_instance() -> &'static StateValue { + static instance: StateValue = StateValue { + value_type: ::std::option::Option::None, + special_fields: ::protobuf::SpecialFields::new(), + }; + &instance + } +} + +impl ::protobuf::MessageFull for StateValue { + fn descriptor() -> ::protobuf::reflect::MessageDescriptor { + static descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::MessageDescriptor> = ::protobuf::rt::Lazy::new(); + descriptor.get(|| file_descriptor().message_by_package_relative_name("StateValue").unwrap()).clone() + } +} + +impl ::std::fmt::Display for StateValue { + fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + ::protobuf::text_format::fmt(self, f) + } +} + +impl ::protobuf::reflect::ProtobufValue for StateValue { + type RuntimeType = ::protobuf::reflect::rt::RuntimeTypeMessage; +} + +/// Nested message and enums of message `StateValue` +pub mod state_value { + + #[derive(Clone,PartialEq,Debug)] + // @@protoc_insertion_point(oneof:bitdrift_public.protobuf.state.v1.StateValue.value_type) + pub enum 
Value_type { + // @@protoc_insertion_point(oneof_field:bitdrift_public.protobuf.state.v1.StateValue.string_value) + StringValue(::std::string::String), + // @@protoc_insertion_point(oneof_field:bitdrift_public.protobuf.state.v1.StateValue.int_value) + IntValue(i64), + // @@protoc_insertion_point(oneof_field:bitdrift_public.protobuf.state.v1.StateValue.double_value) + DoubleValue(f64), + // @@protoc_insertion_point(oneof_field:bitdrift_public.protobuf.state.v1.StateValue.bool_value) + BoolValue(bool), + } + + impl ::protobuf::Oneof for Value_type { + } + + impl ::protobuf::OneofFull for Value_type { + fn descriptor() -> ::protobuf::reflect::OneofDescriptor { + static descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::OneofDescriptor> = ::protobuf::rt::Lazy::new(); + descriptor.get(|| ::descriptor().oneof_by_name("value_type").unwrap()).clone() + } + } + + impl Value_type { + pub(in super) fn generated_oneof_descriptor_data() -> ::protobuf::reflect::GeneratedOneofDescriptorData { + ::protobuf::reflect::GeneratedOneofDescriptorData::new::("value_type") + } + } +} + +// @@protoc_insertion_point(message:bitdrift_public.protobuf.state.v1.StateKeyValuePair) +#[derive(PartialEq,Clone,Default,Debug)] +pub struct StateKeyValuePair { + // message fields + // @@protoc_insertion_point(field:bitdrift_public.protobuf.state.v1.StateKeyValuePair.key) + pub key: ::std::string::String, + // @@protoc_insertion_point(field:bitdrift_public.protobuf.state.v1.StateKeyValuePair.value) + pub value: ::protobuf::MessageField, + // special fields + // @@protoc_insertion_point(special_field:bitdrift_public.protobuf.state.v1.StateKeyValuePair.special_fields) + pub special_fields: ::protobuf::SpecialFields, +} + +impl<'a> ::std::default::Default for &'a StateKeyValuePair { + fn default() -> &'a StateKeyValuePair { + ::default_instance() + } +} + +impl StateKeyValuePair { + pub fn new() -> StateKeyValuePair { + ::std::default::Default::default() + } + + fn generated_message_descriptor_data() -> ::protobuf::reflect::GeneratedMessageDescriptorData { + let mut fields = ::std::vec::Vec::with_capacity(2); + let mut oneofs = ::std::vec::Vec::with_capacity(0); + fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( + "key", + |m: &StateKeyValuePair| { &m.key }, + |m: &mut StateKeyValuePair| { &mut m.key }, + )); + fields.push(::protobuf::reflect::rt::v2::make_message_field_accessor::<_, StateValue>( + "value", + |m: &StateKeyValuePair| { &m.value }, + |m: &mut StateKeyValuePair| { &mut m.value }, + )); + ::protobuf::reflect::GeneratedMessageDescriptorData::new_2::( + "StateKeyValuePair", + fields, + oneofs, + ) + } +} + +impl ::protobuf::Message for StateKeyValuePair { + const NAME: &'static str = "StateKeyValuePair"; + + fn is_initialized(&self) -> bool { + true + } + + fn merge_from(&mut self, is: &mut ::protobuf::CodedInputStream<'_>) -> ::protobuf::Result<()> { + while let Some(tag) = is.read_raw_tag_or_eof()? 
{ + match tag { + 10 => { + self.key = is.read_string()?; + }, + 18 => { + ::protobuf::rt::read_singular_message_into_field(is, &mut self.value)?; + }, + tag => { + ::protobuf::rt::read_unknown_or_skip_group(tag, is, self.special_fields.mut_unknown_fields())?; + }, + }; + } + ::std::result::Result::Ok(()) + } + + // Compute sizes of nested messages + #[allow(unused_variables)] + fn compute_size(&self) -> u64 { + let mut my_size = 0; + if !self.key.is_empty() { + my_size += ::protobuf::rt::string_size(1, &self.key); + } + if let Some(v) = self.value.as_ref() { + let len = v.compute_size(); + my_size += 1 + ::protobuf::rt::compute_raw_varint64_size(len) + len; + } + my_size += ::protobuf::rt::unknown_fields_size(self.special_fields.unknown_fields()); + self.special_fields.cached_size().set(my_size as u32); + my_size + } + + fn write_to_with_cached_sizes(&self, os: &mut ::protobuf::CodedOutputStream<'_>) -> ::protobuf::Result<()> { + if !self.key.is_empty() { + os.write_string(1, &self.key)?; + } + if let Some(v) = self.value.as_ref() { + ::protobuf::rt::write_message_field_with_cached_size(2, v, os)?; + } + os.write_unknown_fields(self.special_fields.unknown_fields())?; + ::std::result::Result::Ok(()) + } + + fn special_fields(&self) -> &::protobuf::SpecialFields { + &self.special_fields + } + + fn mut_special_fields(&mut self) -> &mut ::protobuf::SpecialFields { + &mut self.special_fields + } + + fn new() -> StateKeyValuePair { + StateKeyValuePair::new() + } + + fn clear(&mut self) { + self.key.clear(); + self.value.clear(); + self.special_fields.clear(); + } + + fn default_instance() -> &'static StateKeyValuePair { + static instance: StateKeyValuePair = StateKeyValuePair { + key: ::std::string::String::new(), + value: ::protobuf::MessageField::none(), + special_fields: ::protobuf::SpecialFields::new(), + }; + &instance + } +} + +impl ::protobuf::MessageFull for StateKeyValuePair { + fn descriptor() -> ::protobuf::reflect::MessageDescriptor { + static descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::MessageDescriptor> = ::protobuf::rt::Lazy::new(); + descriptor.get(|| file_descriptor().message_by_package_relative_name("StateKeyValuePair").unwrap()).clone() + } +} + +impl ::std::fmt::Display for StateKeyValuePair { + fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + ::protobuf::text_format::fmt(self, f) + } +} + +impl ::protobuf::reflect::ProtobufValue for StateKeyValuePair { + type RuntimeType = ::protobuf::reflect::rt::RuntimeTypeMessage; +} + +static file_descriptor_proto_data: &'static [u8] = b"\ + \n/bitdrift_public/protobuf/state/v1/payload.proto\x12!bitdrift_public.p\ + rotobuf.state.v1\"\xa4\x01\n\nStateValue\x12#\n\x0cstring_value\x18\x01\ + \x20\x01(\tH\0R\x0bstringValue\x12\x1d\n\tint_value\x18\x02\x20\x01(\x03\ + H\0R\x08intValue\x12#\n\x0cdouble_value\x18\x03\x20\x01(\x01H\0R\x0bdoub\ + leValue\x12\x1f\n\nbool_value\x18\x04\x20\x01(\x08H\0R\tboolValueB\x0c\n\ + \nvalue_type\"j\n\x11StateKeyValuePair\x12\x10\n\x03key\x18\x01\x20\x01(\ + \tR\x03key\x12C\n\x05value\x18\x02\x20\x01(\x0b2-.bitdrift_public.protob\ + uf.state.v1.StateValueR\x05valueb\x06proto3\ +"; + +/// `FileDescriptorProto` object which was a source for this generated file +fn file_descriptor_proto() -> &'static ::protobuf::descriptor::FileDescriptorProto { + static file_descriptor_proto_lazy: ::protobuf::rt::Lazy<::protobuf::descriptor::FileDescriptorProto> = ::protobuf::rt::Lazy::new(); + file_descriptor_proto_lazy.get(|| { + 
::protobuf::Message::parse_from_bytes(file_descriptor_proto_data).unwrap() + }) +} + +/// `FileDescriptor` object which allows dynamic access to files +pub fn file_descriptor() -> &'static ::protobuf::reflect::FileDescriptor { + static generated_file_descriptor_lazy: ::protobuf::rt::Lazy<::protobuf::reflect::GeneratedFileDescriptor> = ::protobuf::rt::Lazy::new(); + static file_descriptor: ::protobuf::rt::Lazy<::protobuf::reflect::FileDescriptor> = ::protobuf::rt::Lazy::new(); + file_descriptor.get(|| { + let generated_file_descriptor = generated_file_descriptor_lazy.get(|| { + let mut deps = ::std::vec::Vec::with_capacity(0); + let mut messages = ::std::vec::Vec::with_capacity(2); + messages.push(StateValue::generated_message_descriptor_data()); + messages.push(StateKeyValuePair::generated_message_descriptor_data()); + let mut enums = ::std::vec::Vec::with_capacity(0); + ::protobuf::reflect::GeneratedFileDescriptor::new_generated( + file_descriptor_proto(), + deps, + messages, + enums, + ) + }); + ::protobuf::reflect::FileDescriptor::new_generated_2(generated_file_descriptor) + }) +} diff --git a/bd-proto/src/protos/workflow/workflow.rs b/bd-proto/src/protos/workflow/workflow.rs index 58ebd8c0..d4391796 100644 --- a/bd-proto/src/protos/workflow/workflow.rs +++ b/bd-proto/src/protos/workflow/workflow.rs @@ -5753,9 +5753,6 @@ pub mod workflow { // @@protoc_insertion_point(message:bitdrift_public.protobuf.workflow.v1.Workflow.Action.ActionTakeScreenshot) #[derive(PartialEq,Clone,Default,Debug)] pub struct ActionTakeScreenshot { - // message fields - // @@protoc_insertion_point(field:bitdrift_public.protobuf.workflow.v1.Workflow.Action.ActionTakeScreenshot.id) - pub id: ::std::string::String, // special fields // @@protoc_insertion_point(special_field:bitdrift_public.protobuf.workflow.v1.Workflow.Action.ActionTakeScreenshot.special_fields) pub special_fields: ::protobuf::SpecialFields, @@ -5773,13 +5770,8 @@ pub mod workflow { } pub(in super::super) fn generated_message_descriptor_data() -> ::protobuf::reflect::GeneratedMessageDescriptorData { - let mut fields = ::std::vec::Vec::with_capacity(1); + let mut fields = ::std::vec::Vec::with_capacity(0); let mut oneofs = ::std::vec::Vec::with_capacity(0); - fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( - "id", - |m: &ActionTakeScreenshot| { &m.id }, - |m: &mut ActionTakeScreenshot| { &mut m.id }, - )); ::protobuf::reflect::GeneratedMessageDescriptorData::new_2::( "Workflow.Action.ActionTakeScreenshot", fields, @@ -5798,9 +5790,6 @@ pub mod workflow { fn merge_from(&mut self, is: &mut ::protobuf::CodedInputStream<'_>) -> ::protobuf::Result<()> { while let Some(tag) = is.read_raw_tag_or_eof()? 
{ match tag { - 10 => { - self.id = is.read_string()?; - }, tag => { ::protobuf::rt::read_unknown_or_skip_group(tag, is, self.special_fields.mut_unknown_fields())?; }, @@ -5813,18 +5802,12 @@ pub mod workflow { #[allow(unused_variables)] fn compute_size(&self) -> u64 { let mut my_size = 0; - if !self.id.is_empty() { - my_size += ::protobuf::rt::string_size(1, &self.id); - } my_size += ::protobuf::rt::unknown_fields_size(self.special_fields.unknown_fields()); self.special_fields.cached_size().set(my_size as u32); my_size } fn write_to_with_cached_sizes(&self, os: &mut ::protobuf::CodedOutputStream<'_>) -> ::protobuf::Result<()> { - if !self.id.is_empty() { - os.write_string(1, &self.id)?; - } os.write_unknown_fields(self.special_fields.unknown_fields())?; ::std::result::Result::Ok(()) } @@ -5842,13 +5825,11 @@ pub mod workflow { } fn clear(&mut self) { - self.id.clear(); self.special_fields.clear(); } fn default_instance() -> &'static ActionTakeScreenshot { static instance: ActionTakeScreenshot = ActionTakeScreenshot { - id: ::std::string::String::new(), special_fields: ::protobuf::SpecialFields::new(), }; &instance @@ -7094,7 +7075,7 @@ static file_descriptor_proto_data: &'static [u8] = b"\ ic.protobuf.workflow.v1\x1a\x17validate/validate.proto\x1a5bitdrift_publ\ ic/protobuf/matcher/v1/log_matcher.proto\"f\n\x16WorkflowsConfiguration\ \x12L\n\tworkflows\x18\x01\x20\x03(\x0b2..bitdrift_public.protobuf.workf\ - low.v1.WorkflowR\tworkflows\"\xa35\n\x08Workflow\x12\x17\n\x02id\x18\x01\ + low.v1.WorkflowR\tworkflows\"\x905\n\x08Workflow\x12\x17\n\x02id\x18\x01\ \x20\x01(\tR\x02idB\x07\xfaB\x04r\x02\x10\x01\x12V\n\x06states\x18\x02\ \x20\x03(\x0b24.bitdrift_public.protobuf.workflow.v1.Workflow.StateR\x06\ statesB\x08\xfaB\x05\x92\x01\x02\x08\x01\x12V\n\texecution\x18\x03\x20\ @@ -7141,7 +7122,7 @@ static file_descriptor_proto_data: &'static [u8] = b"\ \x0eextension_type\x12\x03\xf8B\x01\x1a\x89\x01\n\x0cRuleLogMatch\x12Z\n\ \x0blog_matcher\x18\x01\x20\x01(\x0b2/.bitdrift_public.protobuf.matcher.\ v1.LogMatcherR\nlogMatcherB\x08\xfaB\x05\x8a\x01\x02\x10\x01\x12\x1d\n\ - \x05count\x18\x02\x20\x01(\rR\x05countB\x07\xfaB\x04*\x02\x20\0\x1a\xf4\ + \x05count\x18\x02\x20\x01(\rR\x05countB\x07\xfaB\x04*\x02\x20\0\x1a\xe1\ \x1d\n\x06Action\x12|\n\x14action_flush_buffers\x18\x01\x20\x01(\x0b2H.b\ itdrift_public.protobuf.workflow.v1.Workflow.Action.ActionFlushBuffersH\ \0R\x12actionFlushBuffers\x12v\n\x12action_emit_metric\x18\x02\x20\x01(\ @@ -7215,23 +7196,22 @@ static file_descriptor_proto_data: &'static [u8] = b"\ \x18\x04\x20\x01(\x08H\0R\x10logBodyExtracted\x12{\n\x16feature_flag_ext\ racted\x18\x05\x20\x01(\x0b2C.bitdrift_public.protobuf.workflow.v1.Workf\ low.FeatureFlagExtractedH\0R\x14featureFlagExtractedB\x0f\n\x08tag_type\ - \x12\x03\xf8B\x01\x1a/\n\x14ActionTakeScreenshot\x12\x17\n\x02id\x18\x01\ - \x20\x01(\tR\x02idB\x07\xfaB\x04r\x02\x10\x01B\x12\n\x0baction_type\x12\ - \x03\xf8B\x01\x1a\xb9\x01\n\tExecution\x12~\n\x13execution_exclusive\x18\ - \x01\x20\x01(\x0b2K.bitdrift_public.protobuf.workflow.v1.Workflow.Execut\ - ion.ExecutionExclusiveH\0R\x12executionExclusive\x1a\x14\n\x12ExecutionE\ - xclusiveB\x10\n\x0eexecution_typeJ\x04\x08\x02\x10\x03\x1a6\n\x15LimitMa\ - tchedLogsCount\x12\x1d\n\x05count\x18\x01\x20\x01(\rR\x05countB\x07\xfaB\ - \x04*\x02\x20\0\x1a9\n\rLimitDuration\x12(\n\x0bduration_ms\x18\x02\x20\ - \x01(\x04R\ndurationMsB\x07\xfaB\x042\x02\x20\0\x1a\xb1\x01\n\x0eFieldEx\ - tracted\x12&\n\nfield_name\x18\x01\x20\x01(\tR\tfieldNameB\x07\xfaB\x04r\ - 
\x02\x10\x01\x12[\n\x05exact\x18\x02\x20\x01(\x0b2C.bitdrift_public.prot\ - obuf.workflow.v1.Workflow.FieldExtracted.ExactH\0R\x05exact\x1a\x07\n\ - \x05ExactB\x11\n\x0fextraction_type\x1a\xb2\x01\n\x14FeatureFlagExtracte\ - d\x12\x1b\n\x04name\x18\x01\x20\x01(\tR\x04nameB\x07\xfaB\x04r\x02\x10\ - \x01\x12a\n\x05exact\x18\x02\x20\x01(\x0b2I.bitdrift_public.protobuf.wor\ - kflow.v1.Workflow.FeatureFlagExtracted.ExactH\0R\x05exact\x1a\x07\n\x05E\ - xactB\x11\n\x0fextraction_typeb\x06proto3\ + \x12\x03\xf8B\x01\x1a\x1c\n\x14ActionTakeScreenshotJ\x04\x08\x01\x10\x02\ + B\x12\n\x0baction_type\x12\x03\xf8B\x01\x1a\xb9\x01\n\tExecution\x12~\n\ + \x13execution_exclusive\x18\x01\x20\x01(\x0b2K.bitdrift_public.protobuf.\ + workflow.v1.Workflow.Execution.ExecutionExclusiveH\0R\x12executionExclus\ + ive\x1a\x14\n\x12ExecutionExclusiveB\x10\n\x0eexecution_typeJ\x04\x08\ + \x02\x10\x03\x1a6\n\x15LimitMatchedLogsCount\x12\x1d\n\x05count\x18\x01\ + \x20\x01(\rR\x05countB\x07\xfaB\x04*\x02\x20\0\x1a9\n\rLimitDuration\x12\ + (\n\x0bduration_ms\x18\x02\x20\x01(\x04R\ndurationMsB\x07\xfaB\x042\x02\ + \x20\0\x1a\xb1\x01\n\x0eFieldExtracted\x12&\n\nfield_name\x18\x01\x20\ + \x01(\tR\tfieldNameB\x07\xfaB\x04r\x02\x10\x01\x12[\n\x05exact\x18\x02\ + \x20\x01(\x0b2C.bitdrift_public.protobuf.workflow.v1.Workflow.FieldExtra\ + cted.ExactH\0R\x05exact\x1a\x07\n\x05ExactB\x11\n\x0fextraction_type\x1a\ + \xb2\x01\n\x14FeatureFlagExtracted\x12\x1b\n\x04name\x18\x01\x20\x01(\tR\ + \x04nameB\x07\xfaB\x04r\x02\x10\x01\x12a\n\x05exact\x18\x02\x20\x01(\x0b\ + 2I.bitdrift_public.protobuf.workflow.v1.Workflow.FeatureFlagExtracted.Ex\ + actH\0R\x05exact\x1a\x07\n\x05ExactB\x11\n\x0fextraction_typeb\x06proto3\ "; /// `FileDescriptorProto` object which was a source for this generated file diff --git a/bd-resilient-kv/Cargo.toml b/bd-resilient-kv/Cargo.toml index d9c1f60a..50ec8a51 100644 --- a/bd-resilient-kv/Cargo.toml +++ b/bd-resilient-kv/Cargo.toml @@ -15,9 +15,13 @@ tempfile.workspace = true ahash.workspace = true anyhow.workspace = true bd-bonjson = { path = "../bd-bonjson" } +bd-proto = { path = "../bd-proto" } bd-client-common = { path = "../bd-client-common" } bd-workspace-hack.workspace = true bytes.workspace = true +crc32fast.workspace = true flate2 = { workspace = true, features = ["zlib"] } memmap2.workspace = true tokio.workspace = true +protobuf.workspace = true +thiserror.workspace = true diff --git a/bd-resilient-kv/src/kv_journal/mod.rs b/bd-resilient-kv/src/kv_journal/mod.rs index b9799927..050d030e 100644 --- a/bd-resilient-kv/src/kv_journal/mod.rs +++ b/bd-resilient-kv/src/kv_journal/mod.rs @@ -94,11 +94,7 @@ pub trait KVJournal { pub mod double_buffered; pub mod in_memory; pub mod memmapped; -pub mod memmapped_versioned; -pub mod versioned; pub use double_buffered::DoubleBufferedKVJournal; pub use in_memory::InMemoryKVJournal; pub use memmapped::MemMappedKVJournal; -pub use memmapped_versioned::MemMappedVersionedKVJournal; -pub use versioned::{TimestampedValue, VersionedKVJournal}; diff --git a/bd-resilient-kv/src/kv_journal/versioned.rs b/bd-resilient-kv/src/kv_journal/versioned.rs deleted file mode 100644 index 6709019b..00000000 --- a/bd-resilient-kv/src/kv_journal/versioned.rs +++ /dev/null @@ -1,562 +0,0 @@ -// shared-core - bitdrift's common client/server libraries -// Copyright Bitdrift, Inc. All rights reserved. 
-// -// Use of this source code is governed by a source available license that can be found in the -// LICENSE file or at: -// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt - -use ahash::AHashMap; -use bd_bonjson::Value; -use bd_bonjson::decoder::from_slice; -use bd_bonjson::encoder::encode_into_buf; -use bd_bonjson::serialize_primitives::serialize_array_begin; -use bd_client_common::error::InvariantError; -use bytes::BufMut; -use std::time::{SystemTime, UNIX_EPOCH}; - -/// Represents a value with its associated timestamp. -#[derive(Debug, Clone, PartialEq)] -pub struct TimestampedValue { - /// The value stored in the key-value store. - pub value: Value, - /// The timestamp (in microseconds since UNIX epoch) when this value was last written. - pub timestamp: u64, -} - -/// Timestamped implementation of a key-value journaling system that uses timestamps -/// as the version identifier for point-in-time recovery. -/// -/// Each write operation is assigned a monotonically non-decreasing timestamp (in microseconds -/// since UNIX epoch), enabling exact state reconstruction at any historical timestamp. -/// The monotonicity is enforced by clamping: if the system clock goes backwards, we reuse -/// the same timestamp value to maintain ordering guarantees. When timestamps collide, -/// journal ordering determines precedence. -#[derive(Debug)] -pub struct VersionedKVJournal<'a> { - position: usize, - buffer: &'a mut [u8], - high_water_mark: usize, - high_water_mark_triggered: bool, - initialized_at_unix_time_ns: u64, - last_timestamp: u64, // Most recent timestamp written (for monotonic enforcement) -} - -// Versioned KV files have the following structure: -// | Position | Data | Type | -// |----------|--------------------------|----------------| -// | 0 | Format Version | u64 | -// | 8 | Position | u64 | -// | 16 | Type Code: Array Start | u8 | -// | 17 | Metadata Object | BONJSON Object | -// | ... | Timestamped Journal Entry| BONJSON Object | -// | ... | Timestamped Journal Entry| BONJSON Object | -// -// Metadata object: {"initialized": , "format_version": 2} -// Journal entries: {"t": , "k": "", "o": } -// -// # Timestamp Semantics -// -// Timestamps serve as both version identifiers and logical clocks with monotonic guarantees: -// - Each write gets a timestamp that is guaranteed to be >= previous writes (non-decreasing) -// - If system clock goes backward, timestamps are clamped to last_timestamp (reuse same value) -// - When timestamps collide, journal ordering determines precedence -// - This ensures total ordering while allowing correlation with external timestamped systems - -const VERSION: u64 = 2; // The versioned format version - -const HEADER_SIZE: usize = 16; -const ARRAY_BEGIN: usize = 16; -const METADATA_OFFSET: usize = 17; - -// Minimum buffer size for a valid journal -const MIN_BUFFER_SIZE: usize = HEADER_SIZE + 4; - -/// Helper function to read a u64 field from a BONJSON object. -/// -/// BONJSON's decoder automatically converts unsigned values that fit in i64 to signed values -/// during decoding (see bd-bonjson/src/decoder.rs:227-234). This means that even though we -/// write `Value::Unsigned(version)`, the decoder returns `Value::Signed(version as i64)`. -/// -/// TODO(snowp): Consider changing BONJSON's decoder to preserve the original unsigned type -/// to avoid this normalization behavior and eliminate the need for this helper. 
-fn read_u64_field(obj: &AHashMap, key: &str) -> Option { - match obj.get(key) { - Some(Value::Unsigned(v)) => Some(*v), - Some(Value::Signed(v)) if *v >= 0 => - { - #[allow(clippy::cast_sign_loss)] - Some(*v as u64) - }, - _ => None, - } -} - -/// Get current timestamp in microseconds since UNIX epoch. -fn current_timestamp() -> anyhow::Result { - SystemTime::now() - .duration_since(UNIX_EPOCH) - .map_err(|_| InvariantError::Invariant.into()) - .map(|d| { - #[allow(clippy::cast_possible_truncation)] - { - d.as_micros() as u64 - } - }) -} - - -/// Write to the version field of a journal buffer. -fn write_version_field(buffer: &mut [u8], version: u64) { - let version_bytes = version.to_le_bytes(); - buffer[0 .. 8].copy_from_slice(&version_bytes); -} - -/// Write the version to a journal buffer. -fn write_version(buffer: &mut [u8]) { - write_version_field(buffer, VERSION); -} - -fn read_position(buffer: &[u8]) -> anyhow::Result { - let position_bytes: [u8; 8] = buffer[8 .. 16].try_into()?; - let position_u64 = u64::from_le_bytes(position_bytes); - let position = usize::try_from(position_u64) - .map_err(|_| anyhow::anyhow!("Position {position_u64} too large for usize"))?; - let buffer_len = buffer.len(); - if position >= buffer_len { - anyhow::bail!("Invalid position: {position}, buffer size: {buffer_len}",); - } - Ok(position) -} - -/// Write the position to a journal buffer. -fn write_position(buffer: &mut [u8], position: usize) { - let position_bytes = (position as u64).to_le_bytes(); - buffer[8 .. 16].copy_from_slice(&position_bytes); -} - -/// Read the bonjson payload in this buffer. -fn read_bonjson_payload(buffer: &[u8]) -> anyhow::Result { - let position = read_position(buffer)?; - let slice_to_decode = &buffer[ARRAY_BEGIN .. position]; - - match from_slice(slice_to_decode) { - Ok((_, decoded)) => Ok(decoded), - Err(bd_bonjson::decoder::DecodeError::Partial { partial_value, .. }) => Ok(partial_value), - Err(e) => anyhow::bail!("Failed to decode buffer: {e:?}"), - } -} - -/// Create and write the metadata section of a versioned journal. -fn write_metadata(buffer: &mut [u8], timestamp: u64) -> anyhow::Result { - let buffer_len = buffer.len(); - let mut cursor = &mut buffer[METADATA_OFFSET ..]; - - // Create metadata object - let mut metadata = AHashMap::new(); - metadata.insert("initialized".to_string(), Value::Unsigned(timestamp)); - metadata.insert("format_version".to_string(), Value::Unsigned(VERSION)); - - // Write metadata object - encode_into_buf(&mut cursor, &Value::Object(metadata)) - .map_err(|e| anyhow::anyhow!("Failed to encode metadata object: {e:?}"))?; - - Ok(buffer_len - cursor.remaining_mut()) -} - -/// Extract metadata from the buffer. -fn extract_metadata_from_buffer(buffer: &[u8]) -> anyhow::Result { - let array = read_bonjson_payload(buffer)?; - if let Value::Array(entries) = array - && let Some(Value::Object(obj)) = entries.first() - { - let timestamp = read_u64_field(obj, "initialized") - .ok_or_else(|| anyhow::anyhow!("No initialized timestamp found in metadata"))?; - - return Ok(timestamp); - } - anyhow::bail!("No valid metadata found"); -} - -fn validate_buffer_len(buffer: &[u8]) -> anyhow::Result { - let buffer_len = buffer.len(); - if buffer_len < MIN_BUFFER_SIZE { - anyhow::bail!( - "Buffer too small: {buffer_len} bytes, but need at least {MIN_BUFFER_SIZE} bytes" - ); - } - Ok(buffer_len) -} - -/// Validate high water mark ratio and calculate the position from buffer length. 
-fn calculate_high_water_mark( - buffer_len: usize, - high_water_mark_ratio: Option, -) -> anyhow::Result { - let ratio = high_water_mark_ratio.unwrap_or(0.8); - if !(0.0 ..= 1.0).contains(&ratio) { - anyhow::bail!("High water mark ratio must be between 0.0 and 1.0, got: {ratio}"); - } - - #[allow( - clippy::cast_precision_loss, - clippy::cast_possible_truncation, - clippy::cast_sign_loss - )] - let high_water_mark = (buffer_len as f32 * ratio) as usize; - Ok(high_water_mark) -} - -impl<'a> VersionedKVJournal<'a> { - /// Create a new versioned journal using the provided buffer as storage space. - /// - /// # Arguments - /// * `buffer` - The storage buffer - /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 - /// - /// # Errors - /// Returns an error if the buffer is too small or if `high_water_mark_ratio` is invalid. - pub fn new(buffer: &'a mut [u8], high_water_mark_ratio: Option) -> anyhow::Result { - let buffer_len = validate_buffer_len(buffer)?; - let high_water_mark = calculate_high_water_mark(buffer_len, high_water_mark_ratio)?; - - // Write array begin marker right after header - let mut cursor = &mut buffer[HEADER_SIZE ..]; - serialize_array_begin(&mut cursor).map_err(|_| InvariantError::Invariant)?; - - // Write metadata with current timestamp - let timestamp = current_timestamp()?; - let position = write_metadata(buffer, timestamp)?; - - write_position(buffer, position); - write_version(buffer); - - Ok(Self { - position, - buffer, - high_water_mark, - high_water_mark_triggered: false, - initialized_at_unix_time_ns: timestamp, - last_timestamp: timestamp, - }) - } - - /// Create a new versioned journal with state loaded from the provided buffer. - /// - /// # Arguments - /// * `buffer` - The storage buffer containing existing versioned KV data - /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 - /// - /// # Errors - /// Returns an error if the buffer is invalid, corrupted, or if `high_water_mark_ratio` is - /// invalid. - pub fn from_buffer( - buffer: &'a mut [u8], - high_water_mark_ratio: Option, - ) -> anyhow::Result { - let buffer_len = validate_buffer_len(buffer)?; - let position = read_position(buffer)?; - let init_timestamp = extract_metadata_from_buffer(buffer)?; - let high_water_mark = calculate_high_water_mark(buffer_len, high_water_mark_ratio)?; - - // Find the highest timestamp in the journal - let highest_timestamp = Self::find_highest_timestamp(buffer)?; - let last_timestamp = highest_timestamp.unwrap_or(init_timestamp); - - Ok(Self { - position, - buffer, - high_water_mark, - high_water_mark_triggered: position >= high_water_mark, - initialized_at_unix_time_ns: init_timestamp, - last_timestamp, - }) - } - - /// Find the highest timestamp in the journal. - /// - /// Since timestamps are monotonically increasing, this simply returns the timestamp - /// from the last entry in the journal. - fn find_highest_timestamp(buffer: &[u8]) -> anyhow::Result> { - let array = read_bonjson_payload(buffer)?; - - if let Value::Array(entries) = array { - // Skip metadata (index 0) and get the last actual entry - // Since timestamps are monotonically increasing, the last entry has the highest timestamp - if entries.len() > 1 - && let Some(Value::Object(obj)) = entries.last() - { - return Ok(read_u64_field(obj, "t")); - } - } - - Ok(None) - } - - /// Get the next monotonically increasing timestamp. 
- /// - /// This ensures that even if the system clock goes backwards, timestamps remain - /// monotonically increasing by clamping to `last_timestamp` (reusing the same value). - /// This prevents artificial clock skew while maintaining ordering guarantees. - fn next_monotonic_timestamp(&mut self) -> anyhow::Result { - let current = current_timestamp()?; - let monotonic = std::cmp::max(current, self.last_timestamp); - self.last_timestamp = monotonic; - Ok(monotonic) - } - - fn set_position(&mut self, position: usize) { - self.position = position; - write_position(self.buffer, position); - self.check_high_water_mark(); - } - - fn check_high_water_mark(&mut self) { - if self.position >= self.high_water_mark { - self.trigger_high_water(); - } - } - - fn trigger_high_water(&mut self) { - self.high_water_mark_triggered = true; - } - - /// Write a versioned journal entry and return the timestamp. - fn write_versioned_entry(&mut self, key: &str, value: &Value) -> anyhow::Result { - // Get monotonically increasing timestamp before borrowing buffer - let timestamp = self.next_monotonic_timestamp()?; - - let buffer_len = self.buffer.len(); - let mut cursor = &mut self.buffer[self.position ..]; - - // Create entry object: {"t": timestamp, "k": key, "o": value} - // TODO(snowp): It would be nice to be able to pass impl AsRef for key or Cow to avoid - // allocating small strings repeatedly. - let entry = AHashMap::from([ - ("t".to_string(), Value::Unsigned(timestamp)), - ("k".to_string(), Value::String(key.to_string())), - ("o".to_string(), value.clone()), - ]); - - encode_into_buf(&mut cursor, &Value::Object(entry)) - .map_err(|e| anyhow::anyhow!("Failed to encode versioned entry: {e:?}"))?; - - let remaining = cursor.remaining_mut(); - self.set_position(buffer_len - remaining); - Ok(timestamp) - } - - /// Set a key-value pair. - /// Returns the timestamp of the operation. - /// - /// The timestamp is monotonically non-decreasing and serves as the version identifier. - /// If the system clock goes backwards, timestamps are clamped to maintain monotonicity. - pub fn set_versioned(&mut self, key: &str, value: &Value) -> anyhow::Result { - self.write_versioned_entry(key, value) - } - - /// Delete a key. - /// Returns the timestamp of the operation. - /// - /// The timestamp is monotonically non-decreasing and serves as the version identifier. - /// If the system clock goes backwards, timestamps are clamped to maintain monotonicity. - pub fn delete_versioned(&mut self, key: &str) -> anyhow::Result { - self.write_versioned_entry(key, &Value::Null) - } - - /// Get the high water mark position. - #[must_use] - pub fn high_water_mark(&self) -> usize { - self.high_water_mark - } - - /// Check if the high water mark has been triggered. - #[must_use] - pub fn is_high_water_mark_triggered(&self) -> bool { - self.high_water_mark_triggered - } - - /// Get the current buffer usage as a percentage (0.0 to 1.0). - #[must_use] - pub fn buffer_usage_ratio(&self) -> f32 { - #[allow(clippy::cast_precision_loss)] - let position_f32 = self.position as f32; - #[allow(clippy::cast_precision_loss)] - let buffer_len_f32 = self.buffer.len() as f32; - position_f32 / buffer_len_f32 - } - - /// Get the initialization timestamp. - #[must_use] - pub fn get_init_time(&self) -> u64 { - self.initialized_at_unix_time_ns - } - - /// Reconstruct the hashmap by replaying all journal entries. 
- pub fn as_hashmap(&self) -> anyhow::Result> { - let array = read_bonjson_payload(self.buffer)?; - let mut map = AHashMap::new(); - - if let Value::Array(entries) = array { - for (index, entry) in entries.iter().enumerate() { - // Skip metadata (first entry) - if index == 0 { - continue; - } - - if let Value::Object(obj) = entry { - // Extract key and operation from versioned entry - if let Some(Value::String(key)) = obj.get("k") - && let Some(operation) = obj.get("o") - { - if operation.is_null() { - map.remove(key); - } else { - map.insert(key.clone(), operation.clone()); - } - } - } - } - } - - Ok(map) - } - - /// Reconstruct the hashmap with timestamps by replaying all journal entries. - pub fn as_hashmap_with_timestamps(&self) -> anyhow::Result> { - let array = read_bonjson_payload(self.buffer)?; - let mut map = AHashMap::new(); - - if let Value::Array(entries) = array { - for (index, entry) in entries.iter().enumerate() { - // Skip metadata (first entry) - if index == 0 { - continue; - } - - if let Value::Object(obj) = entry { - // Extract key, operation, and timestamp from versioned entry - if let Some(Value::String(key)) = obj.get("k") - && let Some(operation) = obj.get("o") - { - // Extract timestamp (default to 0 if not found) - let timestamp = read_u64_field(obj, "t").unwrap_or(0); - - if operation.is_null() { - map.remove(key); - } else { - map.insert( - key.clone(), - TimestampedValue { - value: operation.clone(), - timestamp, - }, - ); - } - } - } - } - } - - Ok(map) - } - - /// Get a copy of the buffer for testing purposes - #[cfg(test)] - #[must_use] - pub fn buffer_copy(&self) -> Vec { - self.buffer.to_vec() - } -} - -/// Rotation utilities for creating new journals with compacted state -impl<'a> VersionedKVJournal<'a> { - /// Create a new journal initialized with the compacted state from a snapshot. - /// - /// The new journal will have all current key-value pairs written with their **original - /// timestamps** to preserve historical accuracy. The journal's monotonic timestamp - /// enforcement will respect the highest timestamp in the provided state. - /// - /// ## Timestamp Preservation and Snapshot Overlaps - /// - /// This function preserves the original timestamps of all entries, which means the new - /// journal's entry timestamps may overlap with or equal timestamps from the previous journal. - /// - /// Example during rotation: - /// ```text - /// Old journal (about to be archived as store.jrn.t300.zz): - /// - Entries: key="foo" t=100, key="foo" t=200, key="foo" t=300 - /// - Final state: foo=v3@300, bar=v1@200 - /// - rotation_timestamp = 300 (max of all timestamps) - /// - /// New journal (created by this function): - /// - Compacted entries: foo=v3@300, bar=v1@200 ← Original timestamps preserved! 
- /// - These timestamps (300, 200) may equal/overlap with old journal's range [100, 300] - /// - Future entries will have t >= 300 (enforced by last_timestamp initialization) - /// ``` - /// - /// ## Design Rationale - /// - /// Preserving original timestamps is **not strictly required** for point-in-time state - /// reconstruction (we could rewrite all compacted entries to `rotation_timestamp`), but it - /// provides benefits at zero cost: - /// - /// - **Implementation simplicity**: No timestamp rewriting logic needed - /// - **Semantic accuracy**: Preserves "when was this value last modified" for audit trails - /// - **Future-proof**: Maintains historical information that may be useful later - /// - **Zero overhead**: No performance difference vs rewriting timestamps - /// - /// Recovery systems bucket logs to snapshots using min/max timestamp ranges and replay - /// journals sequentially to reconstruct state at any point in time. - /// - /// # Arguments - /// * `buffer` - The buffer to write the new journal to - /// * `state` - The current key-value state with timestamps to write - /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark - /// - /// # Errors - /// Returns an error if serialization fails or buffer is too small. - pub fn create_rotated_journal( - buffer: &'a mut [u8], - state: &AHashMap, - high_water_mark_ratio: Option, - ) -> anyhow::Result { - // Create a new journal - let mut journal = Self::new(buffer, high_water_mark_ratio)?; - - // Find the maximum timestamp in the state to maintain monotonicity - let max_state_timestamp = state.values().map(|tv| tv.timestamp).max().unwrap_or(0); - - // Write all current state with their original timestamps - for (key, timestamped_value) in state { - let buffer_len = journal.buffer.len(); - let mut cursor = &mut journal.buffer[journal.position ..]; - - // Update last_timestamp to ensure monotonicity is maintained - // We use the actual timestamp from the entry, but track the maximum for future writes - journal.last_timestamp = std::cmp::max(journal.last_timestamp, timestamped_value.timestamp); - - // Create entry object: {"t": timestamp, "k": key, "o": value} - // TODO(snowp): It would be nice to be able to pass impl AsRef for key or Cow to avoid - // allocating small strings repeatedly. 
- let entry = AHashMap::from([ - ( - "t".to_string(), - Value::Unsigned(timestamped_value.timestamp), - ), - ("k".to_string(), Value::String(key.clone())), - ("o".to_string(), timestamped_value.value.clone()), - ]); - - encode_into_buf(&mut cursor, &Value::Object(entry)) - .map_err(|e| anyhow::anyhow!("Failed to encode state entry: {e:?}"))?; - - let remaining = cursor.remaining_mut(); - journal.set_position(buffer_len - remaining); - } - - // Ensure last_timestamp reflects the maximum timestamp we've written - journal.last_timestamp = std::cmp::max(journal.last_timestamp, max_state_timestamp); - - Ok(journal) - } -} diff --git a/bd-resilient-kv/src/lib.rs b/bd-resilient-kv/src/lib.rs index 17d70bd0..bd4e3f79 100644 --- a/bd-resilient-kv/src/lib.rs +++ b/bd-resilient-kv/src/lib.rs @@ -19,17 +19,9 @@ mod tests; pub mod kv_journal; pub mod kv_store; -pub mod versioned_kv_store; -pub mod versioned_recovery; +mod versioned_kv_journal; -pub use kv_journal::{ - DoubleBufferedKVJournal, - InMemoryKVJournal, - KVJournal, - MemMappedKVJournal, - MemMappedVersionedKVJournal, - VersionedKVJournal, -}; +pub use kv_journal::{DoubleBufferedKVJournal, InMemoryKVJournal, KVJournal, MemMappedKVJournal}; pub use kv_store::KVStore; -pub use versioned_kv_store::VersionedKVStore; -pub use versioned_recovery::VersionedRecovery; +pub use versioned_kv_journal::recovery::VersionedRecovery; +pub use versioned_kv_journal::store::VersionedKVStore; diff --git a/bd-resilient-kv/src/tests/mod.rs b/bd-resilient-kv/src/tests/mod.rs index af549ca9..1cb5bfc8 100644 --- a/bd-resilient-kv/src/tests/mod.rs +++ b/bd-resilient-kv/src/tests/mod.rs @@ -22,6 +22,8 @@ clippy::items_after_statements )] +use bd_proto::protos::state; + pub mod boundary_test; pub mod concurrency_test; pub mod double_buffered_automatic_switching_test; @@ -46,3 +48,12 @@ pub fn decompress_zlib(data: &[u8]) -> anyhow::Result> { decoder.read_to_end(&mut decompressed)?; Ok(decompressed) } + +pub fn make_string_value(s: &str) -> state::payload::StateValue { + state::payload::StateValue { + value_type: Some(state::payload::state_value::Value_type::StringValue( + s.to_string(), + )), + ..Default::default() + } +} diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index 3f8a0939..f041c783 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -8,16 +8,16 @@ #![allow(clippy::unwrap_used)] use crate::VersionedKVStore; -use crate::kv_journal::TimestampedValue; use crate::tests::decompress_zlib; -use bd_bonjson::Value; +use crate::versioned_kv_journal::{TimestampedValue, make_string_value}; +use bd_proto::protos::state::payload::StateValue; use tempfile::TempDir; #[test] fn empty_store() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Should start empty assert!(store.is_empty()); @@ -33,14 +33,14 @@ async fn basic_crud() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Insert some values let ts1 = store - .insert("key1".to_string(), Value::String("value1".to_string())) + .insert("key1".to_string(), make_string_value("value1")) .await?; let ts2 = store - .insert("key2".to_string(), 
Value::String("value2".to_string())) + .insert("key2".to_string(), make_string_value("value2")) .await?; assert_eq!(store.len(), 2); @@ -61,7 +61,7 @@ async fn basic_crud() -> anyhow::Result<()> { // Read back existing key let val = store.get("key2"); - assert_eq!(val, Some(&Value::String("value2".to_string()))); + assert_eq!(val, Some(&make_string_value("value2"))); // Read non-existent key let val = store.get("key1"); @@ -77,11 +77,13 @@ async fn test_persistence_and_reload() -> anyhow::Result<()> { // Create store and write some data let (ts1, ts2) = { - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; let ts1 = store - .insert("key1".to_string(), Value::String("value1".to_string())) + .insert("key1".to_string(), make_string_value("value1")) + .await?; + let ts2 = store + .insert("key2".to_string(), make_string_value("foo")) .await?; - let ts2 = store.insert("key2".to_string(), Value::Signed(42)).await?; store.sync()?; (ts1, ts2) @@ -94,14 +96,14 @@ async fn test_persistence_and_reload() -> anyhow::Result<()> { assert_eq!( store.get_with_timestamp("key1"), Some(&TimestampedValue { - value: Value::String("value1".to_string()), + value: make_string_value("value1"), timestamp: ts1, }) ); assert_eq!( store.get_with_timestamp("key2"), Some(&TimestampedValue { - value: Value::Signed(42), + value: make_string_value("42"), timestamp: ts2, }) ); @@ -115,16 +117,18 @@ async fn test_null_value_is_deletion() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Insert a value store - .insert("key1".to_string(), Value::String("value1".to_string())) + .insert("key1".to_string(), make_string_value("value1")) .await?; assert!(store.contains_key("key1")); - // Insert null to delete - store.insert("key1".to_string(), Value::Null).await?; + // Insert empty state to delete + store + .insert("key1".to_string(), StateValue::default()) + .await?; assert!(!store.contains_key("key1")); assert_eq!(store.len(), 0); @@ -136,14 +140,14 @@ async fn test_manual_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Insert some data let _ts1 = store - .insert("key1".to_string(), Value::String("value1".to_string())) + .insert("key1".to_string(), make_string_value("value1")) .await?; let ts2 = store - .insert("key2".to_string(), Value::String("value2".to_string())) + .insert("key2".to_string(), make_string_value("value2")) .await?; // Get max timestamp before rotation (this will be used in the archive name) @@ -163,34 +167,25 @@ async fn test_manual_rotation() -> anyhow::Result<()> { // Verify active journal still works let ts3 = store - .insert("key3".to_string(), Value::String("value3".to_string())) + .insert("key3".to_string(), make_string_value("value3")) .await?; assert!(ts3 >= ts2); assert_eq!(store.len(), 3); // Verify data is intact - assert_eq!( - store.get("key1"), - Some(&Value::String("value1".to_string())) - ); - assert_eq!( - store.get("key2"), - Some(&Value::String("value2".to_string())) - ); - assert_eq!( - store.get("key3"), - Some(&Value::String("value3".to_string())) - ); + assert_eq!(store.get("key1"), Some(&make_string_value("value1"))); + 
assert_eq!(store.get("key2"), Some(&make_string_value("value2"))); + assert_eq!(store.get("key3"), Some(&make_string_value("value3"))); // Decompress the archive and load it as a Store to verify that it contains the old state. let snapshot_store = make_store_from_snapshot_file(&temp_dir, &archived_path)?; assert_eq!( snapshot_store.get("key1"), - Some(&Value::String("value1".to_string())) + Some(&make_string_value("value1")) ); assert_eq!( snapshot_store.get("key2"), - Some(&Value::String("value2".to_string())) + Some(&make_string_value("value2")) ); assert_eq!(snapshot_store.len(), 2); @@ -202,16 +197,10 @@ async fn test_rotation_preserves_state() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - // Create complex state store - .insert("key1".to_string(), Value::String("value1".to_string())) - .await?; - store.insert("key2".to_string(), Value::Signed(42)).await?; - store.insert("key3".to_string(), Value::Bool(true)).await?; - store - .insert("key4".to_string(), Value::Float(3.14159)) + .insert("key1".to_string(), make_string_value("value1")) .await?; let pre_rotation_state = store.as_hashmap().clone(); @@ -230,7 +219,7 @@ async fn test_rotation_preserves_state() -> anyhow::Result<()> { // Verify we can continue writing let ts_new = store - .insert("key5".to_string(), Value::String("value5".to_string())) + .insert("key5".to_string(), make_string_value("value5")) .await?; assert!(ts_new >= pre_rotation_ts); assert_eq!(store.len(), 5); @@ -243,7 +232,7 @@ async fn test_empty_store_operations() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Operations on empty store assert_eq!(store.get("nonexistent"), None); @@ -261,18 +250,18 @@ async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { // Create store with small buffer to trigger rotation easily - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 2048, Some(0.5))?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 2048, Some(0.5))?; // Insert some keys and capture their timestamps let ts1 = store - .insert("key1".to_string(), Value::String("value1".to_string())) + .insert("key1".to_string(), make_string_value("value1")) .await?; // Small sleep to ensure different timestamps std::thread::sleep(std::time::Duration::from_millis(10)); let ts2 = store - .insert("key2".to_string(), Value::String("value2".to_string())) + .insert("key2".to_string(), make_string_value("value2")) .await?; // Verify timestamps are different @@ -281,7 +270,9 @@ async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { // Write enough data to trigger rotation for i in 0 .. 50 { - store.insert(format!("fill{i}"), Value::Signed(i)).await?; + store + .insert(format!("fill{i}"), make_string_value("foo")) + .await?; } // Verify that after rotation, the original timestamps are preserved @@ -318,14 +309,14 @@ async fn test_multiple_rotations() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; let mut rotation_timestamps = Vec::new(); // Perform multiple rotations for i in 0 .. 
3 { let key = format!("key{}", i); - let value = Value::String(format!("value{}", i)); + let value = make_string_value(&format!("value{}", i)); store.insert(key.clone(), value).await?; let timestamp = store .get_with_timestamp(&key) diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index 5d711a87..76a7affe 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -10,8 +10,8 @@ use crate::VersionedKVStore; use crate::tests::decompress_zlib; -use crate::versioned_recovery::VersionedRecovery; -use bd_bonjson::Value; +use crate::versioned_kv_journal::make_string_value; +use crate::versioned_kv_journal::recovery::VersionedRecovery; use tempfile::TempDir; /// Helper function to find archived journal files in a directory. @@ -59,10 +59,10 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { // Create a store with larger buffer to avoid BufferFull errors during test - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 2048, None)?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 2048, None)?; store - .insert("key1".to_string(), Value::String("value1".to_string())) + .insert("key1".to_string(), make_string_value("value1")) .await?; let ts1 = store .get_with_timestamp("key1") @@ -72,12 +72,14 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { std::thread::sleep(std::time::Duration::from_millis(10)); store - .insert("key2".to_string(), Value::String("value2".to_string())) + .insert("key2".to_string(), make_string_value("value2")) .await?; // Write more data to trigger rotation for i in 0 .. 20 { - store.insert(format!("key{i}"), Value::Signed(i)).await?; + store + .insert(format!("key{i}"), make_string_value("foo")) + .await?; } let ts_middle = store @@ -89,10 +91,7 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { // Write more after rotation store - .insert( - "final".to_string(), - Value::String("final_value".to_string()), - ) + .insert("final".to_string(), make_string_value("final_value")) .await?; let ts_final = store .get_with_timestamp("final") @@ -136,7 +135,7 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { assert!(state_final.contains_key("final")); assert_eq!( state_final.get("final").map(|tv| &tv.value), - Some(&Value::String("final_value".to_string())) + Some(&make_string_value("final_value")) ); Ok(()) @@ -148,7 +147,7 @@ async fn test_recovery_empty_journal() -> anyhow::Result<()> { // Create an empty store - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; store.sync()?; // Rotate to create snapshot @@ -175,8 +174,10 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - store.insert("key".to_string(), Value::Signed(1)).await?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + store + .insert("key".to_string(), make_string_value("1)")) + .await?; let ts1 = store .get_with_timestamp("key") .map(|tv| tv.timestamp) @@ -184,7 +185,9 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { std::thread::sleep(std::time::Duration::from_millis(10)); - store.insert("key".to_string(), Value::Signed(2)).await?; + 
store + .insert("key".to_string(), make_string_value("2)")) + .await?; let ts2 = store .get_with_timestamp("key") .map(|tv| tv.timestamp) @@ -192,7 +195,9 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { std::thread::sleep(std::time::Duration::from_millis(10)); - store.insert("key".to_string(), Value::Signed(3)).await?; + store + .insert("key".to_string(), make_string_value("3)")) + .await?; let ts3 = store .get_with_timestamp("key") .map(|tv| tv.timestamp) @@ -216,71 +221,19 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { let state_ts1 = recovery.recover_at_timestamp(ts1)?; assert_eq!( state_ts1.get("key").map(|tv| &tv.value), - Some(&Value::Signed(1)) + Some(&make_string_value("1")) ); let state_ts2 = recovery.recover_at_timestamp(ts2)?; assert_eq!( state_ts2.get("key").map(|tv| &tv.value), - Some(&Value::Signed(2)) + Some(&make_string_value("2")) ); let state_ts3 = recovery.recover_at_timestamp(ts3)?; assert_eq!( state_ts3.get("key").map(|tv| &tv.value), - Some(&Value::Signed(3)) - ); - - Ok(()) -} - -#[tokio::test] -async fn test_recovery_various_value_types() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; - store - .insert("string".to_string(), Value::String("hello".to_string())) - .await?; - store - .insert("number".to_string(), Value::Signed(42)) - .await?; - store - .insert("float".to_string(), Value::Float(3.14)) - .await?; - store.insert("bool".to_string(), Value::Bool(true)).await?; - store.sync()?; - - // Rotate to create snapshot - store.rotate_journal().await?; - - // Read the snapshot - let archived_files = find_archived_journals(temp_dir.path())?; - assert_eq!(archived_files.len(), 1); - let compressed_data = std::fs::read(&archived_files[0])?; - let decompressed_data = decompress_zlib(&compressed_data)?; - let snapshot_ts = extract_rotation_timestamp(&archived_files[0])?; - - let recovery = VersionedRecovery::new(vec![(&decompressed_data, snapshot_ts)])?; - - let state = recovery.recover_current()?; - assert_eq!(state.len(), 4); - assert_eq!( - state.get("string").map(|tv| &tv.value), - Some(&Value::String("hello".to_string())) - ); - assert_eq!( - state.get("number").map(|tv| &tv.value), - Some(&Value::Signed(42)) - ); - assert_eq!( - state.get("float").map(|tv| &tv.value), - Some(&Value::Float(3.14)) - ); - assert_eq!( - state.get("bool").map(|tv| &tv.value), - Some(&Value::Bool(true)) + Some(&make_string_value("3")) ); Ok(()) @@ -291,10 +244,10 @@ async fn test_recovery_at_timestamp() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; // Create a store and write some timestamped data - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; store - .insert("key1".to_string(), Value::String("value1".to_string())) + .insert("key1".to_string(), make_string_value("value1")) .await?; let ts1 = store .get_with_timestamp("key1") @@ -305,7 +258,7 @@ async fn test_recovery_at_timestamp() -> anyhow::Result<()> { std::thread::sleep(std::time::Duration::from_millis(10)); store - .insert("key2".to_string(), Value::String("value2".to_string())) + .insert("key2".to_string(), make_string_value("value2")) .await?; let ts2 = store .get_with_timestamp("key2") @@ -316,7 +269,7 @@ async fn test_recovery_at_timestamp() -> anyhow::Result<()> { std::thread::sleep(std::time::Duration::from_millis(10)); store - .insert("key1".to_string(), 
Value::String("updated1".to_string())) + .insert("key1".to_string(), make_string_value("updated1")) .await?; let ts3 = store .get_with_timestamp("key1") @@ -343,7 +296,7 @@ async fn test_recovery_at_timestamp() -> anyhow::Result<()> { assert_eq!(state_ts1.len(), 1); assert_eq!( state_ts1.get("key1").map(|tv| &tv.value), - Some(&Value::String("value1".to_string())) + Some(&make_string_value("value1")) ); // Recover at ts2: should have key1=value1, key2=value2 @@ -351,11 +304,11 @@ async fn test_recovery_at_timestamp() -> anyhow::Result<()> { assert_eq!(state_ts2.len(), 2); assert_eq!( state_ts2.get("key1").map(|tv| &tv.value), - Some(&Value::String("value1".to_string())) + Some(&make_string_value("value1")) ); assert_eq!( state_ts2.get("key2").map(|tv| &tv.value), - Some(&Value::String("value2".to_string())) + Some(&make_string_value("value2")) ); // Recover at ts3: should have key1=updated1, key2=value2 @@ -363,11 +316,11 @@ async fn test_recovery_at_timestamp() -> anyhow::Result<()> { assert_eq!(state_ts3.len(), 2); assert_eq!( state_ts3.get("key1").map(|tv| &tv.value), - Some(&Value::String("updated1".to_string())) + Some(&make_string_value("updated1")) ); assert_eq!( state_ts3.get("key2").map(|tv| &tv.value), - Some(&Value::String("value2".to_string())) + Some(&make_string_value("value2")) ); Ok(()) @@ -377,11 +330,11 @@ async fn test_recovery_at_timestamp() -> anyhow::Result<()> { async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; - let mut store = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; // Write some data before rotation store - .insert("key1".to_string(), Value::String("value1".to_string())) + .insert("key1".to_string(), make_string_value("value1")) .await?; let ts1 = store .get_with_timestamp("key1") @@ -391,7 +344,7 @@ async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { std::thread::sleep(std::time::Duration::from_millis(10)); store - .insert("key2".to_string(), Value::String("value2".to_string())) + .insert("key2".to_string(), make_string_value("value2")) .await?; let ts2 = store .get_with_timestamp("key2") @@ -405,7 +358,7 @@ async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { // Write data after rotation store - .insert("key3".to_string(), Value::String("value3".to_string())) + .insert("key3".to_string(), make_string_value("value3")) .await?; let ts3 = store .get_with_timestamp("key3") diff --git a/bd-resilient-kv/src/versioned_kv_journal/file_manager.rs b/bd-resilient-kv/src/versioned_kv_journal/file_manager.rs new file mode 100644 index 00000000..591de718 --- /dev/null +++ b/bd-resilient-kv/src/versioned_kv_journal/file_manager.rs @@ -0,0 +1,57 @@ +use std::path::{Path, PathBuf}; + +/// Find the active journal file by searching for the highest generation number. If we failed +/// to read the directory or there are no journal files, we return generation 0. 
+pub fn find_active_journal(dir: &Path, name: &str) -> (PathBuf, u64) { + // Search for generation-based journals + let pattern = format!("{name}.jrn."); + + let mut max_gen = 0u64; + let Ok(entries) = std::fs::read_dir(dir) else { + return (dir.join(format!("{name}.jrn.{max_gen}")), max_gen); + }; + + for entry in entries { + let Ok(entry) = entry else { + continue; + }; + + let filename = entry.file_name(); + let filename_str = filename.to_string_lossy(); + + if let Some(suffix) = filename_str.strip_prefix(&pattern) { + // Parse generation number (before any .zz or other extensions) + if let Some(gen_str) = suffix.split('.').next() + && let Ok(generation) = gen_str.parse::() + { + max_gen = max_gen.max(generation); + } + } + } + + let path = dir.join(format!("{name}.jrn.{max_gen}")); + (path, max_gen) +} + +/// Compress an archived journal using zlib. +/// +/// This function compresses the source file to the destination using zlib compression. +/// The compression is performed in a blocking task to avoid holding up the async runtime. +pub async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result<()> { + let source = source.to_owned(); + let dest = dest.to_owned(); + + tokio::task::spawn_blocking(move || { + use flate2::Compression; + use flate2::write::ZlibEncoder; + use std::io::{BufReader, copy}; + + let source_file = std::fs::File::open(&source)?; + let dest_file = std::fs::File::create(&dest)?; + let mut encoder = ZlibEncoder::new(dest_file, Compression::new(5)); + copy(&mut BufReader::new(source_file), &mut encoder)?; + encoder.finish()?; + Ok::<_, anyhow::Error>(()) + }) + .await? +} diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing.rs b/bd-resilient-kv/src/versioned_kv_journal/framing.rs new file mode 100644 index 00000000..9e1cebac --- /dev/null +++ b/bd-resilient-kv/src/versioned_kv_journal/framing.rs @@ -0,0 +1,212 @@ +// shared-core - bitdrift's common client/server libraries +// Copyright Bitdrift, Inc. All rights reserved. +// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + +//! Wire format framing for journal entries. +//! +//! Per-entry format: +//! ```text +//! [length: u32][timestamp_micros: varint][payload: bytes][crc32: u32] +//! ``` +//! +//! - `length`: Total length of the frame (timestamp + payload + crc) +//! - `timestamp_micros`: Microseconds since UNIX epoch (varint encoded) +//! - `payload`: Opaque binary data (format determined by caller) +//! - `crc32`: CRC32 checksum of (`timestamp_bytes` + payload) + +use bytes::BufMut; +use crc32fast::Hasher; + +/// Maximum varint size for u64 (10 bytes) +const MAX_VARINT_SIZE: usize = 10; + +/// Encode a u64 as a varint into the buffer. +/// Returns the number of bytes written. +pub fn encode_varint(value: u64, buf: &mut [u8]) -> usize { + let mut val = value; + let mut idx = 0; + + #[allow(clippy::cast_possible_truncation)] + while val >= 0x80 { + buf[idx] = (val as u8) | 0x80; + val >>= 7; + idx += 1; + } + #[allow(clippy::cast_possible_truncation)] + { + buf[idx] = val as u8; + } + idx + 1 +} + +/// Decode a varint from the buffer. +/// Returns (value, `bytes_read`) or None if buffer is incomplete/invalid. 
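+///
+/// For example, the two-byte sequence `[0xAC, 0x02]` decodes to `(300, 2)`; the same
+/// vector is exercised by this module's round-trip tests. A minimal sketch:
+///
+/// ```ignore
+/// assert_eq!(decode_varint(&[0xAC, 0x02]), Some((300, 2)));
+/// ```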
+#[must_use]
+pub fn decode_varint(buf: &[u8]) -> Option<(u64, usize)> {
+  let mut value: u64 = 0;
+  let mut shift = 0;
+
+  for (idx, &byte) in buf.iter().enumerate() {
+    if idx >= MAX_VARINT_SIZE {
+      return None; // Varint too long
+    }
+
+    value |= u64::from(byte & 0x7F) << shift;
+    shift += 7;
+
+    if byte & 0x80 == 0 {
+      return Some((value, idx + 1));
+    }
+  }
+
+  None // Incomplete varint
+}
+
+/// Frame structure for a journal entry.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Frame<M> {
+  /// Timestamp in microseconds since UNIX epoch.
+  pub timestamp_micros: u64,
+  /// Opaque payload data.
+  pub payload: M,
+}
+
+impl<M> Frame<M> {
+  pub fn decode_timestamp(buf: &[u8]) -> anyhow::Result<(u64, usize)> {
+    let (timestamp_micros, timestamp_len) =
+      decode_varint(buf).ok_or_else(|| anyhow::anyhow!("Invalid varint"))?;
+    Ok((timestamp_micros, timestamp_len))
+  }
+}
+
+impl<M: protobuf::Message> Frame<M> {
+  /// Create a new frame.
+  #[must_use]
+  pub fn new(timestamp_micros: u64, payload: M) -> Self {
+    Self {
+      timestamp_micros,
+      payload,
+    }
+  }
+
+  /// Calculate the encoded size of this frame.
+  #[must_use]
+  pub fn encoded_size(&self) -> usize {
+    // Calculate varint size
+    let mut temp_buf = [0u8; MAX_VARINT_SIZE];
+    let varint_size = encode_varint(self.timestamp_micros, &mut temp_buf);
+    let payload_size: usize = self.payload.compute_size().try_into().unwrap_or(0);
+
+    // length(4) + timestamp_varint + payload + crc(4)
+    4 + varint_size + payload_size + 4
+  }
+
+  /// Encode this frame into a buffer.
+  ///
+  /// # Errors
+  /// Returns an error if the buffer is too small.
+  pub fn encode(&self, buf: &mut [u8]) -> anyhow::Result<usize> {
+    let required_size = self.encoded_size();
+    if buf.len() < required_size {
+      anyhow::bail!(
+        "Buffer too small: need {} bytes, have {} bytes",
+        required_size,
+        buf.len()
+      );
+    }
+
+    let mut cursor = buf;
+
+    // Encode timestamp to calculate frame length
+    let mut timestamp_buf = [0u8; MAX_VARINT_SIZE];
+    let timestamp_len = encode_varint(self.timestamp_micros, &mut timestamp_buf);
+
+    let payload_bytes = self
+      .payload
+      .write_to_bytes()
+      .map_err(|e| anyhow::anyhow!("Failed to serialize payload: {}", e))?;
+
+    // Frame length = timestamp + payload + crc
+    let frame_len = timestamp_len + payload_bytes.len() + 4;
+    #[allow(clippy::cast_possible_truncation)]
+    {
+      cursor.put_u32_le(frame_len as u32);
+    }
+
+    // Write timestamp varint
+    cursor.put_slice(&timestamp_buf[.. timestamp_len]);
+
+    // Write payload
+    cursor.put_slice(&payload_bytes);
+
+    // Calculate CRC over timestamp + payload
+    let mut hasher = Hasher::new();
+    hasher.update(&timestamp_buf[.. timestamp_len]);
+    hasher.update(payload_bytes.as_slice());
+    let crc = hasher.finalize();
+
+    // Write CRC
+    cursor.put_u32_le(crc);
+
+    Ok(required_size)
+  }
+
+  /// Decode a frame from a buffer.
+  ///
+  /// Returns (Frame, `bytes_consumed`) or error if invalid/incomplete.
+  pub fn decode(buf: &[u8]) -> anyhow::Result<(Self, usize)> {
+    if buf.len() < 4 {
+      anyhow::bail!("Buffer too small for length field");
+    }
+
+    // Read frame length
+    let frame_len = u32::from_le_bytes(buf[0 .. 4].try_into()?) as usize;
+
+    // Check if we have the complete frame
+    let total_len = 4 + frame_len; // length field + frame
+    if buf.len() < total_len {
+      anyhow::bail!(
+        "Incomplete frame: need {} bytes, have {} bytes",
+        total_len,
+        buf.len()
+      );
+    }
+
+    let frame_data = &buf[4 ..
total_len]; + + // Decode timestamp varint + let (timestamp_micros, timestamp_len) = + decode_varint(frame_data).ok_or_else(|| anyhow::anyhow!("Invalid varint"))?; + + // Extract payload and CRC + if frame_data.len() < timestamp_len + 4 { + anyhow::bail!("Frame too small for CRC"); + } + + let payload_end = frame_data.len() - 4; + let payload = frame_data[timestamp_len .. payload_end].to_vec(); + let stored_crc = u32::from_le_bytes(frame_data[payload_end ..].try_into()?); + + // Verify CRC + let mut hasher = Hasher::new(); + hasher.update(&frame_data[.. timestamp_len]); // timestamp bytes + hasher.update(&payload); // payload + let computed_crc = hasher.finalize(); + + if stored_crc != computed_crc { + anyhow::bail!("CRC mismatch: expected 0x{stored_crc:08x}, got 0x{computed_crc:08x}"); + } + + let payload = M::parse_from_bytes(&payload) + .map_err(|e| anyhow::anyhow!("Failed to parse payload: {}", e))?; + + Ok((Self::new(timestamp_micros, payload), total_len)) + } +} + +#[cfg(test)] +#[path = "./framing_test.rs"] +mod tests; diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs b/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs new file mode 100644 index 00000000..6ee5f13c --- /dev/null +++ b/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs @@ -0,0 +1,208 @@ +// shared-core - bitdrift's common client/server libraries +// Copyright Bitdrift, Inc. All rights reserved. +// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + +#![allow(clippy::unwrap_used)] + +use super::*; +use crate::tests::make_string_value; +use bd_proto::protos::state::payload::StateValue; + +#[test] +fn varint_encoding() { + let test_cases = vec![ + (0u64, vec![0x00]), + (1u64, vec![0x01]), + (127u64, vec![0x7F]), + (128u64, vec![0x80, 0x01]), + (300u64, vec![0xAC, 0x02]), + (16_384u64, vec![0x80, 0x80, 0x01]), + ( + u64::MAX, + vec![0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01], + ), + ]; + + for (value, expected) in test_cases { + let mut buf = [0u8; MAX_VARINT_SIZE]; + let len = encode_varint(value, &mut buf); + assert_eq!(&buf[.. 
len], &expected[..], "Failed for value {value}"); + } +} + +#[test] +fn varint_decoding() { + let test_cases = vec![ + (vec![0x00], 0u64, 1), + (vec![0x01], 1u64, 1), + (vec![0x7F], 127u64, 1), + (vec![0x80, 0x01], 128u64, 2), + (vec![0xAC, 0x02], 300u64, 2), + (vec![0x80, 0x80, 0x01], 16_384u64, 3), + ( + vec![0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01], + u64::MAX, + 10, + ), + ]; + + for (buf, expected_value, expected_len) in test_cases { + let (value, len) = decode_varint(&buf).unwrap(); + assert_eq!(value, expected_value, "Failed for buffer {buf:?}"); + assert_eq!(len, expected_len, "Wrong length for buffer {buf:?}"); + } +} + +#[test] +fn varint_roundtrip() { + let values = vec![0, 1, 127, 128, 255, 256, 65535, 65536, 1_000_000, u64::MAX]; + + for value in values { + let mut buf = [0u8; MAX_VARINT_SIZE]; + let encoded_len = encode_varint(value, &mut buf); + let (decoded_value, decoded_len) = decode_varint(&buf).unwrap(); + + assert_eq!(decoded_value, value, "Roundtrip failed for {value}"); + assert_eq!(decoded_len, encoded_len, "Length mismatch for {value}"); + } +} + +#[test] +fn varint_incomplete() { + // Incomplete varint (has continuation bit but no next byte) + let buf = vec![0x80]; + assert!(decode_varint(&buf).is_none()); +} + +#[test] +fn varint_too_long() { + // 11 bytes (exceeds MAX_VARINT_SIZE) + let buf = vec![ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, + ]; + assert!(decode_varint(&buf).is_none()); +} + +#[test] +fn frame_encode_decode() { + let frame = Frame::new(1_700_000_000_000_000, make_string_value("value")); + + let mut buf = vec![0u8; 1024]; + let encoded_len = frame.encode(&mut buf).unwrap(); + + let (decoded_frame, decoded_len) = Frame::::decode(&buf).unwrap(); + + assert_eq!(decoded_frame, frame); + assert_eq!(decoded_len, encoded_len); +} + +#[test] +fn frame_with_delete() { + let frame = Frame::new(1_700_000_000_000_000, make_string_value("")); + + let mut buf = vec![0u8; 1024]; + let encoded_len = frame.encode(&mut buf).unwrap(); + + let (decoded_frame, decoded_len) = Frame::::decode(&buf).unwrap(); + + assert_eq!(decoded_frame, frame); + assert_eq!(decoded_len, encoded_len); +} + +#[test] +fn frame_empty_payload() { + let frame = Frame::new(1_700_000_000_000_000, StateValue::default()); + + let mut buf = vec![0u8; 1024]; + let encoded_len = frame.encode(&mut buf).unwrap(); + + let (decoded_frame, decoded_len) = Frame::::decode(&buf).unwrap(); + + assert_eq!(decoded_frame, frame); + assert_eq!(decoded_len, encoded_len); +} + +#[test] +fn frame_various_timestamps() { + let timestamps = vec![0, 1, 127, 128, 1_000_000, 1_700_000_000_000_000, u64::MAX]; + + for timestamp in timestamps { + let frame = Frame::new(timestamp, make_string_value("test")); + let mut buf = vec![0u8; 1024]; + let encoded_len = frame.encode(&mut buf).unwrap(); + let (decoded_frame, decoded_len) = Frame::::decode(&buf).unwrap(); + + assert_eq!(decoded_frame.timestamp_micros, timestamp); + assert_eq!(decoded_frame.payload, make_string_value("test")); + assert_eq!(decoded_len, encoded_len); + } +} + +#[test] +fn frame_buffer_too_small() { + let frame = Frame::new(1_700_000_000_000_000, make_string_value("key:value")); + let mut buf = vec![0u8; 5]; // Too small + + let result = frame.encode(&mut buf); + assert!(result.is_err()); +} + +#[test] +fn frame_incomplete_length() { + let buf = vec![0x01, 0x02]; // Only 2 bytes (need 4 for length) + + let result = Frame::::decode(&buf); + assert!(result.is_err()); +} + +#[test] +fn frame_incomplete_data() { + 
// Frame says it needs 100 bytes but we only provide 20 + let mut buf = vec![0u8; 20]; + buf[0 .. 4].copy_from_slice(&100u32.to_le_bytes()); + + let result = Frame::::decode(&buf); + assert!(result.is_err()); +} + +#[test] +fn frame_crc_mismatch() { + let frame = Frame::new(1_700_000_000_000_000, make_string_value("key:value")); + + let mut buf = vec![0u8; 1024]; + let encoded_len = frame.encode(&mut buf).unwrap(); + + // Corrupt the CRC + buf[encoded_len - 1] ^= 0xFF; + + let result = Frame::::decode(&buf); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("CRC mismatch")); +} + +#[test] +fn frame_multiple_frames() { + let frame1 = Frame::new(1000, make_string_value("first")); + let frame2 = Frame::new(2000, make_string_value("second")); + let frame3 = Frame::new(3000, make_string_value("third")); + + let mut buf = vec![0u8; 1024]; + let len1 = frame1.encode(&mut buf).unwrap(); + let len2 = frame2.encode(&mut buf[len1 ..]).unwrap(); + let len3 = frame3.encode(&mut buf[len1 + len2 ..]).unwrap(); + + // Decode all three + let (decoded1, consumed1) = Frame::::decode(&buf).unwrap(); + let (decoded2, consumed2) = Frame::::decode(&buf[consumed1 ..]).unwrap(); + let (decoded3, consumed3) = Frame::::decode(&buf[consumed1 + consumed2 ..]).unwrap(); + + assert_eq!(decoded1, frame1); + assert_eq!(decoded2, frame2); + assert_eq!(decoded3, frame3); + assert_eq!(consumed1, len1); + assert_eq!(consumed2, len2); + assert_eq!(consumed3, len3); +} diff --git a/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs b/bd-resilient-kv/src/versioned_kv_journal/memmapped_versioned.rs similarity index 77% rename from bd-resilient-kv/src/kv_journal/memmapped_versioned.rs rename to bd-resilient-kv/src/versioned_kv_journal/memmapped_versioned.rs index 6a1ae2e1..ba06f231 100644 --- a/bd-resilient-kv/src/kv_journal/memmapped_versioned.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/memmapped_versioned.rs @@ -5,9 +5,9 @@ // LICENSE file or at: // https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt -use super::versioned::{TimestampedValue, VersionedKVJournal}; +use super::versioned::VersionedKVJournal; +use crate::versioned_kv_journal::TimestampedValue; use ahash::AHashMap; -use bd_bonjson::Value; use memmap2::{MmapMut, MmapOptions}; use std::fs::OpenOptions; use std::path::Path; @@ -118,30 +118,10 @@ impl MemMappedVersionedKVJournal { Ok(Self { mmap, versioned_kv }) } - /// Set a key-value pair with automatic timestamp assignment. - /// + /// Insert a new entry into the journal with the given payload. /// Returns the timestamp of the operation. - /// - /// # Errors - /// Returns an error if the journal entry cannot be written. - pub fn set_versioned(&mut self, key: &str, value: &Value) -> anyhow::Result { - self.versioned_kv.set_versioned(key, value) - } - - /// Delete a key with automatic timestamp assignment. - /// - /// Returns the timestamp of the operation. - /// - /// # Errors - /// Returns an error if the journal entry cannot be written. - pub fn delete_versioned(&mut self, key: &str) -> anyhow::Result { - self.versioned_kv.delete_versioned(key) - } - - /// Get the current high water mark position. - #[must_use] - pub fn high_water_mark(&self) -> usize { - self.versioned_kv.high_water_mark() + pub fn insert_entry(&mut self, message: impl protobuf::MessageFull) -> anyhow::Result { + self.versioned_kv.insert_entry(message) } /// Check if the high water mark has been triggered. 
@@ -150,26 +130,12 @@ impl MemMappedVersionedKVJournal { self.versioned_kv.is_high_water_mark_triggered() } - /// Get the current buffer usage as a percentage (0.0 to 1.0). - #[must_use] - pub fn buffer_usage_ratio(&self) -> f32 { - self.versioned_kv.buffer_usage_ratio() - } - - /// Reconstruct the hashmap by replaying all journal entries. - /// - /// # Errors - /// Returns an error if the buffer cannot be decoded. - pub fn as_hashmap(&self) -> anyhow::Result> { - self.versioned_kv.as_hashmap() - } - /// Reconstruct the hashmap with timestamps by replaying all journal entries. /// /// # Errors /// Returns an error if the buffer cannot be decoded. pub fn as_hashmap_with_timestamps(&self) -> anyhow::Result> { - self.versioned_kv.as_hashmap_with_timestamps() + self.versioned_kv.to_hashmap_with_timestamps() } /// Synchronize changes to disk. @@ -186,17 +152,4 @@ impl MemMappedVersionedKVJournal { pub fn sync(&self) -> anyhow::Result<()> { self.mmap.flush().map_err(Into::into) } - - /// Get the size of the underlying file in bytes. - #[must_use] - pub fn file_size(&self) -> usize { - self.mmap.len() - } - - /// Get a copy of the buffer for testing purposes - #[cfg(test)] - #[must_use] - pub fn buffer_copy(&self) -> Vec { - self.versioned_kv.buffer_copy() - } } diff --git a/bd-resilient-kv/src/versioned_kv_journal/mod.rs b/bd-resilient-kv/src/versioned_kv_journal/mod.rs new file mode 100644 index 00000000..0a62522b --- /dev/null +++ b/bd-resilient-kv/src/versioned_kv_journal/mod.rs @@ -0,0 +1,28 @@ +use bd_proto::protos::state; + +mod file_manager; +mod framing; +mod memmapped_versioned; +pub mod recovery; +pub mod store; +mod versioned; + +/// Represents a value with its associated timestamp. +#[derive(Debug, Clone, PartialEq)] +pub struct TimestampedValue { + /// The value stored in the key-value store. + pub value: state::payload::StateValue, + + /// The timestamp (in microseconds since UNIX epoch) when this value was last written. + pub timestamp: u64, +} + +#[cfg(test)] +pub fn make_string_value(s: &str) -> state::payload::StateValue { + state::payload::StateValue { + value_type: Some(state::payload::state_value::Value_type::StringValue( + s.to_string(), + )), + ..Default::default() + } +} diff --git a/bd-resilient-kv/src/versioned_recovery.rs b/bd-resilient-kv/src/versioned_kv_journal/recovery.rs similarity index 72% rename from bd-resilient-kv/src/versioned_recovery.rs rename to bd-resilient-kv/src/versioned_kv_journal/recovery.rs index 70d038ec..7c209b5f 100644 --- a/bd-resilient-kv/src/versioned_recovery.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/recovery.rs @@ -5,10 +5,10 @@ // LICENSE file or at: // https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt -use crate::kv_journal::TimestampedValue; +use crate::versioned_kv_journal::TimestampedValue; +use crate::versioned_kv_journal::framing::Frame; use ahash::AHashMap; -use bd_bonjson::Value; -use bd_bonjson::decoder::from_slice; +use bd_proto::protos::state::payload::StateKeyValuePair; /// A utility for recovering state at arbitrary timestamps from journal snapshots. /// @@ -36,7 +36,7 @@ impl VersionedRecovery { /// Create a new recovery utility from a list of uncompressed snapshot byte slices. /// /// The snapshots should be provided in chronological order (oldest to newest). - /// Each snapshot must be a valid uncompressed versioned journal (VERSION 2 format). + /// Each snapshot must be a valid uncompressed versioned journal (VERSION 3 format). 
/// /// # Arguments /// @@ -153,71 +153,22 @@ fn replay_journal_to_timestamp( target_timestamp: u64, map: &mut AHashMap, ) -> anyhow::Result<()> { - let array = read_bonjson_payload(buffer)?; - - let Value::Array(entries) = &array else { - return Ok(()); - }; - - for (index, entry) in entries.iter().enumerate() { - // Skip metadata (first entry) - if index == 0 { - continue; - } - - let Value::Object(obj) = entry else { - continue; - }; - - // Extract timestamp (skip entries without timestamp) - let Some(entry_timestamp) = read_u64_field(obj, "t") else { - continue; - }; - - // Only apply entries up to target timestamp - if entry_timestamp > target_timestamp { - break; - } - - let (Some(Value::String(key)), Some(operation)) = (obj.get("k"), obj.get("o")) else { - continue; - }; - - // Extract key and operation - if operation.is_null() { - map.remove(key); - } else { - map.insert( - key.clone(), - TimestampedValue { - value: operation.clone(), - timestamp: entry_timestamp, - }, - ); - } - } - - Ok(()) -} - -/// Read the bonjson payload from a snapshot buffer. -fn read_bonjson_payload(buffer: &[u8]) -> anyhow::Result { - const HEADER_SIZE: usize = 16; - const ARRAY_BEGIN: usize = 16; + // Skip the header (17 bytes: version + position + reserved) + const HEADER_SIZE: usize = 17; if buffer.len() < HEADER_SIZE { anyhow::bail!("Buffer too small: {}", buffer.len()); } - // Read position from header + // Read position from header (bytes 8-15) let position_bytes: [u8; 8] = buffer[8 .. 16] .try_into() .map_err(|_| anyhow::anyhow!("Failed to read position"))?; #[allow(clippy::cast_possible_truncation)] let position = u64::from_le_bytes(position_bytes) as usize; - if position < ARRAY_BEGIN { - anyhow::bail!("Invalid position: {position}, must be at least {ARRAY_BEGIN}"); + if position < HEADER_SIZE { + anyhow::bail!("Invalid position: {position}, must be at least {HEADER_SIZE}"); } if position > buffer.len() { @@ -227,31 +178,43 @@ fn read_bonjson_payload(buffer: &[u8]) -> anyhow::Result { ); } - let slice_to_decode = &buffer[ARRAY_BEGIN .. position]; - - match from_slice(slice_to_decode) { - Ok((_, decoded)) => Ok(decoded), - Err(bd_bonjson::decoder::DecodeError::Partial { partial_value, .. }) => Ok(partial_value), - Err(e) => anyhow::bail!("Failed to decode buffer: {e:?}"), + // Decode frames from the journal data + let mut offset = 0; + let data = &buffer[HEADER_SIZE .. position]; + + while offset < data.len() { + match Frame::::decode(&data[offset ..]) { + Ok((frame, bytes_read)) => { + // Only apply entries up to target timestamp + if frame.timestamp_micros > target_timestamp { + break; + } + + if let Some(value) = frame.payload.value.into_option() { + // Insertion - parse the value string back to a Value + // For now, we store everything as strings since that's what the current + // implementation does. In the future, you can parse the value_str to + // reconstruct the original type. + map.insert( + frame.payload.key, + TimestampedValue { + value, + timestamp: frame.timestamp_micros, + }, + ); + } else { + // Deletion + map.remove(&frame.payload.key); + } + + offset += bytes_read; + }, + Err(_) => { + // End of valid data or corrupted frame + break; + }, + } } -} -/// Helper function to read a u64 field from a BONJSON object. -/// -/// BONJSON's decoder automatically converts unsigned values that fit in i64 to signed values -/// during decoding (see bd-bonjson/src/decoder.rs:227-234). 
This means that even though we -/// write `Value::Unsigned(version)`, the decoder returns `Value::Signed(version as i64)`. -/// -/// TODO(snowp): Consider changing BONJSON's decoder to preserve the original unsigned type -/// to avoid this normalization behavior and eliminate the need for this helper. -fn read_u64_field(obj: &AHashMap, key: &str) -> Option { - match obj.get(key) { - Some(Value::Unsigned(v)) => Some(*v), - Some(Value::Signed(v)) if *v >= 0 => - { - #[allow(clippy::cast_sign_loss)] - Some(*v as u64) - }, - _ => None, - } + Ok(()) } diff --git a/bd-resilient-kv/src/versioned_kv_store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs similarity index 69% rename from bd-resilient-kv/src/versioned_kv_store.rs rename to bd-resilient-kv/src/versioned_kv_journal/store.rs index 50824427..fa1c84e8 100644 --- a/bd-resilient-kv/src/versioned_kv_store.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs @@ -5,73 +5,18 @@ // LICENSE file or at: // https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt -use crate::kv_journal::{MemMappedVersionedKVJournal, TimestampedValue, VersionedKVJournal}; +use crate::versioned_kv_journal::TimestampedValue; +use crate::versioned_kv_journal::file_manager::{self, compress_archived_journal}; +use crate::versioned_kv_journal::memmapped_versioned::MemMappedVersionedKVJournal; +use crate::versioned_kv_journal::versioned::VersionedKVJournal; use ahash::AHashMap; -use bd_bonjson::Value; +use bd_proto::protos::state::payload::{StateKeyValuePair, StateValue}; use std::path::{Path, PathBuf}; -/// Find the active journal file by searching for the highest generation number. -/// -/// Returns the path to the journal and its generation number, or None if no journal exists. -/// Supports both legacy journals (`name.jrn`) and generation-based journals (`name.jrn.N`). -fn find_active_journal(dir: &Path, name: &str) -> anyhow::Result> { - // First check for legacy journal format (name.jrn without generation) - let legacy_path = dir.join(format!("{name}.jrn")); - if legacy_path.exists() { - // Migrate legacy journal to generation 0 - let gen_path = dir.join(format!("{name}.jrn.0")); - if !gen_path.exists() { - std::fs::rename(&legacy_path, &gen_path)?; - return Ok(Some((gen_path, 0))); - } - } - - // Search for generation-based journals - let pattern = format!("{name}.jrn."); - - let mut max_gen = None; - for entry in std::fs::read_dir(dir)? { - let entry = entry?; - let filename = entry.file_name(); - let filename_str = filename.to_string_lossy(); - - if let Some(suffix) = filename_str.strip_prefix(&pattern) { - // Parse generation number (before any .zz or other extensions) - if let Some(gen_str) = suffix.split('.').next() - && let Ok(generation) = gen_str.parse::() - { - max_gen = Some(max_gen.map_or(generation, |current: u64| current.max(generation))); - } - } - } - - Ok(max_gen.map(|generation| { - let path = dir.join(format!("{name}.jrn.{generation}")); - (path, generation) - })) -} - -/// Compress an archived journal using zlib. -/// -/// This function compresses the source file to the destination using zlib compression. -/// The compression is performed in a blocking task to avoid holding up the async runtime. 
-async fn compress_archived_journal(source: &Path, dest: &Path) -> anyhow::Result<()> { - let source = source.to_owned(); - let dest = dest.to_owned(); - - tokio::task::spawn_blocking(move || { - use flate2::Compression; - use flate2::write::ZlibEncoder; - use std::io::{BufReader, copy}; - - let source_file = std::fs::File::open(&source)?; - let dest_file = std::fs::File::create(&dest)?; - let mut encoder = ZlibEncoder::new(dest_file, Compression::new(5)); - copy(&mut BufReader::new(source_file), &mut encoder)?; - encoder.finish()?; - Ok::<_, anyhow::Error>(()) - }) - .await? +#[derive(Debug)] +pub enum DataLoss { + Total, + None, } /// A persistent key-value store with timestamp tracking. @@ -106,51 +51,54 @@ impl VersionedKVStore { /// data. If the specified size is smaller and the existing data doesn't fit, a fresh journal /// will be created. /// - /// # Arguments - /// * `dir_path` - Directory path where the journal will be stored - /// * `name` - Base name for the journal (e.g., "store" will create "store.jrn.0") - /// * `buffer_size` - Size in bytes for the journal buffer - /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 - /// /// # Errors - /// Returns an error if the journal file cannot be created/opened or if initialization fails. + /// Returns an error if we failed to create or open the journal file. pub fn new>( dir_path: P, name: &str, buffer_size: usize, high_water_mark_ratio: Option, - ) -> anyhow::Result { + ) -> anyhow::Result<(Self, DataLoss)> { let dir = dir_path.as_ref(); - // Find or create journal with generation tracking - let (journal_path, generation) = find_active_journal(dir, name)?.unwrap_or_else(|| { - let path = dir.join(format!("{name}.jrn.0")); - (path, 0) - }); + let (journal_path, generation) = file_manager::find_active_journal(dir, name); - let journal = if journal_path.exists() { + let (journal, data_loss) = if journal_path.exists() { // Try to open existing journal MemMappedVersionedKVJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio) + .map(|j| (j, DataLoss::None)) .or_else(|_| { + // TODO(snowp): Distinguish between partial and total data loss. + // Data is corrupt or unreadable, create fresh journal - MemMappedVersionedKVJournal::new(&journal_path, buffer_size, high_water_mark_ratio) + Ok::<_, anyhow::Error>(( + MemMappedVersionedKVJournal::new(&journal_path, buffer_size, high_water_mark_ratio)?, + DataLoss::Total, + )) })? } else { // Create new journal - MemMappedVersionedKVJournal::new(&journal_path, buffer_size, high_water_mark_ratio)? + + ( + MemMappedVersionedKVJournal::new(&journal_path, buffer_size, high_water_mark_ratio)?, + DataLoss::None, + ) }; let cached_map = journal.as_hashmap_with_timestamps()?; - Ok(Self { - journal, - cached_map, - dir_path: dir.to_path_buf(), - journal_name: name.to_string(), - buffer_size, - high_water_mark_ratio, - current_generation: generation, - }) + Ok(( + Self { + journal, + cached_map, + dir_path: dir.to_path_buf(), + journal_name: name.to_string(), + buffer_size, + high_water_mark_ratio, + current_generation: generation, + }, + data_loss, + )) } /// Open an existing `VersionedKVStore` from a pre-existing journal file. @@ -178,9 +126,7 @@ impl VersionedKVStore { ) -> anyhow::Result { let dir = dir_path.as_ref(); - // Find existing journal (fail if not found) - let (journal_path, generation) = find_active_journal(dir, name)? 
- .ok_or_else(|| anyhow::anyhow!("No journal file found for '{name}'"))?; + let (journal_path, generation) = file_manager::find_active_journal(dir, name); let journal = MemMappedVersionedKVJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio)?; @@ -201,7 +147,7 @@ impl VersionedKVStore { /// /// This operation is O(1) as it reads from the in-memory cache. #[must_use] - pub fn get(&self, key: &str) -> Option<&Value> { + pub fn get(&self, key: &str) -> Option<&StateValue> { self.cached_map.get(key).map(|tv| &tv.value) } @@ -219,14 +165,21 @@ impl VersionedKVStore { /// /// # Errors /// Returns an error if the value cannot be written to the journal. - pub async fn insert(&mut self, key: String, value: Value) -> anyhow::Result { - let timestamp = if matches!(value, Value::Null) { + pub async fn insert(&mut self, key: String, value: StateValue) -> anyhow::Result { + let timestamp = if value.value_type.is_none() { // Inserting null is equivalent to deletion - let timestamp = self.journal.delete_versioned(&key)?; + let timestamp = self.journal.insert_entry(StateKeyValuePair { + key: key.clone(), + ..Default::default() + })?; self.cached_map.remove(&key); timestamp } else { - let timestamp = self.journal.set_versioned(&key, &value)?; + let timestamp = self.journal.insert_entry(StateKeyValuePair { + key: key.clone(), + value: Some(value.clone()).into(), + ..Default::default() + })?; self .cached_map .insert(key, TimestampedValue { value, timestamp }); @@ -252,7 +205,10 @@ impl VersionedKVStore { return Ok(None); } - let timestamp = self.journal.delete_versioned(key)?; + let timestamp = self.journal.insert_entry(StateKeyValuePair { + key: key.to_string(), + ..Default::default() + })?; self.cached_map.remove(key); // Check if rotation is needed @@ -299,9 +255,6 @@ impl VersionedKVStore { /// /// This is a blocking operation that performs synchronous I/O. In async contexts, /// consider wrapping this call with `tokio::task::spawn_blocking`. - /// - /// # Errors - /// Returns an error if the sync operation fails. pub fn sync(&self) -> anyhow::Result<()> { self.journal.sync() } @@ -312,9 +265,6 @@ impl VersionedKVStore { /// The archived journal will be compressed using zlib to reduce storage size. /// Rotation typically happens automatically when the high water mark is reached, but this /// method allows manual control when needed. - /// - /// # Errors - /// Returns an error if rotation fails. pub async fn rotate_journal(&mut self) -> anyhow::Result<()> { // Increment generation counter for new journal let next_generation = self.current_generation + 1; diff --git a/bd-resilient-kv/src/versioned_kv_journal/versioned.rs b/bd-resilient-kv/src/versioned_kv_journal/versioned.rs new file mode 100644 index 00000000..93f684b6 --- /dev/null +++ b/bd-resilient-kv/src/versioned_kv_journal/versioned.rs @@ -0,0 +1,365 @@ +// shared-core - bitdrift's common client/server libraries +// Copyright Bitdrift, Inc. All rights reserved. 
+// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + +use super::framing::Frame; +use crate::versioned_kv_journal::TimestampedValue; +use ahash::AHashMap; +use bd_client_common::error::InvariantError; +use bd_proto::protos::state::payload::StateKeyValuePair; +use std::time::{SystemTime, UNIX_EPOCH}; + +/// Timestamped implementation of a key-value journaling system that uses timestamps +/// as the version identifier for point-in-time recovery. +/// +/// Each write operation is assigned a monotonically non-decreasing timestamp (in microseconds +/// since UNIX epoch), enabling exact state reconstruction at any historical timestamp. +/// The monotonicity is enforced by clamping: if the system clock goes backwards, we reuse +/// the same timestamp value to maintain ordering guarantees. When timestamps collide, +/// journal ordering determines precedence. +#[derive(Debug)] +pub struct VersionedKVJournal<'a> { + position: usize, + buffer: &'a mut [u8], + high_water_mark: usize, + high_water_mark_triggered: bool, + last_timestamp: u64, // Most recent timestamp written (for monotonic enforcement) +} + +// Versioned KV files have the following structure: +// | Position | Data | Type | +// |----------|--------------------------|----------------| +// | 0 | Format Version | u64 | +// | 8 | Position | u64 | +// | 16 | Reserved | u8 | +// | 17 | Frame 1 | Framed Entry | +// | ... | Frame 2 | Framed Entry | +// | ... | Frame N | Framed Entry | +// +// Frame format: [length: u32][timestamp_micros: varint][payload: bytes][crc32: u32] +// +// For testing, payload format is: +// - Set operation: "key:value" (UTF-8 string) +// - Delete operation: "key:null" (UTF-8 string) +// +// # Timestamp Semantics +// +// Timestamps serve as both version identifiers and logical clocks with monotonic guarantees: +// - Each write gets a timestamp that is guaranteed to be >= previous writes (non-decreasing) +// - If system clock goes backward, timestamps are clamped to last_timestamp (reuse same value) +// - When timestamps collide, journal ordering determines precedence +// - This ensures total ordering while allowing correlation with external timestamped systems + +const VERSION: u64 = 3; // The versioned format version (bumped for new framing format) + +const HEADER_SIZE: usize = 17; + +// Minimum buffer size for a valid journal +const MIN_BUFFER_SIZE: usize = HEADER_SIZE + 4; + +/// Get current timestamp in microseconds since UNIX epoch. +fn current_timestamp() -> anyhow::Result { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|_| InvariantError::Invariant.into()) + .map(|d| { + #[allow(clippy::cast_possible_truncation)] + { + d.as_micros() as u64 + } + }) +} + + +/// Write to the version field of a journal buffer. +fn write_version_field(buffer: &mut [u8], version: u64) { + let version_bytes = version.to_le_bytes(); + buffer[0 .. 8].copy_from_slice(&version_bytes); +} + +/// Write the version to a journal buffer. +fn write_version(buffer: &mut [u8]) { + write_version_field(buffer, VERSION); +} + +fn read_position(buffer: &[u8]) -> anyhow::Result { + let position_bytes: [u8; 8] = buffer[8 .. 
16].try_into()?; + let position_u64 = u64::from_le_bytes(position_bytes); + let position = usize::try_from(position_u64) + .map_err(|_| anyhow::anyhow!("Position {position_u64} too large for usize"))?; + let buffer_len = buffer.len(); + if position >= buffer_len { + anyhow::bail!("Invalid position: {position}, buffer size: {buffer_len}",); + } + Ok(position) +} + +/// Write the position to a journal buffer. +fn write_position(buffer: &mut [u8], position: usize) { + let position_bytes = (position as u64).to_le_bytes(); + buffer[8 .. 16].copy_from_slice(&position_bytes); +} + +fn validate_buffer_len(buffer: &[u8]) -> anyhow::Result { + let buffer_len = buffer.len(); + if buffer_len < MIN_BUFFER_SIZE { + anyhow::bail!( + "Buffer too small: {buffer_len} bytes, but need at least {MIN_BUFFER_SIZE} bytes" + ); + } + Ok(buffer_len) +} + +/// Validate high water mark ratio and calculate the position from buffer length. +fn calculate_high_water_mark( + buffer_len: usize, + high_water_mark_ratio: Option, +) -> anyhow::Result { + let ratio = high_water_mark_ratio.unwrap_or(0.8); + if !(0.0 ..= 1.0).contains(&ratio) { + anyhow::bail!("High water mark ratio must be between 0.0 and 1.0, got: {ratio}"); + } + + #[allow( + clippy::cast_precision_loss, + clippy::cast_possible_truncation, + clippy::cast_sign_loss + )] + let high_water_mark = (buffer_len as f32 * ratio) as usize; + Ok(high_water_mark) +} + +impl<'a> VersionedKVJournal<'a> { + /// Create a new versioned journal using the provided buffer as storage space. + /// + /// # Arguments + /// * `buffer` - The storage buffer + /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 + /// + /// # Errors + /// Returns an error if the buffer is too small or if `high_water_mark_ratio` is invalid. + pub fn new(buffer: &'a mut [u8], high_water_mark_ratio: Option) -> anyhow::Result { + let buffer_len = validate_buffer_len(buffer)?; + let high_water_mark = calculate_high_water_mark(buffer_len, high_water_mark_ratio)?; + + // Write header + let timestamp = current_timestamp()?; + let position = HEADER_SIZE; + + write_position(buffer, position); + write_version(buffer); + buffer[16] = 0; // Reserved byte + + Ok(Self { + position, + buffer, + high_water_mark, + high_water_mark_triggered: false, + last_timestamp: timestamp, + }) + } + + /// Create a new versioned journal with state loaded from the provided buffer. + /// + /// # Arguments + /// * `buffer` - The storage buffer containing existing versioned KV data + /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 + /// + /// # Errors + /// Returns an error if the buffer is invalid, corrupted, or if `high_water_mark_ratio` is + /// invalid. + pub fn from_buffer( + buffer: &'a mut [u8], + high_water_mark_ratio: Option, + ) -> anyhow::Result { + let buffer_len = validate_buffer_len(buffer)?; + let position = read_position(buffer)?; + let high_water_mark = calculate_high_water_mark(buffer_len, high_water_mark_ratio)?; + + // Read version + let version_bytes: [u8; 8] = buffer[0 .. 
8].try_into()?; + let version = u64::from_le_bytes(version_bytes); + + if version != VERSION { + anyhow::bail!("Unsupported version: {version}, expected {VERSION}"); + } + + // Find initialization timestamp and highest timestamp in the journal + let highest_timestamp = Self::find_latest_timestamp(buffer, position); + + Ok(Self { + position, + buffer, + high_water_mark, + high_water_mark_triggered: position >= high_water_mark, + last_timestamp: highest_timestamp, + }) + } + + /// Scan the journal to find the highest timestamp. + fn find_latest_timestamp(buffer: &[u8], position: usize) -> u64 { + let mut cursor = HEADER_SIZE; + let mut highest_timestamp = 0u64; + + while cursor < position { + let remaining = &buffer[cursor .. position]; + + match Frame::<()>::decode_timestamp(remaining) { + Ok((timestamp_micros, consumed)) => { + highest_timestamp = timestamp_micros; + cursor += consumed; + }, + Err(_) => { + // Stop on first decode error (partial frame or corruption) + break; + }, + } + } + + highest_timestamp + } + + /// Get the next monotonically increasing timestamp. + /// + /// This ensures that even if the system clock goes backwards, timestamps remain + /// monotonically increasing by clamping to `last_timestamp` (reusing the same value). + /// This prevents artificial clock skew while maintaining ordering guarantees. + fn next_monotonic_timestamp(&mut self) -> anyhow::Result { + let current = current_timestamp()?; + let monotonic = std::cmp::max(current, self.last_timestamp); + self.last_timestamp = monotonic; + Ok(monotonic) + } + + fn set_position(&mut self, position: usize) { + self.position = position; + write_position(self.buffer, position); + self.check_high_water_mark(); + } + + fn check_high_water_mark(&mut self) { + if self.position >= self.high_water_mark { + self.trigger_high_water(); + } + } + + fn trigger_high_water(&mut self) { + self.high_water_mark_triggered = true; + } + + /// Insert a new entry into the journal with the given payload. + /// Returns the timestamp of the operation. + /// + /// The timestamp is monotonically non-decreasing and serves as the version identifier. + /// If the system clock goes backwards, timestamps are clamped to maintain monotonicity. + pub fn insert_entry(&mut self, message: impl protobuf::MessageFull) -> anyhow::Result { + let timestamp = self.next_monotonic_timestamp()?; + + // Create payload + let frame = Frame::new(timestamp, message); + + // Encode frame + let available_space = &mut self.buffer[self.position ..]; + let encoded_len = frame.encode(available_space)?; + + self.set_position(self.position + encoded_len); + Ok(timestamp) + } + + /// Check if the high water mark has been triggered. + #[must_use] + pub fn is_high_water_mark_triggered(&self) -> bool { + self.high_water_mark_triggered + } + + /// Reconstruct the hashmap with timestamps by replaying all journal entries. + pub fn to_hashmap_with_timestamps(&self) -> anyhow::Result> { + let mut map = AHashMap::new(); + let mut cursor = HEADER_SIZE; + + while cursor < self.position { + let remaining = &self.buffer[cursor .. 
self.position]; + + match Frame::::decode(remaining) { + Ok((frame, consumed)) => { + if let Some(value) = frame.payload.value.into_option() { + map.insert( + frame.payload.key, + TimestampedValue { + value, + timestamp: frame.timestamp_micros, + }, + ); + } else { + map.remove(&frame.payload.key); + } + + cursor += consumed; + }, + Err(_) => { + // Stop on first decode error + break; + }, + } + } + + Ok(map) + } +} + +/// Rotation utilities for creating new journals with compacted state +impl<'a> VersionedKVJournal<'a> { + /// Create a new journal initialized with the compacted state from a snapshot. + /// + /// The new journal will have all current key-value pairs written with their **original + /// timestamps** to preserve historical accuracy. The journal's monotonic timestamp + /// enforcement will respect the highest timestamp in the provided state. + /// + /// # Arguments + /// * `buffer` - The buffer to write the new journal to + /// * `state` - The current key-value state with timestamps to write + /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark + /// + /// # Errors + /// Returns an error if serialization fails or buffer is too small. + pub fn create_rotated_journal( + buffer: &'a mut [u8], + state: &AHashMap, + high_water_mark_ratio: Option, + ) -> anyhow::Result { + // Create a new journal + let mut journal = Self::new(buffer, high_water_mark_ratio)?; + + // Find the maximum timestamp in the state to maintain monotonicity + let max_state_timestamp = state.values().map(|tv| tv.timestamp).max().unwrap_or(0); + + // Write all current state with their original timestamps + for (key, timestamped_value) in state { + // Update last_timestamp to ensure monotonicity is maintained + journal.last_timestamp = std::cmp::max(journal.last_timestamp, timestamped_value.timestamp); + + let frame = Frame::new( + timestamped_value.timestamp, + StateKeyValuePair { + key: key.clone(), + value: Some(timestamped_value.value.clone()).into(), + ..Default::default() + }, + ); + + // Encode frame + let available_space = &mut journal.buffer[journal.position ..]; + let encoded_len = frame.encode(available_space)?; + + journal.set_position(journal.position + encoded_len); + } + + // Ensure last_timestamp reflects the maximum timestamp we've written + journal.last_timestamp = std::cmp::max(journal.last_timestamp, max_state_timestamp); + + Ok(journal) + } +} diff --git a/bd-test-helpers/src/workflow/mod.rs b/bd-test-helpers/src/workflow/mod.rs index edeffdce..b3ad1041 100644 --- a/bd-test-helpers/src/workflow/mod.rs +++ b/bd-test-helpers/src/workflow/mod.rs @@ -407,11 +407,8 @@ pub fn make_emit_sankey_action(id: &str, limit: u32, tags: Vec) -> Action_t } #[must_use] -pub fn make_take_screenshot_action(id: &str) -> Action_type { - Action_type::ActionTakeScreenshot(ActionTakeScreenshotProto { - id: id.to_string(), - ..Default::default() - }) +pub fn make_take_screenshot_action() -> Action_type { + Action_type::ActionTakeScreenshot(ActionTakeScreenshotProto::default()) } #[must_use] diff --git a/bd-workflows/src/config.rs b/bd-workflows/src/config.rs index 7c91db62..315680a9 100644 --- a/bd-workflows/src/config.rs +++ b/bd-workflows/src/config.rs @@ -29,7 +29,6 @@ use workflow::workflow::action::{ Action_type, ActionEmitMetric as ActionEmitMetricProto, ActionEmitSankeyDiagram as ActionEmitSankeyDiagramProto, - ActionTakeScreenshot as ActionTakeScreenshotProto, }; use workflow::workflow::rule::Rule_type; use workflow::workflow::transition_extension::sankey_diagram_value_extraction; @@ 
-594,9 +593,7 @@ impl Action { Action_type::ActionEmitSankeyDiagram(diagram) => { Ok(Self::EmitSankey(ActionEmitSankey::try_from_proto(diagram)?)) }, - Action_type::ActionTakeScreenshot(action) => Ok(Self::TakeScreenshot( - ActionTakeScreenshot::try_from_proto(action), - )), + Action_type::ActionTakeScreenshot(_) => Ok(Self::TakeScreenshot(ActionTakeScreenshot)), Action_type::ActionGenerateLog(action) => Ok(Self::GenerateLog(action)), } } @@ -814,15 +811,7 @@ impl ActionEmitSankey { // #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] -pub struct ActionTakeScreenshot { - id: String, -} - -impl ActionTakeScreenshot { - fn try_from_proto(proto: ActionTakeScreenshotProto) -> Self { - Self { id: proto.id } - } -} +pub struct ActionTakeScreenshot; pub type FieldKey = String; diff --git a/bd-workflows/src/engine_test.rs b/bd-workflows/src/engine_test.rs index 0b803829..61b80c94 100644 --- a/bd-workflows/src/engine_test.rs +++ b/bd-workflows/src/engine_test.rs @@ -3299,7 +3299,7 @@ async fn take_screenshot_action() { let a = state("A").declare_transition_with_actions( &b, rule!(message_equals("foo")), - &[make_take_screenshot_action("screenshot_action_id")], + &[make_take_screenshot_action()], ); let workflow = WorkflowBuilder::new("1", &[&a, &b]).make_config(); From 16cce687eb65b7e574c70ab273c1d81ae129d845 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 11:46:20 -0800 Subject: [PATCH 40/66] fix tests --- Cargo.lock | 3 +++ bd-resilient-kv/Cargo.toml | 3 +++ bd-resilient-kv/src/lib.rs | 8 ++++++++ .../src/tests/versioned_kv_store_test.rs | 15 +++++++++------ .../src/tests/versioned_recovery_test.rs | 6 +++--- bd-resilient-kv/src/versioned_kv_journal/store.rs | 6 ++++++ 6 files changed, 32 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f6be6377..9eb7097d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1106,11 +1106,14 @@ dependencies = [ "anyhow", "bd-bonjson", "bd-client-common", + "bd-log", "bd-proto", "bd-workspace-hack", "bytes", "crc32fast", + "ctor", "flate2", + "log", "memmap2", "protobuf 4.0.0-alpha.0", "tempfile", diff --git a/bd-resilient-kv/Cargo.toml b/bd-resilient-kv/Cargo.toml index 50ec8a51..76d9a62a 100644 --- a/bd-resilient-kv/Cargo.toml +++ b/bd-resilient-kv/Cargo.toml @@ -10,12 +10,15 @@ doctest = false [dev-dependencies] tempfile.workspace = true +ctor.workspace = true [dependencies] ahash.workspace = true +log.workspace = true anyhow.workspace = true bd-bonjson = { path = "../bd-bonjson" } bd-proto = { path = "../bd-proto" } +bd-log = { path = "../bd-log" } bd-client-common = { path = "../bd-client-common" } bd-workspace-hack.workspace = true bytes.workspace = true diff --git a/bd-resilient-kv/src/lib.rs b/bd-resilient-kv/src/lib.rs index bd4e3f79..bd0bca6c 100644 --- a/bd-resilient-kv/src/lib.rs +++ b/bd-resilient-kv/src/lib.rs @@ -14,6 +14,14 @@ clippy::unwrap_used )] +#[cfg(test)] +#[ctor::ctor] +fn test_global_init() { + // TODO(snowp): Ideally we'll depend on bd-test-helpers here, but that would create a cyclic + // dependency. 
+ bd_log::SwapLogger::initialize(); +} + #[cfg(test)] mod tests; diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index f041c783..0198831c 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -103,7 +103,7 @@ async fn test_persistence_and_reload() -> anyhow::Result<()> { assert_eq!( store.get_with_timestamp("key2"), Some(&TimestampedValue { - value: make_string_value("42"), + value: make_string_value("foo"), timestamp: ts2, }) ); @@ -205,7 +205,7 @@ async fn test_rotation_preserves_state() -> anyhow::Result<()> { let pre_rotation_state = store.as_hashmap().clone(); let pre_rotation_ts = store - .get_with_timestamp("key4") + .get_with_timestamp("key1") .map(|tv| tv.timestamp) .unwrap(); @@ -215,14 +215,14 @@ async fn test_rotation_preserves_state() -> anyhow::Result<()> { // Verify state is preserved exactly let post_rotation_state = store.as_hashmap(); assert_eq!(pre_rotation_state, *post_rotation_state); - assert_eq!(store.len(), 4); + assert_eq!(store.len(), 1); // Verify we can continue writing let ts_new = store - .insert("key5".to_string(), make_string_value("value5")) + .insert("key2".to_string(), make_string_value("value2")) .await?; assert!(ts_new >= pre_rotation_ts); - assert_eq!(store.len(), 5); + assert_eq!(store.len(), 2); Ok(()) } @@ -347,7 +347,10 @@ fn make_store_from_snapshot_file( // so we can open them as a store. let data = std::fs::read(snapshot_path)?; let decompressed_snapshot = decompress_zlib(&data)?; - std::fs::write(temp_dir.path().join("snapshot.jrn"), decompressed_snapshot)?; + std::fs::write( + temp_dir.path().join("snapshot.jrn.0"), + decompressed_snapshot, + )?; let store = VersionedKVStore::open_existing(temp_dir.path(), "snapshot", 4096, None)?; diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index 76a7affe..d72198b7 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -176,7 +176,7 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; store - .insert("key".to_string(), make_string_value("1)")) + .insert("key".to_string(), make_string_value("1")) .await?; let ts1 = store .get_with_timestamp("key") @@ -186,7 +186,7 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { std::thread::sleep(std::time::Duration::from_millis(10)); store - .insert("key".to_string(), make_string_value("2)")) + .insert("key".to_string(), make_string_value("2")) .await?; let ts2 = store .get_with_timestamp("key") @@ -196,7 +196,7 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { std::thread::sleep(std::time::Duration::from_millis(10)); store - .insert("key".to_string(), make_string_value("3)")) + .insert("key".to_string(), make_string_value("3")) .await?; let ts3 = store .get_with_timestamp("key") diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs index fa1c84e8..326c3992 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/store.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs @@ -63,6 +63,12 @@ impl VersionedKVStore { let (journal_path, generation) = file_manager::find_active_journal(dir, name); + log::debug!( + "Opening VersionedKVStore journal at {:?} (generation {})", + journal_path, + 
generation + ); + let (journal, data_loss) = if journal_path.exists() { // Try to open existing journal MemMappedVersionedKVJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio) From d29e75f2064635261f05551ffbd969b55fdb131d Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 11:54:04 -0800 Subject: [PATCH 41/66] propagate failures, improve errors --- .../src/tests/versioned_kv_store_test.rs | 4 +- .../src/versioned_kv_journal/framing.rs | 6 +-- .../memmapped_versioned.rs | 6 +-- .../src/versioned_kv_journal/store.rs | 42 ++++++++++++------- .../src/versioned_kv_journal/versioned.rs | 10 ++++- 5 files changed, 42 insertions(+), 26 deletions(-) diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index 0198831c..d1361a00 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -91,7 +91,7 @@ async fn test_persistence_and_reload() -> anyhow::Result<()> { // Reopen and verify data persisted { - let store = VersionedKVStore::open_existing(temp_dir.path(), "test", 4096, None)?; + let (store, _) = VersionedKVStore::open_existing(temp_dir.path(), "test", 4096, None)?; assert_eq!(store.len(), 2); assert_eq!( store.get_with_timestamp("key1"), @@ -352,7 +352,7 @@ fn make_store_from_snapshot_file( decompressed_snapshot, )?; - let store = VersionedKVStore::open_existing(temp_dir.path(), "snapshot", 4096, None)?; + let (store, _) = VersionedKVStore::open_existing(temp_dir.path(), "snapshot", 4096, None)?; Ok(store) } diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing.rs b/bd-resilient-kv/src/versioned_kv_journal/framing.rs index 9e1cebac..3ccf2a13 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/framing.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/framing.rs @@ -127,7 +127,7 @@ impl Frame { let payload_bytes = self .payload .write_to_bytes() - .map_err(|e| anyhow::anyhow!("Failed to serialize payload: {}", e))?; + .map_err(|e| anyhow::anyhow!("Failed to serialize payload: {e}"))?; // Frame length = timestamp + payload + crc let frame_len = timestamp_len + payload_bytes.len() + 4; @@ -200,8 +200,8 @@ impl Frame { anyhow::bail!("CRC mismatch: expected 0x{stored_crc:08x}, got 0x{computed_crc:08x}"); } - let payload = M::parse_from_bytes(&payload) - .map_err(|e| anyhow::anyhow!("Failed to parse payload: {}", e))?; + let payload = + M::parse_from_bytes(&payload).map_err(|e| anyhow::anyhow!("Failed to parse payload: {e}"))?; Ok((Self::new(timestamp_micros, payload), total_len)) } diff --git a/bd-resilient-kv/src/versioned_kv_journal/memmapped_versioned.rs b/bd-resilient-kv/src/versioned_kv_journal/memmapped_versioned.rs index ba06f231..d8f521ee 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/memmapped_versioned.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/memmapped_versioned.rs @@ -124,6 +124,7 @@ impl MemMappedVersionedKVJournal { self.versioned_kv.insert_entry(message) } + /// Check if the high water mark has been triggered. #[must_use] pub fn is_high_water_mark_triggered(&self) -> bool { @@ -131,10 +132,7 @@ impl MemMappedVersionedKVJournal { } /// Reconstruct the hashmap with timestamps by replaying all journal entries. - /// - /// # Errors - /// Returns an error if the buffer cannot be decoded. 
- pub fn as_hashmap_with_timestamps(&self) -> anyhow::Result> { + pub fn as_hashmap_with_timestamps(&self) -> (AHashMap, bool) { self.versioned_kv.to_hashmap_with_timestamps() } diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs index 326c3992..b5678d37 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/store.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs @@ -13,9 +13,10 @@ use ahash::AHashMap; use bd_proto::protos::state::payload::{StateKeyValuePair, StateValue}; use std::path::{Path, PathBuf}; -#[derive(Debug)] +#[derive(Debug, PartialEq, Eq)] pub enum DataLoss { Total, + Partial, None, } @@ -69,7 +70,7 @@ impl VersionedKVStore { generation ); - let (journal, data_loss) = if journal_path.exists() { + let (journal, mut data_loss) = if journal_path.exists() { // Try to open existing journal MemMappedVersionedKVJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio) .map(|j| (j, DataLoss::None)) @@ -91,7 +92,11 @@ impl VersionedKVStore { ) }; - let cached_map = journal.as_hashmap_with_timestamps()?; + let (cached_map, incomplete) = journal.as_hashmap_with_timestamps(); + + if incomplete && data_loss == DataLoss::None { + data_loss = DataLoss::Partial; + } Ok(( Self { @@ -129,24 +134,31 @@ impl VersionedKVStore { name: &str, buffer_size: usize, high_water_mark_ratio: Option, - ) -> anyhow::Result { + ) -> anyhow::Result<(Self, DataLoss)> { let dir = dir_path.as_ref(); let (journal_path, generation) = file_manager::find_active_journal(dir, name); let journal = MemMappedVersionedKVJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio)?; - let cached_map = journal.as_hashmap_with_timestamps()?; - - Ok(Self { - journal, - cached_map, - dir_path: dir.to_path_buf(), - journal_name: name.to_string(), - buffer_size, - high_water_mark_ratio, - current_generation: generation, - }) + let (cached_map, incomplete) = journal.as_hashmap_with_timestamps(); + + Ok(( + Self { + journal, + cached_map, + dir_path: dir.to_path_buf(), + journal_name: name.to_string(), + buffer_size, + high_water_mark_ratio, + current_generation: generation, + }, + if incomplete { + DataLoss::Partial + } else { + DataLoss::None + }, + )) } /// Get a value by key. diff --git a/bd-resilient-kv/src/versioned_kv_journal/versioned.rs b/bd-resilient-kv/src/versioned_kv_journal/versioned.rs index 93f684b6..0a55ce0f 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/versioned.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/versioned.rs @@ -276,10 +276,15 @@ impl<'a> VersionedKVJournal<'a> { } /// Reconstruct the hashmap with timestamps by replaying all journal entries. - pub fn to_hashmap_with_timestamps(&self) -> anyhow::Result> { + /// + /// Returns a tuple containing: + /// - A hashmap of all key-value pairs with their timestamps + /// - A boolean indicating if we failed to decode the entire journal (true if incomplete) + pub fn to_hashmap_with_timestamps(&self) -> (AHashMap, bool) { let mut map = AHashMap::new(); let mut cursor = HEADER_SIZE; + let mut incomplete = false; while cursor < self.position { let remaining = &self.buffer[cursor .. 
self.position]; @@ -301,12 +306,13 @@ impl<'a> VersionedKVJournal<'a> { }, Err(_) => { // Stop on first decode error + incomplete = true; break; }, } } - Ok(map) + (map, incomplete) } } From 8470cf5ac1e0a9426bced2a2e4186cebd17e7d08 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 11:58:23 -0800 Subject: [PATCH 42/66] clippy --- bd-resilient-kv/src/versioned_kv_journal/store.rs | 5 ++--- bd-resilient-kv/src/versioned_kv_journal/versioned.rs | 7 ++++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs index b5678d37..10f87f1d 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/store.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs @@ -65,9 +65,8 @@ impl VersionedKVStore { let (journal_path, generation) = file_manager::find_active_journal(dir, name); log::debug!( - "Opening VersionedKVStore journal at {:?} (generation {})", - journal_path, - generation + "Opening VersionedKVStore journal at {} (generation {generation})", + journal_path.display() ); let (journal, mut data_loss) = if journal_path.exists() { diff --git a/bd-resilient-kv/src/versioned_kv_journal/versioned.rs b/bd-resilient-kv/src/versioned_kv_journal/versioned.rs index 0a55ce0f..37519b3a 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/versioned.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/versioned.rs @@ -304,7 +304,12 @@ impl<'a> VersionedKVJournal<'a> { cursor += consumed; }, - Err(_) => { + Err(e) => { + // TODO(snowp): In this case we may want to reset the position to cursor to avoid + // carrying forward partial/corrupted data. This matters as the recovery will bail on + // corrupt data resulting in further writes also being lost. 
+ log::debug!("Failed to decode frame at offset {cursor}: {e}"); + // Stop on first decode error incomplete = true; break; From c96c20e7553e0cbd753f834e1522690684e66bc1 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 13:20:59 -0800 Subject: [PATCH 43/66] docs fixes, naming --- bd-resilient-kv/AGENTS.md | 8 +- bd-resilient-kv/VERSIONED_FORMAT.md | 251 +++++++++--------- .../memmapped_versioned.rs | 21 +- .../src/versioned_kv_journal/recovery.rs | 5 +- .../src/versioned_kv_journal/store.rs | 10 +- .../src/versioned_kv_journal/versioned.rs | 47 ++-- 6 files changed, 170 insertions(+), 172 deletions(-) diff --git a/bd-resilient-kv/AGENTS.md b/bd-resilient-kv/AGENTS.md index fdfe4ff2..62aac92c 100644 --- a/bd-resilient-kv/AGENTS.md +++ b/bd-resilient-kv/AGENTS.md @@ -55,14 +55,14 @@ The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJ - The async API enables efficient background compression without blocking the main thread **Version Tracking**: -- Every write operation (`insert`, `remove`) returns a monotonically non-decreasing timestamp (nanoseconds since UNIX epoch) +- Every write operation (`insert`, `remove`) returns a monotonically non-decreasing timestamp (microseconds since UNIX epoch) - Timestamps serve as both version identifiers and logical clocks - If the system clock goes backward, timestamps are clamped to the last timestamp to maintain monotonicity - Entries with `Value::Null` are treated as deletions but still timestamped - During rotation, snapshot entries preserve their original timestamps **Timestamp Tracking**: -- Each entry records a timestamp (nanoseconds since UNIX epoch) when the write occurred +- Each entry records a timestamp (microseconds since UNIX epoch) when the write occurred - Timestamps are monotonically non-decreasing, not strictly increasing - Multiple entries may share the same timestamp if the system clock doesn't advance between writes - This is expected behavior, particularly during rapid writes or in test environments @@ -97,12 +97,14 @@ The `VersionedRecovery` utility provides point-in-time recovery by replaying jou - Architecture: Two journals with automatic switching - Compaction: Compresses entire state into inactive journal - No version tracking +- Format: BONJSON-based entries **VersionedKVStore (Single Journal with Rotation)**: - Best for: Audit logs, state history, remote backup - Architecture: Single journal with archived versions - Rotation: Creates new journal with compacted state - Timestamp tracking: Every write returns a timestamp +- Format: Protobuf-based entries (VERSION 3) ### 2. Compaction Efficiency **Key Insight**: Compaction via `reinit_from()` is already maximally efficient. It writes data in the most compact possible serialized form (hashmap → bytes). If even this compact representation exceeds high water marks, then the data volume itself is the limiting factor, not inefficient storage. @@ -255,7 +257,7 @@ fn set_multiple(&mut self, entries: &[(String, Value)]) -> anyhow::Result<()> { ### Impossible Failure Modes (Architectural Guarantees) 1. 
**Timestamp Overflow (VersionedKVStore)** - - **Why Practically Impossible**: Uses u64 for nanosecond timestamps, would require 584+ years to overflow (u64::MAX nanoseconds ≈ year 2554) + - **Why Practically Impossible**: Uses u64 for microsecond timestamps, would require 584,000+ years to overflow (u64::MAX microseconds ≈ year 586,524 CE) - **Implication**: No overflow handling needed in practice ## Common Pitfalls diff --git a/bd-resilient-kv/VERSIONED_FORMAT.md b/bd-resilient-kv/VERSIONED_FORMAT.md index 3e22e657..9e0411a6 100644 --- a/bd-resilient-kv/VERSIONED_FORMAT.md +++ b/bd-resilient-kv/VERSIONED_FORMAT.md @@ -1,8 +1,8 @@ -# Versioned Journal Format Design +# Versioned KV Journal Design ## Overview -This document describes the versioned journal format (VERSION 2) that enables point-in-time state recovery by using timestamps as version identifiers for each write operation. +This document describes the versioned k-v store, which enables point-in-time state recovery by using timestamps as version identifiers. Each write operation is tagged with a timestamp, allowing the system to reconstruct the key-value store state at any historical moment. ## Goals @@ -16,13 +16,15 @@ The versioned journal format uses timestamps as version identifiers for each wri To prevent unbounded growth, the system uses journal rotation: when the active journal reaches a size threshold, it is rotated out and replaced with a new journal containing only the current compacted state. The old journal is archived and compressed. Each archived journal preserves the original write timestamps of all entries, enabling point-in-time recovery across rotation boundaries. -The format is built on top of BONJSON, a binary JSON format that provides efficient serialization while maintaining flexibility for different value types. +The underlying journal uses Protobuf to serialize the payloads that are used to implement the key-value semantics. ## File Types -### 1. Active Journal (`my_store.jrn`) +### 1. Active Journal (`my_store.jrn.0`) The current active journal receiving new writes. Active journals are **not compressed** for performance reasons. +The number at the end of the active journal reflects the generation of the active journal, which allows us to safely rotate the journal while gracefully handling I/O errors. More on this below in the rotation section. + ### 2. Archived Journals (`my_store.jrn.t1699564900000000.zz`, etc.) Previous journals, archived during rotation. Each contains complete state at its creation time plus subsequent incremental writes. The timestamp in the filename indicates the rotation/snapshot timestamp. @@ -32,35 +34,30 @@ Previous journals, archived during rotation. 
Each contains complete state at its ### Binary Structure -The byte-level layout of a VERSION 2 journal file: +The byte-level layout of a VERSION 3 journal file: ``` ┌─────────────────────────────────────────────────────────────────────────┐ │ JOURNAL FILE HEADER │ ├──────────────────┬──────────────────┬───────────────────────────────────┤ -│ Format Version │ Position │ Array Start Type Code │ -│ (u64) │ (u64) │ (u8) │ -│ 8 bytes │ 8 bytes │ 1 byte │ +│ Format Version │ Position │ Reserved │ +│ (u64) │ (u64) │ (u8) │ +│ 8 bytes │ 8 bytes │ 1 byte │ └──────────────────┴──────────────────┴───────────────────────────────────┘ -┌─────────────────────────────────────────────────────────────────────────┐ -│ BONJSON METADATA OBJECT │ -│ (First entry in the array) │ -├─────────────────────────────────────────────────────────────────────────┤ -│ { │ -│ "initialized": 1699564800000000, // u64 timestamp (us) │ -│ "format_version": 2 // Format identifier │ -│ } │ -└─────────────────────────────────────────────────────────────────────────┘ - ┌─────────────────────────────────────────────────────────────────────────┐ │ VERSIONED JOURNAL ENTRY │ -│ (BONJSON Object) │ +│ (Protobuf-encoded StateKeyValuePair) │ ├─────────────────────────────────────────────────────────────────────────┤ -│ { │ -│ "t": 1699564801000000, // Timestamp in us (u64) │ -│ "k": "key1", // Key (string) │ -│ "o": "value1" // Value or null (any type) │ +│ Frame Length (u32) │ 4 bytes │ +│ Timestamp (varint) │ Variable length (microseconds) │ +│ Protobuf Payload │ Variable length │ +│ CRC32 │ 4 bytes │ +│ │ +│ Payload contains: │ +│ StateKeyValuePair { │ +│ key: String, // The key being modified │ +│ value: StateValue // Value for SET, null for DELETE │ │ } │ └─────────────────────────────────────────────────────────────────────────┘ ``` @@ -70,82 +67,87 @@ The byte-level layout of a VERSION 2 journal file: | Field | Offset | Size | Type | Value | Purpose | |-------|--------|------|------|-------|---------| -| Format Version | 0 | 8 bytes | u64 (little-endian) | `2` | Allows future format evolution | +| Format Version | 0 | 8 bytes | u64 (little-endian) | `3` | Allows future format evolution | | Position | 8 | 8 bytes | u64 (little-endian) | Current write position | Tracks where next entry will be written | -| Array Start Type Code | 16 | 1 byte | u8 | BONJSON array start code | Begins the BONJSON array containing all entries | +| Reserved | 16 | 1 byte | u8 | `0` | Reserved for future use | -### Metadata Object (Variable size) +### Entry Framing Format -The first entry in the array is always a metadata object: +Each entry in the journal uses a length-prefixed framing format with CRC32 integrity checking: -```json -{ - "initialized": , // Creation timestamp (microseconds since epoch) - "format_version": 2 // Must be 2 for this format -} -``` +| Component | Size | Type | Description | +|-----------|------|------|-------------| +| Frame Length | 4 bytes | u32 (little-endian) | Total size of timestamp + protobuf payload + CRC32 | +| Timestamp | Variable | varint | Entry timestamp in microseconds (serves as version) | +| Protobuf Payload | Variable | bytes | Serialized StateKeyValuePair message | +| CRC32 | 4 bytes | u32 (little-endian) | Checksum of timestamp + payload | -### Versioned Journal Entry Schema (Variable size) +### Versioned Journal Entry Schema -Each subsequent entry follows this uniform schema: +Each entry in the journal is a `StateKeyValuePair` protobuf message: -```json -{ - "t": , // Timestamp in microseconds (monotonically 
non-decreasing, serves as version) - "k": "", // Key being modified - "o": // Value for SET, null for DELETE +```protobuf +message StateKeyValuePair { + string key = 1; // The key being modified + StateValue value = 2; // Value for SET, null/empty for DELETE +} + +message StateValue { + oneof value { + string string_value = 1; + } } ``` Fields: -- `t` (timestamp): Monotonically non-decreasing timestamp (microseconds since UNIX epoch) that serves as both the write time and version identifier -- `k` (key): The key being written -- `o` (operation): The value (for SET) or null (for DELETE) - -**Type Flexibility**: The `"o"` field can contain any BONJSON-compatible type: -- Primitives (strings, numbers, booleans) -- Complex objects -- Arrays -- `null` (indicates DELETE operation) +- `key`: The key being written (string) +- `value`: The value being set (StateValue) or null/empty for DELETE operations **Timestamp Semantics:** -Timestamps are monotonically non-decreasing, not strictly increasing. If the system clock doesn't advance between writes, multiple entries may share the same timestamp. This is expected behavior and ensures proper ordering without clock skew. +- Timestamps are stored as varints in microseconds since UNIX epoch +- Timestamps are monotonically non-decreasing, not strictly increasing +- If the system clock doesn't advance between writes, multiple entries may share the same timestamp +- This is expected behavior and ensures proper ordering without clock skew + +**Type Flexibility**: The `StateValue` message supports multiple value types: +- Primitives: strings, integers, doubles, booleans +- Complex types: lists, maps +- Binary data: bytes +- null value (indicates DELETE operation) **Size Considerations:** - **Header**: Fixed 17 bytes -- **Metadata**: ~50-70 bytes (depending on timestamp magnitude) - **Per Entry**: Varies based on key and value size - - Minimum: ~30 bytes (short key, small value) - - Typical: 70-470 bytes - - Maximum: Limited by buffer size + - Frame overhead: 8+ bytes (length + CRC) + - Timestamp: 1-10 bytes (varint-encoded) + - Protobuf payload: varies by content + - Typical: 40-200 bytes per entry ## Journal Structure ### Initial Journal -When first created: -```json -{"initialized": 1699564800000000, "format_version": 2} -{"t": 1699564801000000, "k": "key1", "o": "value1"} -{"t": 1699564802000000, "k": "key2", "o": "value2"} +When first created, the journal contains versioned entries: +``` +Entry 0: {"key": "key1", "value": "value1"} @ t=1699564801000000 +Entry 1: {"key": "key2", "value": "value2"} @ t=1699564802000000 ... 
``` ### Rotated Journal After rotation at timestamp 1699564900000000, the new journal contains: -```json -{"initialized": 1699564900000000, "format_version": 2} -{"t": 1699564800123456, "k": "key1", "o": "value1"} // Compacted state (original timestamp preserved) -{"t": 1699564850987654, "k": "key2", "o": "value2"} // Compacted state (original timestamp preserved) -{"t": 1699564875111222, "k": "key3", "o": "value3"} // Compacted state (original timestamp preserved) -{"t": 1699564901000000, "k": "key4", "o": "value4"} // New write after rotation -{"t": 1699564902000000, "k": "key1", "o": "updated1"} // New write after rotation +``` +Entry 0: {"key": "key1", "value": "value1"} @ t=1699564800123456 // Compacted state (original timestamp preserved) +Entry 1: {"key": "key2", "value": "value2"} @ t=1699564850987654 // Compacted state (original timestamp preserved) +Entry 2: {"key": "key3", "value": "value3"} @ t=1699564875111222 // Compacted state (original timestamp preserved) +Entry 3: {"key": "key4", "value": "value4"} @ t=1699564901000000 // New write after rotation +Entry 4: {"key": "key1", "value": "updated1"} @ t=1699564902000000 // New write after rotation ... ``` Key observations: - **Timestamps are preserved**: Each compacted entry retains its original write timestamp (not the rotation time) - - This ensures that not only is the state at any given time recoverably from a given snapshot, we'll also be able to recover how long the current state values have been active for without looking at the previous snapshot. -- These are regular journal entries, not a special format + - This ensures that not only is the state at any given time recoverable from a given snapshot, we'll also be able to recover how long the current state values have been active for without looking at the previous snapshot. +- All entries use the same protobuf framing format - New writes continue with later timestamps - Each rotated journal is self-contained and can be read independently @@ -154,21 +156,21 @@ Key observations: When high water mark is reached: 1. **Determine Rotation Timestamp**: Calculate max timestamp T from the most recent entry -2. **Create New Journal**: Initialize fresh journal file (e.g., `my_store.jrn.tmp`) -3. **Write Compacted State**: Write all current key-value pairs as versioned entries using their original update timestamp -4. **Archive Old Journal**: Rename `my_store.jrn` → `my_store.jrn.old` (temporary) -5. **Activate New Journal**: Rename `my_store.jrn.tmp` → `my_store.jrn` -6. **Compress Archive**: Compress `my_store.jrn.old` → `my_store.jrn.t{T}.zz` using zlib -7. **Delete Temporary**: Remove uncompressed `my_store.jrn.old` +2. **Increment Generation**: Calculate next generation number (e.g., 0 → 1) +3. **Create New Journal**: Initialize fresh journal file with next generation (e.g., `my_store.jrn.1`) +4. **Write Compacted State**: Write all current key-value pairs as versioned entries using their original update timestamp +5. **Activate New Journal**: Switch to new journal in-memory, unmap old journal +6. **Compress Archive** (async, best-effort): Compress old generation → `my_store.jrn.t{T}.zz` using zlib +7. 
**Delete Original** (best-effort): Remove uncompressed old generation file Example: ``` -Before rotation at t=1699564900000000000: - my_store.jrn # Active journal +Before rotation (generation 0): + my_store.jrn.0 # Active journal (generation 0) -After rotation: - my_store.jrn # Active, contains compacted state - my_store.jrn.t1699564900000000000.zz # Compressed archive +After rotation (generation 1): + my_store.jrn.1 # Active, contains compacted state (generation 1) + my_store.jrn.t1699564900000000.zz # Compressed archive of generation 0 ``` ### Rotation Timeline Visualization @@ -176,89 +178,81 @@ After rotation: ``` TIME │ - ├─ t0: Normal Operation + ├─ t0: Normal Operation (Generation 0) │ ┌────────────────────────────────────┐ - │ │ my_store.jrn │ - │ │ ├─ {"t": 1699564795000000000, ...}│ - │ │ ├─ {"t": 1699564796000000000, ...}│ - │ │ ├─ {"t": 1699564797000000000, ...}│ - │ │ ├─ {"t": 1699564798000000000, ...}│ - │ │ └─ {"t": 1699564799000000000, ...}│ + │ │ my_store.jrn.0 │ + │ │ ├─ Entry @ t=1699564795000000 │ + │ │ ├─ Entry @ t=1699564796000000 │ + │ │ ├─ Entry @ t=1699564797000000 │ + │ │ ├─ Entry @ t=1699564798000000 │ + │ │ └─ Entry @ t=1699564799000000 │ │ └────────────────────────────────────┘ │ ├─ t1: High Water Mark Reached │ ┌────────────────────────────────────┐ - │ │ my_store.jrn │ - │ │ └─ {"t": 1699564800000000000, ...}│ ← TRIGGER + │ │ my_store.jrn.0 │ + │ │ └─ Entry @ t=1699564800000000 │ ← TRIGGER │ └────────────────────────────────────┘ - │ max_timestamp = 1699564800000000000 + │ max_timestamp = 1699564800000000 │ ├─ t2: Create New Journal (Step 1) │ ┌────────────────────────────────────┐ - │ │ my_store.jrn │ (old, still active) + │ │ my_store.jrn.0 │ (old, still active - generation 0) │ └────────────────────────────────────┘ │ ┌────────────────────────────────────┐ - │ │ my_store.jrn.tmp │ (new, being written) - │ │ └─ [header + metadata] │ + │ │ my_store.jrn.1 │ (new, being written - generation 1) + │ │ └─ [header] │ │ └────────────────────────────────────┘ │ ├─ t3: Write Compacted State (Step 2) │ ┌────────────────────────────────────┐ - │ │ my_store.jrn │ (old, still active) - │ └────────────────────────────────────┘ - │ ┌────────────────────────────────────┐ - │ │ my_store.jrn.tmp │ (new, being written) - │ │ ├─ {"t": 1699564750000000000, "k": "key1", ...}│ ← Original timestamps - │ │ ├─ {"t": 1699564780000000000, "k": "key2", ...}│ ← Original timestamps - │ │ └─ {"t": 1699564799000000000, "k": "key3", ...}│ ← Original timestamps - │ └────────────────────────────────────┘ - │ - ├─ t4: Archive Old Journal (Step 3) - │ ┌────────────────────────────────────┐ - │ │ my_store.jrn.old │ (renamed, temporary) + │ │ my_store.jrn.0 │ (old, still active) │ └────────────────────────────────────┘ │ ┌────────────────────────────────────┐ - │ │ my_store.jrn.tmp │ (new, ready) + │ │ my_store.jrn.1 │ (new, being written) + │ │ ├─ Entry {"key1", ...} @ t=1699564750000000│ ← Original timestamps + │ │ ├─ Entry {"key2", ...} @ t=1699564780000000│ ← Original timestamps + │ │ └─ Entry {"key3", ...} @ t=1699564799000000│ ← Original timestamps │ └────────────────────────────────────┘ │ - ├─ t5: Activate New Journal (Step 4) + ├─ t4: Activate New Journal (Step 3) │ ┌────────────────────────────────────┐ - │ │ my_store.jrn.old │ (archived, temporary) + │ │ my_store.jrn.0 │ (old, unmapped - ready for archive) │ └────────────────────────────────────┘ │ ┌────────────────────────────────────┐ - │ │ my_store.jrn │ ← NOW ACTIVE! + │ │ my_store.jrn.1 │ ← NOW ACTIVE! 
(generation 1) │ │ (contains compacted state) │ │ └────────────────────────────────────┘ │ - ├─ t6: Compress Archive (Step 5 - Async) + ├─ t5: Compress Archive (Step 4 - Async) │ ┌────────────────────────────────────┐ - │ │ my_store.jrn │ (active, accepting writes) - │ │ └─ {"t": 1699564801000000000, ...}│ ← New writes + │ │ my_store.jrn.1 │ (active, accepting writes) + │ │ └─ Entry @ t=1699564801000000 │ ← New writes │ └────────────────────────────────────┘ │ ┌────────────────────────────────────┐ - │ │ my_store.jrn.old │ (being compressed...) + │ │ my_store.jrn.0 │ (being compressed...) │ └────────────────────────────────────┘ │ ┌────────────────────────────────────┐ - │ │ my_store.jrn.t1699564800000000000.zz│ (compressed output) + │ │ my_store.jrn.t1699564800000000.zz │ (compressed output) │ └────────────────────────────────────┘ │ - ├─ t7: Delete Temporary (Step 6) + ├─ t6: Delete Original (Step 5) │ ┌────────────────────────────────────┐ - │ │ my_store.jrn │ (active) + │ │ my_store.jrn.1 │ (active - generation 1) │ └────────────────────────────────────┘ │ ┌────────────────────────────────────┐ - │ │ my_store.jrn.t1699564800000000000.zz│ (compressed archive) + │ │ my_store.jrn.t1699564800000000.zz │ (compressed archive of gen 0) │ └────────────────────────────────────┘ │ - └─ t8: Continue Normal Operation + └─ t7: Continue Normal Operation ┌────────────────────────────────────┐ - │ my_store.jrn │ - │ ├─ {"t": 1699564801000000000, ...}│ - │ ├─ {"t": 1699564802000000000, ...}│ - │ └─ {"t": 1699564803000000000, ...}│ + │ my_store.jrn.1 │ + │ ├─ Entry @ t=1699564801000000 │ + │ ├─ Entry @ t=1699564802000000 │ + │ └─ Entry @ t=1699564803000000 │ └────────────────────────────────────┘ ┌────────────────────────────────────┐ - │ my_store.jrn.t1699564800000000000.zz│ (ready for upload) + │ my_store.jrn.t1699564800000000.zz │ (ready for upload) └────────────────────────────────────┘ ``` @@ -271,22 +265,25 @@ Archived journals are automatically compressed using zlib (compression level 5) ### Rotation Failure Modes and Recovery +The generation-based rotation process is designed to be resilient: + | Failure Point | State | Recovery | |---------------|-------|----------| -| Before Step 3 | my_store.jrn + my_store.jrn.tmp exist | Delete .tmp, retry | -| After Step 3, before Step 4 | my_store.jrn.old exists, no active journal | Rename .old back to .jrn | -| After Step 4 | New journal active | Continue normally, cleanup may be incomplete | -| During Step 5-6 | Compression fails | .old file may remain, but new journal is valid | +| Before Step 3 | Old generation active, new generation partially written | Delete incomplete new generation, retry | +| During/After Step 5 | New generation active | Continue normally, old generation remains until compressed | +| During Step 6-7 | Compression fails | Uncompressed old generation may remain, but new journal is valid | **What Can Fail:** - I/O errors (disk full, permissions, etc.) - Compression errors during async compression phase +**Key Design Feature**: The rotation switches journals in-memory without file renames, making the critical transition atomic from the application's perspective. Old generation files remain at their original paths until successfully archived. + ## Recovery and Audit ### Current State Recovery -Simply read the active journal (`my_store.jrn`) and replay all entries to reconstruct the current state. +The active journal is identified by finding the highest generation number (e.g., `my_store.jrn.0`, `my_store.jrn.1`, etc.). 
Simply read the active journal and replay all entries to reconstruct the current state. ### Audit and Analysis While `VersionedKVStore` does not support point-in-time recovery through its API, archived journals contain complete historical data. diff --git a/bd-resilient-kv/src/versioned_kv_journal/memmapped_versioned.rs b/bd-resilient-kv/src/versioned_kv_journal/memmapped_versioned.rs index d8f521ee..97cf1e64 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/memmapped_versioned.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/memmapped_versioned.rs @@ -5,9 +5,10 @@ // LICENSE file or at: // https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt -use super::versioned::VersionedKVJournal; +use super::versioned::VersionedJournal; use crate::versioned_kv_journal::TimestampedValue; use ahash::AHashMap; +use bd_proto::protos::state::payload::StateKeyValuePair; use memmap2::{MmapMut, MmapOptions}; use std::fs::OpenOptions; use std::path::Path; @@ -26,7 +27,7 @@ use std::path::Path; pub struct MemMappedVersionedKVJournal { // Note: mmap MUST de-init AFTER versioned_kv because mmap uses it. mmap: MmapMut, - versioned_kv: VersionedKVJournal<'static>, + versioned_kv: VersionedJournal<'static, StateKeyValuePair>, } impl MemMappedVersionedKVJournal { @@ -81,7 +82,7 @@ impl MemMappedVersionedKVJournal { let (mmap, buffer) = unsafe { Self::create_mmap_buffer(file)? }; - let versioned_kv = VersionedKVJournal::new(buffer, high_water_mark_ratio)?; + let versioned_kv = VersionedJournal::new(buffer, high_water_mark_ratio)?; Ok(Self { mmap, versioned_kv }) } @@ -113,7 +114,7 @@ impl MemMappedVersionedKVJournal { let (mmap, buffer) = unsafe { Self::create_mmap_buffer(file)? }; - let versioned_kv = VersionedKVJournal::from_buffer(buffer, high_water_mark_ratio)?; + let versioned_kv = VersionedJournal::from_buffer(buffer, high_water_mark_ratio)?; Ok(Self { mmap, versioned_kv }) } @@ -132,8 +133,16 @@ impl MemMappedVersionedKVJournal { } /// Reconstruct the hashmap with timestamps by replaying all journal entries. - pub fn as_hashmap_with_timestamps(&self) -> (AHashMap, bool) { - self.versioned_kv.to_hashmap_with_timestamps() + pub fn to_hashmap_with_timestamps(&self) -> (AHashMap, bool) { + let mut map = AHashMap::new(); + let complete = self.versioned_kv.read(|payload, timestamp| { + if let Some(value) = payload.value.clone().into_option() { + map.insert(payload.key.clone(), TimestampedValue { value, timestamp }); + } else { + map.remove(&payload.key); + } + }); + (map, !complete) } /// Synchronize changes to disk. diff --git a/bd-resilient-kv/src/versioned_kv_journal/recovery.rs b/bd-resilient-kv/src/versioned_kv_journal/recovery.rs index 7c209b5f..62ee51fa 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/recovery.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/recovery.rs @@ -191,10 +191,7 @@ fn replay_journal_to_timestamp( } if let Some(value) = frame.payload.value.into_option() { - // Insertion - parse the value string back to a Value - // For now, we store everything as strings since that's what the current - // implementation does. In the future, you can parse the value_str to - // reconstruct the original type. 
+ // Insertion - store the protobuf StateValue map.insert( frame.payload.key, TimestampedValue { diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs index 10f87f1d..766a1b02 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/store.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs @@ -8,7 +8,7 @@ use crate::versioned_kv_journal::TimestampedValue; use crate::versioned_kv_journal::file_manager::{self, compress_archived_journal}; use crate::versioned_kv_journal::memmapped_versioned::MemMappedVersionedKVJournal; -use crate::versioned_kv_journal::versioned::VersionedKVJournal; +use crate::versioned_kv_journal::versioned::VersionedJournal; use ahash::AHashMap; use bd_proto::protos::state::payload::{StateKeyValuePair, StateValue}; use std::path::{Path, PathBuf}; @@ -91,7 +91,7 @@ impl VersionedKVStore { ) }; - let (cached_map, incomplete) = journal.as_hashmap_with_timestamps(); + let (cached_map, incomplete) = journal.to_hashmap_with_timestamps(); if incomplete && data_loss == DataLoss::None { data_loss = DataLoss::Partial; @@ -140,7 +140,7 @@ impl VersionedKVStore { let journal = MemMappedVersionedKVJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio)?; - let (cached_map, incomplete) = journal.as_hashmap_with_timestamps(); + let (cached_map, incomplete) = journal.to_hashmap_with_timestamps(); Ok(( Self { @@ -352,8 +352,8 @@ impl VersionedKVStore { // Create in-memory buffer for new journal let mut buffer = vec![0u8; self.buffer_size]; - // Use VersionedKVJournal to create rotated journal in memory - let _rotated = VersionedKVJournal::create_rotated_journal( + // Use VersionedJournal to create rotated journal in memory + let _rotated = VersionedJournal::::create_rotated_journal( &mut buffer, &self.cached_map, self.high_water_mark_ratio, diff --git a/bd-resilient-kv/src/versioned_kv_journal/versioned.rs b/bd-resilient-kv/src/versioned_kv_journal/versioned.rs index 37519b3a..39072ae5 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/versioned.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/versioned.rs @@ -12,7 +12,7 @@ use bd_client_common::error::InvariantError; use bd_proto::protos::state::payload::StateKeyValuePair; use std::time::{SystemTime, UNIX_EPOCH}; -/// Timestamped implementation of a key-value journaling system that uses timestamps +/// Timestamped implementation of a journaling system that uses timestamps /// as the version identifier for point-in-time recovery. /// /// Each write operation is assigned a monotonically non-decreasing timestamp (in microseconds @@ -21,12 +21,13 @@ use std::time::{SystemTime, UNIX_EPOCH}; /// the same timestamp value to maintain ordering guarantees. When timestamps collide, /// journal ordering determines precedence. #[derive(Debug)] -pub struct VersionedKVJournal<'a> { +pub struct VersionedJournal<'a, M> { position: usize, buffer: &'a mut [u8], high_water_mark: usize, high_water_mark_triggered: bool, last_timestamp: u64, // Most recent timestamp written (for monotonic enforcement) + _payload_marker: std::marker::PhantomData, } // Versioned KV files have the following structure: @@ -39,11 +40,13 @@ pub struct VersionedKVJournal<'a> { // | ... | Frame 2 | Framed Entry | // | ... 
| Frame N | Framed Entry | // -// Frame format: [length: u32][timestamp_micros: varint][payload: bytes][crc32: u32] +// Frame format: [length: u32][timestamp_micros: varint][protobuf_payload: bytes][crc32: u32] // -// For testing, payload format is: -// - Set operation: "key:value" (UTF-8 string) -// - Delete operation: "key:null" (UTF-8 string) +// Payload format: +// - Uses `StateKeyValuePair` protobuf messages +// - Contains `key: String` and `value: StateValue` fields +// - Set operation: `value` field is populated with the state value +// - Delete operation: `value` field is null/empty // // # Timestamp Semantics // @@ -132,7 +135,7 @@ fn calculate_high_water_mark( Ok(high_water_mark) } -impl<'a> VersionedKVJournal<'a> { +impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { /// Create a new versioned journal using the provided buffer as storage space. /// /// # Arguments @@ -159,6 +162,7 @@ impl<'a> VersionedKVJournal<'a> { high_water_mark, high_water_mark_triggered: false, last_timestamp: timestamp, + _payload_marker: std::marker::PhantomData, }) } @@ -196,6 +200,7 @@ impl<'a> VersionedKVJournal<'a> { high_water_mark, high_water_mark_triggered: position >= high_water_mark, last_timestamp: highest_timestamp, + _payload_marker: std::marker::PhantomData, }) } @@ -275,32 +280,20 @@ impl<'a> VersionedKVJournal<'a> { self.high_water_mark_triggered } - /// Reconstruct the hashmap with timestamps by replaying all journal entries. + /// Read and process all complete entries in the journal with their timestamps. /// - /// Returns a tuple containing: - /// - A hashmap of all key-value pairs with their timestamps - /// - A boolean indicating if we failed to decode the entire journal (true if incomplete) - pub fn to_hashmap_with_timestamps(&self) -> (AHashMap, bool) { - let mut map = AHashMap::new(); + /// # Returns + /// Returns `false` if there are incomplete entries remaining in the journal after reading. + pub fn read(&self, mut f: impl FnMut(&M, u64)) -> bool { let mut cursor = HEADER_SIZE; let mut incomplete = false; while cursor < self.position { let remaining = &self.buffer[cursor .. self.position]; - match Frame::::decode(remaining) { + match Frame::::decode(remaining) { Ok((frame, consumed)) => { - if let Some(value) = frame.payload.value.into_option() { - map.insert( - frame.payload.key, - TimestampedValue { - value, - timestamp: frame.timestamp_micros, - }, - ); - } else { - map.remove(&frame.payload.key); - } + f(&frame.payload, frame.timestamp_micros); cursor += consumed; }, @@ -317,12 +310,12 @@ impl<'a> VersionedKVJournal<'a> { } } - (map, incomplete) + !incomplete } } /// Rotation utilities for creating new journals with compacted state -impl<'a> VersionedKVJournal<'a> { +impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { /// Create a new journal initialized with the compacted state from a snapshot. 
/// /// The new journal will have all current key-value pairs written with their **original From ac9d000b1059d4f2c4e1b86c0b4fb859da2c2fa5 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 14:05:18 -0800 Subject: [PATCH 44/66] use protobuf for varint encoding --- .../src/versioned_kv_journal/framing.rs | 45 +++++++------------ 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing.rs b/bd-resilient-kv/src/versioned_kv_journal/framing.rs index 3ccf2a13..3bfe5c4a 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/framing.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/framing.rs @@ -25,44 +25,31 @@ const MAX_VARINT_SIZE: usize = 10; /// Encode a u64 as a varint into the buffer. /// Returns the number of bytes written. -pub fn encode_varint(value: u64, buf: &mut [u8]) -> usize { - let mut val = value; - let mut idx = 0; - - #[allow(clippy::cast_possible_truncation)] - while val >= 0x80 { - buf[idx] = (val as u8) | 0x80; - val >>= 7; - idx += 1; - } - #[allow(clippy::cast_possible_truncation)] +pub fn encode_varint(value: u64, mut buf: &mut [u8]) -> usize { + let size = ::protobuf::rt::compute_raw_varint64_size(value) as usize; + debug_assert!(buf.len() >= size, "Buffer too small for varint encoding"); + + if protobuf::CodedOutputStream::new(&mut buf) + .write_raw_varint64(value) + .is_err() { - buf[idx] = val as u8; + // Should never happen as we ensure that there is enough space elsewhere. + return 0; } - idx + 1 + + size } /// Decode a varint from the buffer. /// Returns (value, `bytes_read`) or None if buffer is incomplete/invalid. #[must_use] pub fn decode_varint(buf: &[u8]) -> Option<(u64, usize)> { - let mut value: u64 = 0; - let mut shift = 0; - - for (idx, &byte) in buf.iter().enumerate() { - if idx >= MAX_VARINT_SIZE { - return None; // Varint too long - } - - value |= u64::from(byte & 0x7F) << shift; - shift += 7; - - if byte & 0x80 == 0 { - return Some((value, idx + 1)); - } - } + let value = protobuf::CodedInputStream::from_bytes(buf) + .read_raw_varint64() + .ok()?; - None // Incomplete varint + let bytes_read = ::protobuf::rt::compute_raw_varint64_size(value) as usize; + Some((value, bytes_read)) } /// Frame structure for a journal entry. From fbbd0c8713fc6ed1173e52e4005dba6ee1f86d89 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 14:16:55 -0800 Subject: [PATCH 45/66] simplify --- bd-resilient-kv/src/versioned_kv_journal/framing.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing.rs b/bd-resilient-kv/src/versioned_kv_journal/framing.rs index 3bfe5c4a..c1fc4cf6 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/framing.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/framing.rs @@ -23,10 +23,14 @@ use crc32fast::Hasher; /// Maximum varint size for u64 (10 bytes) const MAX_VARINT_SIZE: usize = 10; +fn varint_size(value: u64) -> usize { + ::protobuf::rt::compute_raw_varint64_size(value) as usize +} + /// Encode a u64 as a varint into the buffer. /// Returns the number of bytes written. 
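// Worked example (editor's illustration; standard protobuf/LEB128 varint encoding):
// 1 -> [0x01], 127 -> [0x7F], 128 -> [0x80, 0x01], 300 -> [0xAC, 0x02], and u64::MAX
// needs the full 10 bytes. Each byte carries 7 bits of the value, least-significant
// group first, with the continuation bit set on every byte except the last.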
pub fn encode_varint(value: u64, mut buf: &mut [u8]) -> usize { - let size = ::protobuf::rt::compute_raw_varint64_size(value) as usize; + let size = varint_size(value); debug_assert!(buf.len() >= size, "Buffer too small for varint encoding"); if protobuf::CodedOutputStream::new(&mut buf) @@ -48,7 +52,7 @@ pub fn decode_varint(buf: &[u8]) -> Option<(u64, usize)> { .read_raw_varint64() .ok()?; - let bytes_read = ::protobuf::rt::compute_raw_varint64_size(value) as usize; + let bytes_read = varint_size(value); Some((value, bytes_read)) } @@ -83,8 +87,7 @@ impl Frame { #[must_use] pub fn encoded_size(&self) -> usize { // Calculate varint size - let mut temp_buf = [0u8; MAX_VARINT_SIZE]; - let varint_size = encode_varint(self.timestamp_micros, &mut temp_buf); + let varint_size = varint_size(self.timestamp_micros); let payload_size: usize = self.payload.compute_size().try_into().unwrap_or(0); // length(4) + timestamp_varint + payload + crc(4) From 651fe5fc27f014de94966a362c460dab7c1eae65 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 14:54:52 -0800 Subject: [PATCH 46/66] clean up framing --- Cargo.lock | 1 - bd-resilient-kv/Cargo.toml | 1 - .../src/versioned_kv_journal/framing.rs | 68 +++++-------------- .../versioned_kv_journal/framing/varint.rs | 36 ++++++++++ .../src/versioned_kv_journal/framing_test.rs | 8 +-- 5 files changed, 56 insertions(+), 58 deletions(-) create mode 100644 bd-resilient-kv/src/versioned_kv_journal/framing/varint.rs diff --git a/Cargo.lock b/Cargo.lock index 9eb7097d..9c194ee0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1117,7 +1117,6 @@ dependencies = [ "memmap2", "protobuf 4.0.0-alpha.0", "tempfile", - "thiserror 2.0.17", "tokio", ] diff --git a/bd-resilient-kv/Cargo.toml b/bd-resilient-kv/Cargo.toml index 76d9a62a..14195590 100644 --- a/bd-resilient-kv/Cargo.toml +++ b/bd-resilient-kv/Cargo.toml @@ -27,4 +27,3 @@ flate2 = { workspace = true, features = ["zlib"] } memmap2.workspace = true tokio.workspace = true protobuf.workspace = true -thiserror.workspace = true diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing.rs b/bd-resilient-kv/src/versioned_kv_journal/framing.rs index c1fc4cf6..351756f3 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/framing.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/framing.rs @@ -20,41 +20,10 @@ use bytes::BufMut; use crc32fast::Hasher; -/// Maximum varint size for u64 (10 bytes) -const MAX_VARINT_SIZE: usize = 10; +mod varint; -fn varint_size(value: u64) -> usize { - ::protobuf::rt::compute_raw_varint64_size(value) as usize -} - -/// Encode a u64 as a varint into the buffer. -/// Returns the number of bytes written. -pub fn encode_varint(value: u64, mut buf: &mut [u8]) -> usize { - let size = varint_size(value); - debug_assert!(buf.len() >= size, "Buffer too small for varint encoding"); - - if protobuf::CodedOutputStream::new(&mut buf) - .write_raw_varint64(value) - .is_err() - { - // Should never happen as we ensure that there is enough space elsewhere. - return 0; - } - - size -} - -/// Decode a varint from the buffer. -/// Returns (value, `bytes_read`) or None if buffer is incomplete/invalid. -#[must_use] -pub fn decode_varint(buf: &[u8]) -> Option<(u64, usize)> { - let value = protobuf::CodedInputStream::from_bytes(buf) - .read_raw_varint64() - .ok()?; - - let bytes_read = varint_size(value); - Some((value, bytes_read)) -} +const CRC_LEN: usize = 4; +const LENGTH_LEN: usize = 4; /// Frame structure for a journal entry. 
#[derive(Debug, Clone, PartialEq, Eq)] @@ -68,7 +37,7 @@ pub struct Frame { impl Frame { pub fn decode_timestamp(buf: &[u8]) -> anyhow::Result<(u64, usize)> { let (timestamp_micros, timestamp_len) = - decode_varint(buf).ok_or_else(|| anyhow::anyhow!("Invalid varint"))?; + varint::decode(buf).ok_or_else(|| anyhow::anyhow!("Invalid varint"))?; Ok((timestamp_micros, timestamp_len)) } } @@ -87,11 +56,11 @@ impl Frame { #[must_use] pub fn encoded_size(&self) -> usize { // Calculate varint size - let varint_size = varint_size(self.timestamp_micros); + let varint_size = varint::compute_size(self.timestamp_micros); let payload_size: usize = self.payload.compute_size().try_into().unwrap_or(0); // length(4) + timestamp_varint + payload + crc(4) - 4 + varint_size + payload_size + 4 + LENGTH_LEN + varint_size + payload_size + CRC_LEN } /// Encode this frame into a buffer. @@ -111,8 +80,8 @@ impl Frame { let mut cursor = buf; // Encode timestamp to calculate frame length - let mut timestamp_buf = [0u8; MAX_VARINT_SIZE]; - let timestamp_len = encode_varint(self.timestamp_micros, &mut timestamp_buf); + let mut timestamp_buf = [0u8; varint::MAX_SIZE]; + let timestamp_len = varint::encode(self.timestamp_micros, &mut timestamp_buf); let payload_bytes = self .payload @@ -120,25 +89,20 @@ impl Frame { .map_err(|e| anyhow::anyhow!("Failed to serialize payload: {e}"))?; // Frame length = timestamp + payload + crc - let frame_len = timestamp_len + payload_bytes.len() + 4; + let frame_len = timestamp_len + payload_bytes.len() + CRC_LEN; #[allow(clippy::cast_possible_truncation)] { cursor.put_u32_le(frame_len as u32); } - // Write timestamp varint cursor.put_slice(×tamp_buf[.. timestamp_len]); - - // Write payload cursor.put_slice(&payload_bytes); - // Calculate CRC over timestamp + payload let mut hasher = Hasher::new(); hasher.update(×tamp_buf[.. timestamp_len]); hasher.update(payload_bytes.as_slice()); let crc = hasher.finalize(); - // Write CRC cursor.put_u32_le(crc); Ok(required_size) @@ -148,15 +112,15 @@ impl Frame { /// /// Returns (Frame, `bytes_consumed`) or error if invalid/incomplete. pub fn decode(buf: &[u8]) -> anyhow::Result<(Self, usize)> { - if buf.len() < 4 { + if buf.len() < LENGTH_LEN { anyhow::bail!("Buffer too small for length field"); } // Read frame length - let frame_len = u32::from_le_bytes(buf[0 .. 4].try_into()?) as usize; + let frame_len = u32::from_le_bytes(buf[0 .. LENGTH_LEN].try_into()?) as usize; // Check if we have the complete frame - let total_len = 4 + frame_len; // length field + frame + let total_len = LENGTH_LEN + frame_len; // length field + frame if buf.len() < total_len { anyhow::bail!( "Incomplete frame: need {} bytes, have {} bytes", @@ -165,18 +129,18 @@ impl Frame { ); } - let frame_data = &buf[4 .. total_len]; + let frame_data = &buf[LENGTH_LEN .. total_len]; // Decode timestamp varint let (timestamp_micros, timestamp_len) = - decode_varint(frame_data).ok_or_else(|| anyhow::anyhow!("Invalid varint"))?; + varint::decode(frame_data).ok_or_else(|| anyhow::anyhow!("Invalid varint"))?; // Extract payload and CRC - if frame_data.len() < timestamp_len + 4 { + if frame_data.len() < timestamp_len + CRC_LEN { anyhow::bail!("Frame too small for CRC"); } - let payload_end = frame_data.len() - 4; + let payload_end = frame_data.len() - CRC_LEN; let payload = frame_data[timestamp_len .. 
payload_end].to_vec(); let stored_crc = u32::from_le_bytes(frame_data[payload_end ..].try_into()?); diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing/varint.rs b/bd-resilient-kv/src/versioned_kv_journal/framing/varint.rs new file mode 100644 index 00000000..8778fed8 --- /dev/null +++ b/bd-resilient-kv/src/versioned_kv_journal/framing/varint.rs @@ -0,0 +1,36 @@ +/// Maximum varint size for u64 (10 bytes) +pub const MAX_SIZE: usize = 10; + +/// Calculate the size of a u64 when encoded as a varint. +pub fn compute_size(value: u64) -> usize { + ::protobuf::rt::compute_raw_varint64_size(value) as usize +} + +/// Encode a u64 as a varint into the buffer. +/// Returns the number of bytes written. +pub fn encode(value: u64, mut buf: &mut [u8]) -> usize { + let size = compute_size(value); + debug_assert!(buf.len() >= size, "Buffer too small for varint encoding"); + + if protobuf::CodedOutputStream::new(&mut buf) + .write_raw_varint64(value) + .is_err() + { + // Should never happen as we ensure that there is enough space elsewhere. + return 0; + } + + size +} + +/// Decode a varint from the buffer. +/// Returns (value, `bytes_read`) or None if buffer is incomplete/invalid. +#[must_use] +pub fn decode(buf: &[u8]) -> Option<(u64, usize)> { + let value = protobuf::CodedInputStream::from_bytes(buf) + .read_raw_varint64() + .ok()?; + + let bytes_read = compute_size(value); + Some((value, bytes_read)) +} diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs b/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs index 6ee5f13c..784b3ae6 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs @@ -50,7 +50,7 @@ fn varint_decoding() { ]; for (buf, expected_value, expected_len) in test_cases { - let (value, len) = decode_varint(&buf).unwrap(); + let (value, len) = decode(&buf).unwrap(); assert_eq!(value, expected_value, "Failed for buffer {buf:?}"); assert_eq!(len, expected_len, "Wrong length for buffer {buf:?}"); } @@ -63,7 +63,7 @@ fn varint_roundtrip() { for value in values { let mut buf = [0u8; MAX_VARINT_SIZE]; let encoded_len = encode_varint(value, &mut buf); - let (decoded_value, decoded_len) = decode_varint(&buf).unwrap(); + let (decoded_value, decoded_len) = decode(&buf).unwrap(); assert_eq!(decoded_value, value, "Roundtrip failed for {value}"); assert_eq!(decoded_len, encoded_len, "Length mismatch for {value}"); @@ -74,7 +74,7 @@ fn varint_roundtrip() { fn varint_incomplete() { // Incomplete varint (has continuation bit but no next byte) let buf = vec![0x80]; - assert!(decode_varint(&buf).is_none()); + assert!(decode(&buf).is_none()); } #[test] @@ -83,7 +83,7 @@ fn varint_too_long() { let buf = vec![ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, ]; - assert!(decode_varint(&buf).is_none()); + assert!(decode(&buf).is_none()); } #[test] From e41363370377fe2c3ce1b7b408640ee4f366fbea Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 15:19:42 -0800 Subject: [PATCH 47/66] renames, move stuff around --- .../src/versioned_kv_journal/framing_test.rs | 16 ++-- .../{versioned.rs => journal.rs} | 63 ++++++---------- ...pped_versioned.rs => memmapped_journal.rs} | 68 ++++++++--------- .../src/versioned_kv_journal/mod.rs | 4 +- .../src/versioned_kv_journal/store.rs | 73 ++++++++++++++----- 5 files changed, 117 insertions(+), 107 deletions(-) rename bd-resilient-kv/src/versioned_kv_journal/{versioned.rs => journal.rs} (91%) rename 
bd-resilient-kv/src/versioned_kv_journal/{memmapped_versioned.rs => memmapped_journal.rs} (75%) diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs b/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs index 784b3ae6..b0651f35 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs @@ -27,8 +27,8 @@ fn varint_encoding() { ]; for (value, expected) in test_cases { - let mut buf = [0u8; MAX_VARINT_SIZE]; - let len = encode_varint(value, &mut buf); + let mut buf = [0u8; varint::MAX_SIZE]; + let len = varint::encode(value, &mut buf); assert_eq!(&buf[.. len], &expected[..], "Failed for value {value}"); } } @@ -50,7 +50,7 @@ fn varint_decoding() { ]; for (buf, expected_value, expected_len) in test_cases { - let (value, len) = decode(&buf).unwrap(); + let (value, len) = varint::decode(&buf).unwrap(); assert_eq!(value, expected_value, "Failed for buffer {buf:?}"); assert_eq!(len, expected_len, "Wrong length for buffer {buf:?}"); } @@ -61,9 +61,9 @@ fn varint_roundtrip() { let values = vec![0, 1, 127, 128, 255, 256, 65535, 65536, 1_000_000, u64::MAX]; for value in values { - let mut buf = [0u8; MAX_VARINT_SIZE]; - let encoded_len = encode_varint(value, &mut buf); - let (decoded_value, decoded_len) = decode(&buf).unwrap(); + let mut buf = [0u8; varint::MAX_SIZE]; + let encoded_len = varint::encode(value, &mut buf); + let (decoded_value, decoded_len) = varint::decode(&buf).unwrap(); assert_eq!(decoded_value, value, "Roundtrip failed for {value}"); assert_eq!(decoded_len, encoded_len, "Length mismatch for {value}"); @@ -74,7 +74,7 @@ fn varint_roundtrip() { fn varint_incomplete() { // Incomplete varint (has continuation bit but no next byte) let buf = vec![0x80]; - assert!(decode(&buf).is_none()); + assert!(varint::decode(&buf).is_none()); } #[test] @@ -83,7 +83,7 @@ fn varint_too_long() { let buf = vec![ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, ]; - assert!(decode(&buf).is_none()); + assert!(varint::decode(&buf).is_none()); } #[test] diff --git a/bd-resilient-kv/src/versioned_kv_journal/versioned.rs b/bd-resilient-kv/src/versioned_kv_journal/journal.rs similarity index 91% rename from bd-resilient-kv/src/versioned_kv_journal/versioned.rs rename to bd-resilient-kv/src/versioned_kv_journal/journal.rs index 39072ae5..babe517a 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/versioned.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/journal.rs @@ -6,10 +6,7 @@ // https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt use super::framing::Frame; -use crate::versioned_kv_journal::TimestampedValue; -use ahash::AHashMap; use bd_client_common::error::InvariantError; -use bd_proto::protos::state::payload::StateKeyValuePair; use std::time::{SystemTime, UNIX_EPOCH}; /// Timestamped implementation of a journaling system that uses timestamps @@ -42,12 +39,6 @@ pub struct VersionedJournal<'a, M> { // // Frame format: [length: u32][timestamp_micros: varint][protobuf_payload: bytes][crc32: u32] // -// Payload format: -// - Uses `StateKeyValuePair` protobuf messages -// - Contains `key: String` and `value: StateValue` fields -// - Set operation: `value` field is populated with the state value -// - Delete operation: `value` field is null/empty -// // # Timestamp Semantics // // Timestamps serve as both version identifiers and logical clocks with monotonic guarantees: @@ -56,27 +47,14 @@ pub struct VersionedJournal<'a, M> { // - When timestamps collide, journal 
ordering determines precedence // - This ensures total ordering while allowing correlation with external timestamped systems -const VERSION: u64 = 3; // The versioned format version (bumped for new framing format) +// The journal format version, incremented on incompatible changes. +const VERSION: u64 = 1; const HEADER_SIZE: usize = 17; // Minimum buffer size for a valid journal const MIN_BUFFER_SIZE: usize = HEADER_SIZE + 4; -/// Get current timestamp in microseconds since UNIX epoch. -fn current_timestamp() -> anyhow::Result { - SystemTime::now() - .duration_since(UNIX_EPOCH) - .map_err(|_| InvariantError::Invariant.into()) - .map(|d| { - #[allow(clippy::cast_possible_truncation)] - { - d.as_micros() as u64 - } - }) -} - - /// Write to the version field of a journal buffer. fn write_version_field(buffer: &mut [u8], version: u64) { let version_bytes = version.to_le_bytes(); @@ -318,41 +296,35 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { /// Create a new journal initialized with the compacted state from a snapshot. /// - /// The new journal will have all current key-value pairs written with their **original + /// The new journal will have all current entries written with their **original /// timestamps** to preserve historical accuracy. The journal's monotonic timestamp /// enforcement will respect the highest timestamp in the provided state. /// /// # Arguments /// * `buffer` - The buffer to write the new journal to - /// * `state` - The current key-value state with timestamps to write + /// * `entries` - Iterator over the entries that should be included in the new journal /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark /// /// # Errors /// Returns an error if serialization fails or buffer is too small. pub fn create_rotated_journal( + &self, buffer: &'a mut [u8], - state: &AHashMap, + entries: impl IntoIterator, high_water_mark_ratio: Option, ) -> anyhow::Result { // Create a new journal let mut journal = Self::new(buffer, high_water_mark_ratio)?; // Find the maximum timestamp in the state to maintain monotonicity - let max_state_timestamp = state.values().map(|tv| tv.timestamp).max().unwrap_or(0); + let max_state_timestamp = self.last_timestamp; // Write all current state with their original timestamps - for (key, timestamped_value) in state { + for (entry, timestamp) in entries.into_iter().map(Into::into) { // Update last_timestamp to ensure monotonicity is maintained - journal.last_timestamp = std::cmp::max(journal.last_timestamp, timestamped_value.timestamp); - - let frame = Frame::new( - timestamped_value.timestamp, - StateKeyValuePair { - key: key.clone(), - value: Some(timestamped_value.value.clone()).into(), - ..Default::default() - }, - ); + journal.last_timestamp = std::cmp::max(journal.last_timestamp, timestamp); + + let frame = Frame::new(timestamp, entry); // Encode frame let available_space = &mut journal.buffer[journal.position ..]; @@ -367,3 +339,16 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { Ok(journal) } } + +/// Get current timestamp in microseconds since UNIX epoch. 
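// Illustrative scale (editor's note): 1_699_564_801_000_000 microseconds corresponds to
// 2023-11-09T21:20:01Z, matching the example entries in VERSIONED_FORMAT.md; a u64 of
// microseconds covers roughly 584,000 years before overflow, so wraparound is not a
// practical concern.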
+fn current_timestamp() -> anyhow::Result { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|_| InvariantError::Invariant.into()) + .map(|d| { + #[allow(clippy::cast_possible_truncation)] + { + d.as_micros() as u64 + } + }) +} diff --git a/bd-resilient-kv/src/versioned_kv_journal/memmapped_versioned.rs b/bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs similarity index 75% rename from bd-resilient-kv/src/versioned_kv_journal/memmapped_versioned.rs rename to bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs index 97cf1e64..f8a90268 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/memmapped_versioned.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs @@ -5,15 +5,13 @@ // LICENSE file or at: // https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt -use super::versioned::VersionedJournal; -use crate::versioned_kv_journal::TimestampedValue; -use ahash::AHashMap; +use super::journal::VersionedJournal; use bd_proto::protos::state::payload::StateKeyValuePair; use memmap2::{MmapMut, MmapOptions}; use std::fs::OpenOptions; use std::path::Path; -/// Memory-mapped implementation of a timestamped key-value journal. +/// Memory-mapped implementation of a timestamped journal. /// /// This implementation uses memory-mapped files to provide persistence while maintaining /// the efficiency of in-memory operations. All changes are automatically synced to disk. @@ -21,16 +19,30 @@ use std::path::Path; /// /// # Safety /// During construction, we unsafely declare mmap's internal buffer as having a static -/// lifetime, but it's actually tied to the lifetime of `versioned_kv`. This works because +/// lifetime, but it's actually tied to the lifetime of `inner`. This works because /// nothing external holds a reference to the buffer. #[derive(Debug)] -pub struct MemMappedVersionedKVJournal { +pub struct MemMappedVersionedJournal { // Note: mmap MUST de-init AFTER versioned_kv because mmap uses it. mmap: MmapMut, - versioned_kv: VersionedJournal<'static, StateKeyValuePair>, + inner: VersionedJournal<'static, StateKeyValuePair>, } -impl MemMappedVersionedKVJournal { +impl std::ops::Deref for MemMappedVersionedJournal { + type Target = VersionedJournal<'static, StateKeyValuePair>; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl std::ops::DerefMut for MemMappedVersionedJournal { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} + +impl MemMappedVersionedJournal { /// Create a memory-mapped buffer from a file and convert it to a static lifetime slice. /// /// # Safety @@ -84,7 +96,10 @@ impl MemMappedVersionedKVJournal { let versioned_kv = VersionedJournal::new(buffer, high_water_mark_ratio)?; - Ok(Self { mmap, versioned_kv }) + Ok(Self { + mmap, + inner: versioned_kv, + }) } /// Create a new memory-mapped versioned KV journal from an existing file. @@ -116,33 +131,10 @@ impl MemMappedVersionedKVJournal { let versioned_kv = VersionedJournal::from_buffer(buffer, high_water_mark_ratio)?; - Ok(Self { mmap, versioned_kv }) - } - - /// Insert a new entry into the journal with the given payload. - /// Returns the timestamp of the operation. - pub fn insert_entry(&mut self, message: impl protobuf::MessageFull) -> anyhow::Result { - self.versioned_kv.insert_entry(message) - } - - - /// Check if the high water mark has been triggered. 
- #[must_use] - pub fn is_high_water_mark_triggered(&self) -> bool { - self.versioned_kv.is_high_water_mark_triggered() - } - - /// Reconstruct the hashmap with timestamps by replaying all journal entries. - pub fn to_hashmap_with_timestamps(&self) -> (AHashMap, bool) { - let mut map = AHashMap::new(); - let complete = self.versioned_kv.read(|payload, timestamp| { - if let Some(value) = payload.value.clone().into_option() { - map.insert(payload.key.clone(), TimestampedValue { value, timestamp }); - } else { - map.remove(&payload.key); - } - }); - (map, !complete) + Ok(Self { + mmap, + inner: versioned_kv, + }) } /// Synchronize changes to disk. @@ -156,7 +148,7 @@ impl MemMappedVersionedKVJournal { /// /// # Errors /// Returns an error if the sync operation fails. - pub fn sync(&self) -> anyhow::Result<()> { - self.mmap.flush().map_err(Into::into) + pub fn sync(journal: &Self) -> anyhow::Result<()> { + journal.mmap.flush().map_err(Into::into) } } diff --git a/bd-resilient-kv/src/versioned_kv_journal/mod.rs b/bd-resilient-kv/src/versioned_kv_journal/mod.rs index 0a62522b..5b415d42 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/mod.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/mod.rs @@ -2,10 +2,10 @@ use bd_proto::protos::state; mod file_manager; mod framing; -mod memmapped_versioned; +mod journal; +mod memmapped_journal; pub mod recovery; pub mod store; -mod versioned; /// Represents a value with its associated timestamp. #[derive(Debug, Clone, PartialEq)] diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs index 766a1b02..cda503eb 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/store.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs @@ -7,8 +7,8 @@ use crate::versioned_kv_journal::TimestampedValue; use crate::versioned_kv_journal::file_manager::{self, compress_archived_journal}; -use crate::versioned_kv_journal::memmapped_versioned::MemMappedVersionedKVJournal; -use crate::versioned_kv_journal::versioned::VersionedJournal; +use crate::versioned_kv_journal::journal::VersionedJournal; +use crate::versioned_kv_journal::memmapped_journal::MemMappedVersionedJournal; use ahash::AHashMap; use bd_proto::protos::state::payload::{StateKeyValuePair, StateValue}; use std::path::{Path, PathBuf}; @@ -33,7 +33,7 @@ pub enum DataLoss { /// For detailed information about timestamp semantics, recovery bucketing, and invariants, /// see the `VERSIONED_FORMAT.md` documentation. pub struct VersionedKVStore { - journal: MemMappedVersionedKVJournal, + journal: MemMappedVersionedJournal, cached_map: AHashMap, dir_path: PathBuf, journal_name: String, @@ -71,14 +71,14 @@ impl VersionedKVStore { let (journal, mut data_loss) = if journal_path.exists() { // Try to open existing journal - MemMappedVersionedKVJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio) + MemMappedVersionedJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio) .map(|j| (j, DataLoss::None)) .or_else(|_| { // TODO(snowp): Distinguish between partial and total data loss. // Data is corrupt or unreadable, create fresh journal Ok::<_, anyhow::Error>(( - MemMappedVersionedKVJournal::new(&journal_path, buffer_size, high_water_mark_ratio)?, + MemMappedVersionedJournal::new(&journal_path, buffer_size, high_water_mark_ratio)?, DataLoss::Total, )) })? 
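As an illustrative aside on the constructor logic in the hunk above: a caller-side sketch of opening the store and reacting to the reported `DataLoss` might look roughly like the following. The constructor arguments, the crate-root re-exports, and the `insert` signature are assumptions for illustration only, not the crate's verified public API.

```rust
use std::path::Path;

use bd_proto::protos::state::payload::StateValue;
// Assumed re-exports from the crate root; adjust paths to the real API.
use bd_resilient_kv::{DataLoss, VersionedKVStore};

async fn open_and_write(dir: &Path, value: StateValue) -> anyhow::Result<()> {
  // Assumed signature: (directory, journal name, buffer size, high-water-mark ratio),
  // returning the store together with a DataLoss indicator as in the hunk above.
  let (mut store, data_loss) = VersionedKVStore::new(dir, "my_store", 64 * 1024, None)?;
  if data_loss != DataLoss::None {
    eprintln!("journal opened with partial or total data loss");
  }

  // Each write returns the monotonically non-decreasing timestamp (microseconds since
  // the UNIX epoch) that doubles as the entry's version; rotation may run inside insert.
  let _version_micros = store.insert("key1".to_string(), value).await?;

  // Flush the memory-mapped journal to disk (blocking; see sync() elsewhere in this diff).
  store.sync()?;
  Ok(())
}
```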
@@ -86,12 +86,12 @@ impl VersionedKVStore { // Create new journal ( - MemMappedVersionedKVJournal::new(&journal_path, buffer_size, high_water_mark_ratio)?, + MemMappedVersionedJournal::new(&journal_path, buffer_size, high_water_mark_ratio)?, DataLoss::None, ) }; - let (cached_map, incomplete) = journal.to_hashmap_with_timestamps(); + let (initial_state, incomplete) = Self::populate_initial_state(&journal); if incomplete && data_loss == DataLoss::None { data_loss = DataLoss::Partial; @@ -100,7 +100,7 @@ impl VersionedKVStore { Ok(( Self { journal, - cached_map, + cached_map: initial_state, dir_path: dir.to_path_buf(), journal_name: name.to_string(), buffer_size, @@ -139,13 +139,14 @@ impl VersionedKVStore { let (journal_path, generation) = file_manager::find_active_journal(dir, name); let journal = - MemMappedVersionedKVJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio)?; - let (cached_map, incomplete) = journal.to_hashmap_with_timestamps(); + MemMappedVersionedJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio)?; + + let (initial_state, incomplete) = Self::populate_initial_state(&journal); Ok(( Self { journal, - cached_map, + cached_map: initial_state, dir_path: dir.to_path_buf(), journal_name: name.to_string(), buffer_size, @@ -205,6 +206,8 @@ impl VersionedKVStore { // Check if rotation is needed if self.journal.is_high_water_mark_triggered() { + // TODO(snowp): Consider doing this out of band to split error handling for the insert and + // rotation. self.rotate_journal().await?; } @@ -273,7 +276,7 @@ impl VersionedKVStore { /// This is a blocking operation that performs synchronous I/O. In async contexts, /// consider wrapping this call with `tokio::task::spawn_blocking`. pub fn sync(&self) -> anyhow::Result<()> { - self.journal.sync() + MemMappedVersionedJournal::sync(&self.journal) } /// Manually trigger journal rotation. @@ -289,6 +292,10 @@ impl VersionedKVStore { .dir_path .join(format!("{}.jrn.{next_generation}", self.journal_name)); + // TODO(snowp): This part needs fuzzing and more safeguards. + // TODO(snowp): Consider doing this out of band to split error handling for the insert and + // rotation. + // Create new journal with compacted state let new_journal = self.create_rotated_journal(&new_journal_path).await?; @@ -339,6 +346,27 @@ impl VersionedKVStore { } } + fn populate_initial_state( + journal: &VersionedJournal<'_, StateKeyValuePair>, + ) -> (AHashMap, bool) { + let mut map = AHashMap::new(); + let incomplete = journal.read(|entry, timestamp| { + if let Some(value) = entry.value.as_ref() { + map.insert( + entry.key.clone(), + TimestampedValue { + value: value.clone(), + timestamp, + }, + ); + } else { + map.remove(&entry.key); + } + }); + + (map, incomplete) + } + /// Create a new rotated journal with compacted state. /// /// Note: Rotation cannot fail due to insufficient buffer space. 
Since rotation creates a new @@ -348,14 +376,23 @@ impl VersionedKVStore { async fn create_rotated_journal( &self, journal_path: &Path, - ) -> anyhow::Result { + ) -> anyhow::Result { // Create in-memory buffer for new journal let mut buffer = vec![0u8; self.buffer_size]; // Use VersionedJournal to create rotated journal in memory - let _rotated = VersionedJournal::::create_rotated_journal( + let _rotated = self.journal.create_rotated_journal( &mut buffer, - &self.cached_map, + self.cached_map.iter().map(|kv| { + ( + StateKeyValuePair { + key: kv.0.clone(), + value: Some(kv.1.value.clone()).into(), + ..Default::default() + }, + kv.1.timestamp, + ) + }), self.high_water_mark_ratio, )?; @@ -363,10 +400,6 @@ impl VersionedKVStore { tokio::fs::write(journal_path, &buffer).await?; // Open as memory-mapped journal - MemMappedVersionedKVJournal::from_file( - journal_path, - self.buffer_size, - self.high_water_mark_ratio, - ) + MemMappedVersionedJournal::from_file(journal_path, self.buffer_size, self.high_water_mark_ratio) } } From 4ee6510c3b3741989512900272a009f1e83b31fe Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 15:25:17 -0800 Subject: [PATCH 48/66] fixes --- bd-resilient-kv/AGENTS.md | 4 ++-- bd-resilient-kv/VERSIONED_FORMAT.md | 4 ++-- bd-resilient-kv/src/versioned_kv_journal/framing/varint.rs | 2 ++ bd-resilient-kv/src/versioned_kv_journal/journal.rs | 2 +- bd-resilient-kv/src/versioned_kv_journal/recovery.rs | 2 +- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/bd-resilient-kv/AGENTS.md b/bd-resilient-kv/AGENTS.md index 62aac92c..989b2b72 100644 --- a/bd-resilient-kv/AGENTS.md +++ b/bd-resilient-kv/AGENTS.md @@ -86,7 +86,7 @@ The `VersionedKVStore` provides a higher-level API built on top of `VersionedKVJ - Recovery transparently decompresses archived journals when needed **Point-in-Time Recovery**: -The `VersionedRecovery` utility provides point-in-time recovery by replaying journal entries up to a target timestamp. It works with raw journal bytes and can reconstruct state at any historical timestamp across rotation boundaries. Recovery is optimized: `recover_current()` only reads the last journal (since rotation writes complete compacted state), while `recover_at_timestamp()` intelligently selects and replays only necessary journals. The `from_files()` constructor is async for efficient file reading. +The `VersionedRecovery` utility provides point-in-time recovery by replaying journal entries up to a target timestamp. It works with raw uncompressed journal bytes and can reconstruct state at any historical timestamp across rotation boundaries. Recovery is optimized: `recover_current()` only reads the last journal (since rotation writes complete compacted state), while `recover_at_timestamp()` intelligently selects and replays only necessary journals. The `new()` constructor accepts a vector of `(&[u8], u64)` tuples (byte slice and snapshot timestamp). Callers must decompress archived journals before passing them to the constructor. ## Critical Design Insights @@ -104,7 +104,7 @@ The `VersionedRecovery` utility provides point-in-time recovery by replaying jou - Architecture: Single journal with archived versions - Rotation: Creates new journal with compacted state - Timestamp tracking: Every write returns a timestamp -- Format: Protobuf-based entries (VERSION 3) +- Format: Protobuf-based entries (VERSION 1) ### 2. Compaction Efficiency **Key Insight**: Compaction via `reinit_from()` is already maximally efficient. 
It writes data in the most compact possible serialized form (hashmap → bytes). If even this compact representation exceeds high water marks, then the data volume itself is the limiting factor, not inefficient storage. diff --git a/bd-resilient-kv/VERSIONED_FORMAT.md b/bd-resilient-kv/VERSIONED_FORMAT.md index 9e0411a6..a2af013a 100644 --- a/bd-resilient-kv/VERSIONED_FORMAT.md +++ b/bd-resilient-kv/VERSIONED_FORMAT.md @@ -34,7 +34,7 @@ Previous journals, archived during rotation. Each contains complete state at its ### Binary Structure -The byte-level layout of a VERSION 3 journal file: +The byte-level layout of a VERSION 1 journal file: ``` ┌─────────────────────────────────────────────────────────────────────────┐ @@ -67,7 +67,7 @@ The byte-level layout of a VERSION 3 journal file: | Field | Offset | Size | Type | Value | Purpose | |-------|--------|------|------|-------|---------| -| Format Version | 0 | 8 bytes | u64 (little-endian) | `3` | Allows future format evolution | +| Format Version | 0 | 8 bytes | u64 (little-endian) | `1` | Allows future format evolution | | Position | 8 | 8 bytes | u64 (little-endian) | Current write position | Tracks where next entry will be written | | Reserved | 16 | 1 byte | u8 | `0` | Reserved for future use | diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing/varint.rs b/bd-resilient-kv/src/versioned_kv_journal/framing/varint.rs index 8778fed8..dfc97fbe 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/framing/varint.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/framing/varint.rs @@ -2,7 +2,9 @@ pub const MAX_SIZE: usize = 10; /// Calculate the size of a u64 when encoded as a varint. +#[allow(clippy::cast_possible_truncation)] pub fn compute_size(value: u64) -> usize { + // Safe cast: varint encoding of u64 is at most 10 bytes, which fits in usize on all platforms ::protobuf::rt::compute_raw_varint64_size(value) as usize } diff --git a/bd-resilient-kv/src/versioned_kv_journal/journal.rs b/bd-resilient-kv/src/versioned_kv_journal/journal.rs index babe517a..560adde0 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/journal.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/journal.rs @@ -320,7 +320,7 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { let max_state_timestamp = self.last_timestamp; // Write all current state with their original timestamps - for (entry, timestamp) in entries.into_iter().map(Into::into) { + for (entry, timestamp) in entries { // Update last_timestamp to ensure monotonicity is maintained journal.last_timestamp = std::cmp::max(journal.last_timestamp, timestamp); diff --git a/bd-resilient-kv/src/versioned_kv_journal/recovery.rs b/bd-resilient-kv/src/versioned_kv_journal/recovery.rs index 62ee51fa..edf39166 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/recovery.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/recovery.rs @@ -36,7 +36,7 @@ impl VersionedRecovery { /// Create a new recovery utility from a list of uncompressed snapshot byte slices. /// /// The snapshots should be provided in chronological order (oldest to newest). - /// Each snapshot must be a valid uncompressed versioned journal (VERSION 3 format). + /// Each snapshot must be a valid uncompressed versioned journal (VERSION 1 format). 
/// /// # Arguments /// From af5c3a1fca47a19b6fe1b17594544f67cc7aee95 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 15:25:37 -0800 Subject: [PATCH 49/66] remove --- bd-resilient-kv/BATCH_OPERATIONS_DESIGN.md | 318 --------------------- 1 file changed, 318 deletions(-) delete mode 100644 bd-resilient-kv/BATCH_OPERATIONS_DESIGN.md diff --git a/bd-resilient-kv/BATCH_OPERATIONS_DESIGN.md b/bd-resilient-kv/BATCH_OPERATIONS_DESIGN.md deleted file mode 100644 index a9d0fc8c..00000000 --- a/bd-resilient-kv/BATCH_OPERATIONS_DESIGN.md +++ /dev/null @@ -1,318 +0,0 @@ -# Batch Operations Design for VersionedKVStore - -## Context - -The versioned KV store needs efficient batch operations for use cases like: -- Clearing all state on app startup (while preserving journal history) -- Bulk insertions/deletions (e.g., clearing keys with specific prefix) -- Reducing overhead from repeated timestamp syscalls and rotation checks - -## Key Insight: Version Transparency - -**Users don't need to worry about internal version splitting.** If a batch operation spans multiple versions internally due to capacity constraints, the returned version still accurately represents the final state after all operations complete. - -### Example: -```rust -// User calls: -let v = store.insert_batch(100_entries).await?; - -// Internally might use: -// v-2: entries 1-50 -// v-1: entries 51-80 -// v: entries 81-100 - -// User receives v, and at version v, all 100 entries are present ✅ -``` - -This works because: -1. Single-threaded context (no concurrent observers) -2. Version number represents "state identifier" not "operation identifier" -3. By the time function returns, all entries are in `cached_map` - -## Critical Problem: Mid-Batch Rotation - -### The Issue - -When rotation happens mid-batch: -- Rotation compacts state from `cached_map` to new journal -- New journal is NOT empty - it contains all current state -- Available space after rotation = buffer_size - compacted_state_size -- **Pre-flight rotation check doesn't guarantee batch will fit** - -### Example Scenario: -``` -cached_map has 500 keys (50KB when serialized) -buffer_size = 64KB -Batch of 200 keys (20KB estimated) - -After rotation: -- New journal has 500 keys = 50KB -- Available space = 64KB - 50KB = 14KB -- Batch needs 20KB -- ❌ Still doesn't fit! -``` - -## Solution: Transparent Chunking - -Let batch operations automatically split when needed, but return single version representing final state. - -### API Design - -```rust -impl VersionedKVStore { - /// Insert multiple key-value pairs efficiently. - /// - /// Returns the final version after all entries have been written. - /// Entries are written in a batch to minimize overhead (shared timestamp, - /// deferred rotation checks). If the batch is too large to fit in available - /// journal space, it will be automatically split across multiple versions. - /// - /// The returned version represents the state after ALL entries have been - /// applied, regardless of whether they were split internally. 
- pub async fn insert_batch( - &mut self, - entries: Vec<(String, Value)> - ) -> anyhow::Result { - if entries.is_empty() { - return Ok(self.current_version()); - } - - let timestamp = current_timestamp()?; - - for (key, value) in entries { - // Check if we have space for this entry - let estimated_size = self.estimate_entry_size(&key, &value); - - if self.journal.remaining_capacity() < estimated_size { - // Rotate to make space - self.rotate_journal().await?; - - // Verify single entry fits after rotation - if self.journal.remaining_capacity() < estimated_size { - anyhow::bail!( - "Single entry too large: key='{}', size={} bytes, available={} bytes", - key, estimated_size, self.journal.remaining_capacity() - ); - } - } - - // Write entry - self.current_version += 1; - let version = self.current_version; - self.journal.write_versioned_entry_with_timestamp( - version, &key, &value, timestamp - )?; - self.cached_map.insert(key, TimestampedValue { value, timestamp }); - } - - // Check if we should rotate after batch - if self.journal.is_high_water_mark_triggered() { - self.rotate_journal().await?; - } - - Ok(self.current_version()) - } - - /// Delete multiple keys efficiently. - pub async fn delete_batch(&mut self, keys: Vec) -> anyhow::Result { - if keys.is_empty() { - return Ok(self.current_version()); - } - - let timestamp = current_timestamp()?; - - for key in keys { - // Deletion entries are small (null value), less likely to overflow - if self.journal.remaining_capacity() < 100 { // Conservative - self.rotate_journal().await?; - } - - self.current_version += 1; - let version = self.current_version; - self.journal.write_versioned_entry_with_timestamp( - version, &key, &Value::Null, timestamp - )?; - self.cached_map.remove(&key); - } - - if self.journal.is_high_water_mark_triggered() { - self.rotate_journal().await?; - } - - Ok(self.current_version()) - } - - /// Clear all keys in the store. - pub async fn clear_all(&mut self) -> anyhow::Result { - let keys: Vec = self.cached_map.keys().cloned().collect(); - self.delete_batch(keys).await - } - - /// Clear all keys with the given prefix. - pub async fn clear_prefix(&mut self, prefix: &str) -> anyhow::Result { - let keys: Vec = self.cached_map - .keys() - .filter(|k| k.starts_with(prefix)) - .cloned() - .collect(); - self.delete_batch(keys).await - } -} -``` - -## Implementation Requirements - -### 1. Add Helper Methods to VersionedKVJournal - -```rust -impl VersionedKVJournal<'_> { - /// Write a versioned entry with a pre-computed timestamp. - /// This enables batch operations to share a single timestamp. - pub fn write_versioned_entry_with_timestamp( - &mut self, - version: u64, - key: &str, - value: &Value, - timestamp: u64, - ) -> anyhow::Result<()> { - // Similar to write_versioned_entry but uses provided timestamp - } - - /// Get remaining capacity in bytes. - pub fn remaining_capacity(&self) -> usize { - self.buffer.len() - self.position - } -} -``` - -### 2. 
Add Estimation Helpers to VersionedKVStore - -```rust -impl VersionedKVStore { - /// Estimate bytes needed for a single entry - fn estimate_entry_size(&self, key: &str, value: &Value) -> usize { - // Conservative estimate: - // - Version: 10 bytes - // - Timestamp: 10 bytes - // - Key: key.len() + 10 bytes overhead - // - Value: estimated_value_size(v) + 10 bytes overhead - // - Object overhead: ~20 bytes - 60 + key.len() + self.estimate_value_size(value) - } - - /// Estimate bytes needed for value - fn estimate_value_size(&self, value: &Value) -> usize { - match value { - Value::Null => 1, - Value::Bool(_) => 1, - Value::Signed(_) | Value::Unsigned(_) => 9, - Value::Float(_) => 9, - Value::String(s) => s.len() + 5, - Value::Array(arr) => arr.iter().map(|v| self.estimate_value_size(v)).sum::() + 5, - Value::Object(obj) => obj.iter().map(|(k, v)| k.len() + self.estimate_value_size(v)).sum::() + 5, - Value::Binary(b) => b.len() + 5, - } - } -} -``` - -## Benefits - -1. **Simple API**: Returns single version representing final state -2. **Transparent**: Automatic capacity handling, no user intervention needed -3. **Efficient**: Shared timestamp within natural chunks -4. **Always succeeds**: No "batch too large" errors (except if single entry > buffer) -5. **Correct semantics**: Final version accurately represents state -6. **Single-threaded safe**: No intermediate state observation issues - -## Trade-offs - -### What We Give Up: -- Guarantee that batch uses single version number internally -- Ability to say "these N operations happened atomically at version V" - -### What We Keep: -- ✅ Final state correctness -- ✅ Simple API (returns single version) -- ✅ Efficient batch processing (50-80% overhead reduction from original analysis) -- ✅ Automatic capacity management -- ✅ Full audit trail (can still see all operations in journal) - -## Use Cases - -### Startup State Reset -```rust -// On app startup -let mut store = VersionedKVStore::open_existing(path, name, size, ratio)?; - -// Log what state existed before (for recovery/debugging) -let previous_version = store.current_version(); -let previous_keys = store.len(); -log::info!("Recovered state from version {previous_version} with {previous_keys} keys"); - -// Clear all state to start fresh session -let clear_version = store.clear_all().await?; -log::info!("Started new session at version {clear_version}"); -``` - -### Prefix-Based Operations -```rust -// Clear all metrics -store.clear_prefix("metrics:").await?; - -// Clear all temporary data -store.clear_prefix("temp:").await?; - -// Clear user-specific data -store.clear_prefix("user:123:").await?; -``` - -### Bulk Updates -```rust -// Bulk insert metrics -let metrics = vec![ - ("metrics:cpu".into(), Value::from(50)), - ("metrics:mem".into(), Value::from(80)), - ("metrics:disk".into(), Value::from(60)), -]; -let version = store.insert_batch(metrics).await?; -``` - -## Testing Requirements - -1. **Basic batch operations** - - Empty batch - - Small batch (fits in one chunk) - - Large batch (requires splitting) - -2. **Capacity edge cases** - - Batch that exactly fills buffer - - Batch that triggers rotation mid-way - - Batch that requires multiple rotations - - Single entry too large (should error) - -3. **State correctness** - - Verify all entries present after batch completes - - Verify version monotonicity - - Verify timestamp consistency within chunks - - Verify rotation doesn't lose data - -4. 
**Prefix operations** - - Clear prefix with no matches - - Clear prefix with some matches - - Clear prefix with all keys matching - - Overlapping prefixes - -5. **Recovery scenarios** - - Replay journal with split batch operations - - Verify final state matches expected - -## Open Questions - -None - design is ready for implementation. - -## References - -- Original optimization analysis: 50-80% overhead reduction from batch operations -- Current single-operation overhead: timestamp syscall + HashMap allocation + encoding + rotation check per operation -- Target: Amortize overhead across batch From 631ddb71a962bad330e4062a8cafccbe0728753b Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 15:25:59 -0800 Subject: [PATCH 50/66] revert --- bd-workflows/src/config.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/bd-workflows/src/config.rs b/bd-workflows/src/config.rs index 896937ec..03b5f44b 100644 --- a/bd-workflows/src/config.rs +++ b/bd-workflows/src/config.rs @@ -806,13 +806,6 @@ impl ActionEmitSankey { } } -// -// ActionTakeScreenshot -// - -#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] -pub struct ActionTakeScreenshot; - pub type FieldKey = String; // From 45fdec68c179704a491847e45768cb091913f30b Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 15:26:35 -0800 Subject: [PATCH 51/66] revert --- tarpaulin-report.html | 737 ------------------------------------------ 1 file changed, 737 deletions(-) delete mode 100644 tarpaulin-report.html diff --git a/tarpaulin-report.html b/tarpaulin-report.html deleted file mode 100644 index 8738aeac..00000000 --- a/tarpaulin-report.html +++ /dev/null @@ -1,737 +0,0 @@ - - - - - - - -
- - - - - - \ No newline at end of file
From 7feb85fa9a671812e7136c7ac57de5080153c780 Mon Sep 17 00:00:00 2001
From: Snow Pettersen
Date: Fri, 7 Nov 2025 15:28:07 -0800
Subject: [PATCH 52/66] comments

---
 bd-resilient-kv/src/versioned_kv_journal/store.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs
index cda503eb..dc3c8392 100644
--- a/bd-resilient-kv/src/versioned_kv_journal/store.rs
+++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs
@@ -47,7 +47,6 @@ impl VersionedKVStore {
   ///
   /// The journal file will be named `.jrn.N` where N is the generation number.
   /// If a journal already exists, it will be loaded with its existing contents.
-  /// Legacy journals (`.jrn`) are automatically migrated to generation 0.
   /// If the specified size is larger than an existing file, it will be resized while preserving
   /// data. If the specified size is smaller and the existing data doesn't fit, a fresh journal
   /// will be created.
@@ -64,6 +63,10 @@ impl VersionedKVStore {
 
     let (journal_path, generation) = file_manager::find_active_journal(dir, name);
 
+    // TODO(snowp): It would be ideal to be able to start with a small buffer and grow it as needed
+    // depending on the particular device's needs. We can embed size information in the journal header
+    // or in the filename itself to facilitate this.
+
     log::debug!(
       "Opening VersionedKVStore journal at {} (generation {generation})",
       journal_path.display()
From 93e5f461960155af71521f6802f91b4591dc593e Mon Sep 17 00:00:00 2001
From: Snow Pettersen
Date: Fri, 7 Nov 2025 17:50:09 -0800
Subject: [PATCH 53/66] add more tests

---
 bd-resilient-kv/src/tests/mod.rs | 1 +
 .../tests/versioned_recovery_error_test.rs | 193 ++++++++++++++++++
 2 files changed, 194 insertions(+)
 create mode 100644 bd-resilient-kv/src/tests/versioned_recovery_error_test.rs

diff --git a/bd-resilient-kv/src/tests/mod.rs b/bd-resilient-kv/src/tests/mod.rs
index 1cb5bfc8..38fa9f7a 100644
--- a/bd-resilient-kv/src/tests/mod.rs
+++ b/bd-resilient-kv/src/tests/mod.rs
@@ -36,6 +36,7 @@ pub mod kv_store_test;
 pub mod kv_test;
 pub mod memmapped_test;
 pub mod versioned_kv_store_test;
+pub mod versioned_recovery_error_test;
 pub mod versioned_recovery_test;
 
 /// Helper function to decompress zlib-compressed data.
diff --git a/bd-resilient-kv/src/tests/versioned_recovery_error_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_error_test.rs
new file mode 100644
index 00000000..e222d3df
--- /dev/null
+++ b/bd-resilient-kv/src/tests/versioned_recovery_error_test.rs
@@ -0,0 +1,193 @@
+// shared-core - bitdrift's common client/server libraries
+// Copyright Bitdrift, Inc. All rights reserved.
+// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + +#![allow(clippy::unwrap_used)] + +use crate::versioned_kv_journal::recovery::VersionedRecovery; + +#[test] +fn test_recovery_buffer_too_small() { + // Create a buffer that's smaller than the header size (17 bytes) + let buffer = vec![0u8; 10]; + + let recovery = VersionedRecovery::new(vec![(&buffer, 1000)]).unwrap(); + let result = recovery.recover_current(); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!(err.to_string().contains("Buffer too small")); +} + +#[test] +fn test_recovery_invalid_position_less_than_header() { + // Create a buffer with a position field that's less than HEADER_SIZE (17) + let mut buffer = vec![0u8; 100]; + + // Write version (1 byte) + buffer[0] = 1; + + // Write position at bytes 8-15 (u64, little-endian) + // Set position to 10, which is less than HEADER_SIZE (17) + let invalid_position: u64 = 10; + buffer[8 .. 16].copy_from_slice(&invalid_position.to_le_bytes()); + + let recovery = VersionedRecovery::new(vec![(&buffer, 1000)]).unwrap(); + let result = recovery.recover_current(); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.to_string().contains("Invalid position"), + "Expected 'Invalid position' error, got: {}", + err + ); +} + +#[test] +fn test_recovery_position_exceeds_buffer_length() { + // Create a buffer where position > buffer.len() + let mut buffer = vec![0u8; 50]; + + // Write version (1 byte) + buffer[0] = 1; + + // Write position at bytes 8-15 (u64, little-endian) + // Set position to 100, which exceeds buffer length of 50 + let invalid_position: u64 = 100; + buffer[8 .. 16].copy_from_slice(&invalid_position.to_le_bytes()); + + let recovery = VersionedRecovery::new(vec![(&buffer, 1000)]).unwrap(); + let result = recovery.recover_current(); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.to_string().contains("Invalid position"), + "Expected 'Invalid position' error, got: {}", + err + ); +} + +#[tokio::test] +async fn test_recovery_with_deletions() -> anyhow::Result<()> { + use crate::tests::decompress_zlib; + use crate::versioned_kv_journal::make_string_value; + use crate::VersionedKVStore; + use tempfile::TempDir; + + let temp_dir = TempDir::new()?; + + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + + // Insert a key + store + .insert("key1".to_string(), make_string_value("value1")) + .await?; + let ts1 = store + .get_with_timestamp("key1") + .map(|tv| tv.timestamp) + .unwrap(); + + std::thread::sleep(std::time::Duration::from_millis(10)); + + // Insert another key + store + .insert("key2".to_string(), make_string_value("value2")) + .await?; + let ts2 = store + .get_with_timestamp("key2") + .map(|tv| tv.timestamp) + .unwrap(); + + std::thread::sleep(std::time::Duration::from_millis(10)); + + // Delete key1 + store.remove("key1").await?; + // ts3 should be greater than ts2 due to the sleep and time passage + // We can't get the exact deletion timestamp, so we use ts2 + margin + let ts3 = ts2 + 20_000; // Add 20ms in microseconds as a safe margin + + store.sync()?; + + // Rotate to create snapshot + store.rotate_journal().await?; + + // Read the snapshot + let mut archived_files: Vec<_> = std::fs::read_dir(temp_dir.path())? 
+ .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.extension()?.to_str()? == "zz" { + Some(path) + } else { + None + } + }) + .collect(); + archived_files.sort(); + + assert_eq!(archived_files.len(), 1, "Expected exactly one archived file"); + let compressed_data = std::fs::read(&archived_files[0])?; + let decompressed_data = decompress_zlib(&compressed_data)?; + + // Use u64::MAX as snapshot timestamp since we're only checking the latest state + let recovery = VersionedRecovery::new(vec![(&decompressed_data, u64::MAX)])?; + + // At ts1, only key1 should exist + let state_ts1 = recovery.recover_at_timestamp(ts1)?; + assert_eq!(state_ts1.len(), 1); + assert!(state_ts1.contains_key("key1")); + + // At ts2, both keys should exist + let state_ts2 = recovery.recover_at_timestamp(ts2)?; + assert_eq!(state_ts2.len(), 2); + assert!(state_ts2.contains_key("key1")); + assert!(state_ts2.contains_key("key2")); + + // At ts3 (after deletion), only key2 should exist + let state_ts3 = recovery.recover_at_timestamp(ts3)?; + assert_eq!(state_ts3.len(), 1); + assert!(!state_ts3.contains_key("key1"), "key1 should be deleted"); + assert!(state_ts3.contains_key("key2")); + + Ok(()) +} + +#[test] +fn test_recovery_with_corrupted_frame() { + // Create a valid header followed by corrupted frame data + let mut buffer = vec![0u8; 100]; + + // Write version (1 byte) + buffer[0] = 1; + + // Write valid position at bytes 8-15 (u64, little-endian) + let position: u64 = 50; + buffer[8 .. 16].copy_from_slice(&position.to_le_bytes()); + + // Fill data area with corrupted/invalid frame data + // (random bytes that won't decode as a valid frame) + buffer[17 .. 50].fill(0xFF); + + // This should not panic, but should handle the corrupted frame gracefully + let result = VersionedRecovery::new(vec![(&buffer, 1000)]); + // The recovery should succeed even with corrupted frames (it will just stop reading) + assert!(result.is_ok()); + + let recovery = result.unwrap(); + let state = recovery.recover_current(); + // Should return empty state since frames are corrupted + assert!(state.is_ok()); +} + +#[tokio::test] +async fn test_recovery_current_with_empty_snapshots() -> anyhow::Result<()> { + // Test recover_current when there are no snapshots at all + let recovery = VersionedRecovery::new(vec![])?; + + let state = recovery.recover_current()?; + assert_eq!(state.len(), 0, "Should return empty state with no snapshots"); + + Ok(()) +} From 54984589218c4dfabad2fbb4e08adecd24b54bfa Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 7 Nov 2025 18:23:41 -0800 Subject: [PATCH 54/66] use time provider --- Cargo.lock | 2 + bd-resilient-kv/Cargo.toml | 2 + .../src/tests/versioned_kv_store_test.rs | 218 ++++++++++-------- .../tests/versioned_recovery_error_test.rs | 46 ++-- .../src/tests/versioned_recovery_test.rs | 53 +++-- .../src/versioned_kv_journal/journal.rs | 49 ++-- .../versioned_kv_journal/memmapped_journal.rs | 9 +- .../src/versioned_kv_journal/store.rs | 64 +++-- 8 files changed, 271 insertions(+), 172 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9c194ee0..8013e89e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1108,6 +1108,7 @@ dependencies = [ "bd-client-common", "bd-log", "bd-proto", + "bd-time", "bd-workspace-hack", "bytes", "crc32fast", @@ -1117,6 +1118,7 @@ dependencies = [ "memmap2", "protobuf 4.0.0-alpha.0", "tempfile", + "time", "tokio", ] diff --git a/bd-resilient-kv/Cargo.toml b/bd-resilient-kv/Cargo.toml index 14195590..3f3bd4af 100644 --- a/bd-resilient-kv/Cargo.toml 
+++ b/bd-resilient-kv/Cargo.toml @@ -11,6 +11,7 @@ doctest = false [dev-dependencies] tempfile.workspace = true ctor.workspace = true +time.workspace = true [dependencies] ahash.workspace = true @@ -18,6 +19,7 @@ log.workspace = true anyhow.workspace = true bd-bonjson = { path = "../bd-bonjson" } bd-proto = { path = "../bd-proto" } +bd-time = { path = "../bd-time" } bd-log = { path = "../bd-log" } bd-client-common = { path = "../bd-client-common" } bd-workspace-hack.workspace = true diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index d1361a00..a42f3b5c 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -11,19 +11,67 @@ use crate::VersionedKVStore; use crate::tests::decompress_zlib; use crate::versioned_kv_journal::{TimestampedValue, make_string_value}; use bd_proto::protos::state::payload::StateValue; +use bd_time::TestTimeProvider; +use std::sync::Arc; use tempfile::TempDir; +use time::ext::NumericalDuration; +use time::macros::datetime; + +struct Setup { + temp_dir: TempDir, + store: VersionedKVStore, + time_provider: Arc, +} + +impl Setup { + fn new() -> anyhow::Result { + let temp_dir = TempDir::new()?; + let time_provider = Arc::new(TestTimeProvider::new(datetime!(2024-01-01 00:00:00 UTC))); + + let (store, _) = + VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone())?; + + Ok(Self { + temp_dir, + store, + time_provider, + }) + } + + fn make_store_from_snapshot_file( + &self, + snapshot_path: &std::path::Path, + ) -> anyhow::Result { + // Decompress the snapshot and journal files into the temp directory + // so we can open them as a store. + let data = std::fs::read(snapshot_path)?; + let decompressed_snapshot = decompress_zlib(&data)?; + std::fs::write( + self.temp_dir.path().join("snapshot.jrn.0"), + decompressed_snapshot, + )?; + + let (store, _) = VersionedKVStore::open_existing( + self.temp_dir.path(), + "snapshot", + 4096, + None, + self.time_provider.clone(), + )?; + + Ok(store) + } +} #[test] fn empty_store() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - let (store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let setup = Setup::new()?; // Should start empty - assert!(store.is_empty()); - assert_eq!(store.len(), 0); + assert!(setup.store.is_empty()); + assert_eq!(setup.store.len(), 0); - assert!(temp_dir.path().join("test.jrn.0").exists()); + assert!(setup.temp_dir.path().join("test.jrn.0").exists()); Ok(()) } @@ -31,9 +79,10 @@ fn empty_store() -> anyhow::Result<()> { #[tokio::test] async fn basic_crud() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; + let time_provider = Arc::new(TestTimeProvider::new(datetime!(2024-01-01 00:00:00 UTC))); - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider)?; // Insert some values let ts1 = store @@ -73,11 +122,13 @@ async fn basic_crud() -> anyhow::Result<()> { #[tokio::test] async fn test_persistence_and_reload() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; + let time_provider = Arc::new(TestTimeProvider::new(datetime!(2024-01-01 00:00:00 UTC))); // Create store and write some data let (ts1, ts2) = { - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = + VersionedKVStore::new(temp_dir.path(), "test", 4096, None, 
time_provider.clone())?; let ts1 = store .insert("key1".to_string(), make_string_value("value1")) .await?; @@ -91,7 +142,8 @@ async fn test_persistence_and_reload() -> anyhow::Result<()> { // Reopen and verify data persisted { - let (store, _) = VersionedKVStore::open_existing(temp_dir.path(), "test", 4096, None)?; + let (store, _) = + VersionedKVStore::open_existing(temp_dir.path(), "test", 4096, None, time_provider)?; assert_eq!(store.len(), 2); assert_eq!( store.get_with_timestamp("key1"), @@ -114,71 +166,72 @@ async fn test_persistence_and_reload() -> anyhow::Result<()> { #[tokio::test] async fn test_null_value_is_deletion() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let mut setup = Setup::new()?; // Insert a value - store + setup + .store .insert("key1".to_string(), make_string_value("value1")) .await?; - assert!(store.contains_key("key1")); + assert!(setup.store.contains_key("key1")); // Insert empty state to delete - store + setup + .store .insert("key1".to_string(), StateValue::default()) .await?; - assert!(!store.contains_key("key1")); - assert_eq!(store.len(), 0); + assert!(!setup.store.contains_key("key1")); + assert_eq!(setup.store.len(), 0); Ok(()) } #[tokio::test] async fn test_manual_rotation() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let mut setup = Setup::new()?; // Insert some data - let _ts1 = store + let _ts1 = setup + .store .insert("key1".to_string(), make_string_value("value1")) .await?; - let ts2 = store + let ts2 = setup + .store .insert("key2".to_string(), make_string_value("value2")) .await?; // Get max timestamp before rotation (this will be used in the archive name) - let rotation_timestamp = store + let rotation_timestamp = setup + .store .get_with_timestamp("key2") .map(|tv| tv.timestamp) .unwrap(); // Manually trigger rotation - store.rotate_journal().await?; + setup.store.rotate_journal().await?; // Verify archived file exists (compressed) - let archived_path = temp_dir + let archived_path = setup + .temp_dir .path() .join(format!("test.jrn.t{}.zz", rotation_timestamp)); assert!(archived_path.exists()); // Verify active journal still works - let ts3 = store + let ts3 = setup + .store .insert("key3".to_string(), make_string_value("value3")) .await?; assert!(ts3 >= ts2); - assert_eq!(store.len(), 3); + assert_eq!(setup.store.len(), 3); // Verify data is intact - assert_eq!(store.get("key1"), Some(&make_string_value("value1"))); - assert_eq!(store.get("key2"), Some(&make_string_value("value2"))); - assert_eq!(store.get("key3"), Some(&make_string_value("value3"))); + assert_eq!(setup.store.get("key1"), Some(&make_string_value("value1"))); + assert_eq!(setup.store.get("key2"), Some(&make_string_value("value2"))); + assert_eq!(setup.store.get("key3"), Some(&make_string_value("value3"))); // Decompress the archive and load it as a Store to verify that it contains the old state. 
- let snapshot_store = make_store_from_snapshot_file(&temp_dir, &archived_path)?; + let snapshot_store = setup.make_store_from_snapshot_file(&archived_path)?; assert_eq!( snapshot_store.get("key1"), Some(&make_string_value("value1")) @@ -194,73 +247,68 @@ async fn test_manual_rotation() -> anyhow::Result<()> { #[tokio::test] async fn test_rotation_preserves_state() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let mut setup = Setup::new()?; - store + setup + .store .insert("key1".to_string(), make_string_value("value1")) .await?; - let pre_rotation_state = store.as_hashmap().clone(); - let pre_rotation_ts = store + let pre_rotation_state = setup.store.as_hashmap().clone(); + let pre_rotation_ts = setup + .store .get_with_timestamp("key1") .map(|tv| tv.timestamp) .unwrap(); // Rotate - store.rotate_journal().await?; + setup.store.rotate_journal().await?; // Verify state is preserved exactly - let post_rotation_state = store.as_hashmap(); + let post_rotation_state = setup.store.as_hashmap(); assert_eq!(pre_rotation_state, *post_rotation_state); - assert_eq!(store.len(), 1); + assert_eq!(setup.store.len(), 1); // Verify we can continue writing - let ts_new = store + let ts_new = setup + .store .insert("key2".to_string(), make_string_value("value2")) .await?; assert!(ts_new >= pre_rotation_ts); - assert_eq!(store.len(), 2); + assert_eq!(setup.store.len(), 2); Ok(()) } #[tokio::test] async fn test_empty_store_operations() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let mut setup = Setup::new()?; // Operations on empty store - assert_eq!(store.get("nonexistent"), None); - assert!(!store.contains_key("nonexistent")); - assert_eq!(store.remove("nonexistent").await?, None); - assert!(store.is_empty()); - assert_eq!(store.len(), 0); + assert_eq!(setup.store.get("nonexistent"), None); + assert!(!setup.store.contains_key("nonexistent")); + assert_eq!(setup.store.remove("nonexistent").await?, None); + assert!(setup.store.is_empty()); + assert_eq!(setup.store.len(), 0); Ok(()) } #[tokio::test] async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - // Create store with small buffer to trigger rotation easily - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 2048, Some(0.5))?; + let mut setup = Setup::new()?; // Insert some keys and capture their timestamps - let ts1 = store + let ts1 = setup + .store .insert("key1".to_string(), make_string_value("value1")) .await?; - // Small sleep to ensure different timestamps - std::thread::sleep(std::time::Duration::from_millis(10)); + // Advance time to ensure different timestamps. + setup.time_provider.advance(10.milliseconds()); - let ts2 = store + let ts2 = setup + .store .insert("key2".to_string(), make_string_value("value2")) .await?; @@ -270,18 +318,21 @@ async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { // Write enough data to trigger rotation for i in 0 .. 
50 { - store + setup + .store .insert(format!("fill{i}"), make_string_value("foo")) .await?; } // Verify that after rotation, the original timestamps are preserved - let ts1_after = store + let ts1_after = setup + .store .get_with_timestamp("key1") .map(|tv| tv.timestamp) .unwrap(); - let ts2_after = store + let ts2_after = setup + .store .get_with_timestamp("key2") .map(|tv| tv.timestamp) .unwrap(); @@ -306,10 +357,7 @@ async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { #[tokio::test] async fn test_multiple_rotations() -> anyhow::Result<()> { - let temp_dir = TempDir::new()?; - - - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let mut setup = Setup::new()?; let mut rotation_timestamps = Vec::new(); @@ -317,18 +365,22 @@ async fn test_multiple_rotations() -> anyhow::Result<()> { for i in 0 .. 3 { let key = format!("key{}", i); let value = make_string_value(&format!("value{}", i)); - store.insert(key.clone(), value).await?; - let timestamp = store + setup.store.insert(key.clone(), value).await?; + let timestamp = setup + .store .get_with_timestamp(&key) .map(|tv| tv.timestamp) .unwrap(); rotation_timestamps.push(timestamp); - store.rotate_journal().await?; + setup.store.rotate_journal().await?; } // Verify all compressed archives exist for timestamp in rotation_timestamps { - let archived_path = temp_dir.path().join(format!("test.jrn.t{}.zz", timestamp)); + let archived_path = setup + .temp_dir + .path() + .join(format!("test.jrn.t{}.zz", timestamp)); assert!( archived_path.exists(), "Compressed archive for timestamp {} should exist", @@ -338,21 +390,3 @@ async fn test_multiple_rotations() -> anyhow::Result<()> { Ok(()) } - -fn make_store_from_snapshot_file( - temp_dir: &TempDir, - snapshot_path: &std::path::Path, -) -> anyhow::Result { - // Decompress the snapshot and journal files into the temp directory - // so we can open them as a store. 
- let data = std::fs::read(snapshot_path)?; - let decompressed_snapshot = decompress_zlib(&data)?; - std::fs::write( - temp_dir.path().join("snapshot.jrn.0"), - decompressed_snapshot, - )?; - - let (store, _) = VersionedKVStore::open_existing(temp_dir.path(), "snapshot", 4096, None)?; - - Ok(store) -} diff --git a/bd-resilient-kv/src/tests/versioned_recovery_error_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_error_test.rs index e222d3df..b6a3bf8a 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_error_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_error_test.rs @@ -7,7 +7,16 @@ #![allow(clippy::unwrap_used)] +use crate::VersionedKVStore; +use crate::tests::decompress_zlib; +use crate::versioned_kv_journal::make_string_value; use crate::versioned_kv_journal::recovery::VersionedRecovery; +use bd_time::TestTimeProvider; +use std::sync::Arc; +use tempfile::TempDir; +use time::ext::NumericalDuration; +use time::macros::datetime; + #[test] fn test_recovery_buffer_too_small() { @@ -71,14 +80,11 @@ fn test_recovery_position_exceeds_buffer_length() { #[tokio::test] async fn test_recovery_with_deletions() -> anyhow::Result<()> { - use crate::tests::decompress_zlib; - use crate::versioned_kv_journal::make_string_value; - use crate::VersionedKVStore; - use tempfile::TempDir; - let temp_dir = TempDir::new()?; + let time_provider = Arc::new(TestTimeProvider::new(datetime!(2024-01-01 00:00:00 UTC))); - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = + VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone())?; // Insert a key store @@ -89,7 +95,7 @@ async fn test_recovery_with_deletions() -> anyhow::Result<()> { .map(|tv| tv.timestamp) .unwrap(); - std::thread::sleep(std::time::Duration::from_millis(10)); + time_provider.advance(10.milliseconds()); // Insert another key store @@ -100,7 +106,7 @@ async fn test_recovery_with_deletions() -> anyhow::Result<()> { .map(|tv| tv.timestamp) .unwrap(); - std::thread::sleep(std::time::Duration::from_millis(10)); + time_provider.advance(10.milliseconds()); // Delete key1 store.remove("key1").await?; @@ -111,24 +117,10 @@ async fn test_recovery_with_deletions() -> anyhow::Result<()> { store.sync()?; // Rotate to create snapshot - store.rotate_journal().await?; + let snapshot = store.rotate_journal().await?; // Read the snapshot - let mut archived_files: Vec<_> = std::fs::read_dir(temp_dir.path())? - .filter_map(|entry| { - let entry = entry.ok()?; - let path = entry.path(); - if path.extension()?.to_str()? 
== "zz" { - Some(path) - } else { - None - } - }) - .collect(); - archived_files.sort(); - - assert_eq!(archived_files.len(), 1, "Expected exactly one archived file"); - let compressed_data = std::fs::read(&archived_files[0])?; + let compressed_data = std::fs::read(&snapshot)?; let decompressed_data = decompress_zlib(&compressed_data)?; // Use u64::MAX as snapshot timestamp since we're only checking the latest state @@ -187,7 +179,11 @@ async fn test_recovery_current_with_empty_snapshots() -> anyhow::Result<()> { let recovery = VersionedRecovery::new(vec![])?; let state = recovery.recover_current()?; - assert_eq!(state.len(), 0, "Should return empty state with no snapshots"); + assert_eq!( + state.len(), + 0, + "Should return empty state with no snapshots" + ); Ok(()) } diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index d72198b7..ec26a382 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -12,7 +12,10 @@ use crate::VersionedKVStore; use crate::tests::decompress_zlib; use crate::versioned_kv_journal::make_string_value; use crate::versioned_kv_journal::recovery::VersionedRecovery; +use std::sync::Arc; use tempfile::TempDir; +use time::ext::NumericalDuration; +use time::macros::datetime; /// Helper function to find archived journal files in a directory. /// Returns sorted paths to all `.zz` compressed journal archives. @@ -58,8 +61,13 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; + let time_provider = Arc::new(bd_time::TestTimeProvider::new(datetime!( + 2024-01-01 00:00:00 UTC + ))); + // Create a store with larger buffer to avoid BufferFull errors during test - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 2048, None)?; + let (mut store, _) = + VersionedKVStore::new(temp_dir.path(), "test", 2048, None, time_provider.clone())?; store .insert("key1".to_string(), make_string_value("value1")) @@ -69,7 +77,7 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { .map(|tv| tv.timestamp) .unwrap(); - std::thread::sleep(std::time::Duration::from_millis(10)); + time_provider.advance(10.milliseconds()); store .insert("key2".to_string(), make_string_value("value2")) @@ -87,7 +95,7 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { .map(|tv| tv.timestamp) .unwrap(); - std::thread::sleep(std::time::Duration::from_millis(10)); + time_provider.advance(10.milliseconds()); // Write more after rotation store @@ -144,10 +152,13 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { #[tokio::test] async fn test_recovery_empty_journal() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; + let time_provider = Arc::new(bd_time::TestTimeProvider::new(datetime!( + 2024-01-01 00:00:00 UTC + ))); // Create an empty store - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider)?; store.sync()?; // Rotate to create snapshot @@ -172,9 +183,13 @@ async fn test_recovery_empty_journal() -> anyhow::Result<()> { #[tokio::test] async fn test_recovery_with_overwrites() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; + let time_provider = Arc::new(bd_time::TestTimeProvider::new(datetime!( + 2024-01-01 00:00:00 UTC + ))); - let (mut store, _) = 
VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = + VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone())?; store .insert("key".to_string(), make_string_value("1")) .await?; @@ -183,7 +198,7 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { .map(|tv| tv.timestamp) .unwrap(); - std::thread::sleep(std::time::Duration::from_millis(10)); + time_provider.advance(10.milliseconds()); store .insert("key".to_string(), make_string_value("2")) @@ -193,7 +208,7 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { .map(|tv| tv.timestamp) .unwrap(); - std::thread::sleep(std::time::Duration::from_millis(10)); + time_provider.advance(10.milliseconds()); store .insert("key".to_string(), make_string_value("3")) @@ -242,9 +257,13 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { #[tokio::test] async fn test_recovery_at_timestamp() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; + let time_provider = Arc::new(bd_time::TestTimeProvider::new(datetime!( + 2024-01-01 00:00:00 UTC + ))); // Create a store and write some timestamped data - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = + VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone())?; store .insert("key1".to_string(), make_string_value("value1")) @@ -254,8 +273,8 @@ async fn test_recovery_at_timestamp() -> anyhow::Result<()> { .map(|tv| tv.timestamp) .unwrap(); - // Small sleep to ensure different timestamps - std::thread::sleep(std::time::Duration::from_millis(10)); + // Advance time to ensure different timestamps + time_provider.advance(10.milliseconds()); store .insert("key2".to_string(), make_string_value("value2")) @@ -265,8 +284,8 @@ async fn test_recovery_at_timestamp() -> anyhow::Result<()> { .map(|tv| tv.timestamp) .unwrap(); - // Small sleep to ensure different timestamps - std::thread::sleep(std::time::Duration::from_millis(10)); + // Advance time again + time_provider.advance(10.milliseconds()); store .insert("key1".to_string(), make_string_value("updated1")) @@ -329,8 +348,12 @@ async fn test_recovery_at_timestamp() -> anyhow::Result<()> { #[tokio::test] async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { let temp_dir = TempDir::new()?; + let time_provider = Arc::new(bd_time::TestTimeProvider::new(datetime!( + 2024-01-01 00:00:00 UTC + ))); - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None)?; + let (mut store, _) = + VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone())?; // Write some data before rotation store @@ -341,7 +364,7 @@ async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { .map(|tv| tv.timestamp) .unwrap(); - std::thread::sleep(std::time::Duration::from_millis(10)); + time_provider.advance(10.milliseconds()); store .insert("key2".to_string(), make_string_value("value2")) @@ -354,7 +377,7 @@ async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { // Rotate journal store.rotate_journal().await?; - std::thread::sleep(std::time::Duration::from_millis(10)); + time_provider.advance(10.milliseconds()); // Write data after rotation store diff --git a/bd-resilient-kv/src/versioned_kv_journal/journal.rs b/bd-resilient-kv/src/versioned_kv_journal/journal.rs index 560adde0..5693fe39 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/journal.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/journal.rs @@ -7,7 +7,8 @@ 
use super::framing::Frame; use bd_client_common::error::InvariantError; -use std::time::{SystemTime, UNIX_EPOCH}; +use bd_time::TimeProvider; +use std::sync::Arc; /// Timestamped implementation of a journaling system that uses timestamps /// as the version identifier for point-in-time recovery. @@ -17,13 +18,13 @@ use std::time::{SystemTime, UNIX_EPOCH}; /// The monotonicity is enforced by clamping: if the system clock goes backwards, we reuse /// the same timestamp value to maintain ordering guarantees. When timestamps collide, /// journal ordering determines precedence. -#[derive(Debug)] pub struct VersionedJournal<'a, M> { position: usize, buffer: &'a mut [u8], high_water_mark: usize, high_water_mark_triggered: bool, last_timestamp: u64, // Most recent timestamp written (for monotonic enforcement) + pub(crate) time_provider: Arc, _payload_marker: std::marker::PhantomData, } @@ -122,12 +123,16 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { /// /// # Errors /// Returns an error if the buffer is too small or if `high_water_mark_ratio` is invalid. - pub fn new(buffer: &'a mut [u8], high_water_mark_ratio: Option) -> anyhow::Result { + pub fn new( + buffer: &'a mut [u8], + high_water_mark_ratio: Option, + time_provider: Arc, + ) -> anyhow::Result { let buffer_len = validate_buffer_len(buffer)?; let high_water_mark = calculate_high_water_mark(buffer_len, high_water_mark_ratio)?; // Write header - let timestamp = current_timestamp()?; + let timestamp = Self::unix_timestamp_micros(time_provider.as_ref())?; let position = HEADER_SIZE; write_position(buffer, position); @@ -140,6 +145,7 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { high_water_mark, high_water_mark_triggered: false, last_timestamp: timestamp, + time_provider, _payload_marker: std::marker::PhantomData, }) } @@ -156,6 +162,7 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { pub fn from_buffer( buffer: &'a mut [u8], high_water_mark_ratio: Option, + time_provider: Arc, ) -> anyhow::Result { let buffer_len = validate_buffer_len(buffer)?; let position = read_position(buffer)?; @@ -178,6 +185,7 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { high_water_mark, high_water_mark_triggered: position >= high_water_mark, last_timestamp: highest_timestamp, + time_provider, _payload_marker: std::marker::PhantomData, }) } @@ -211,7 +219,7 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { /// monotonically increasing by clamping to `last_timestamp` (reusing the same value). /// This prevents artificial clock skew while maintaining ordering guarantees. fn next_monotonic_timestamp(&mut self) -> anyhow::Result { - let current = current_timestamp()?; + let current = self.current_timestamp()?; let monotonic = std::cmp::max(current, self.last_timestamp); self.last_timestamp = monotonic; Ok(monotonic) @@ -290,6 +298,22 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { !incomplete } + + /// Get current timestamp in microseconds since UNIX epoch. 
+ fn current_timestamp(&self) -> std::result::Result { + Self::unix_timestamp_micros(self.time_provider.as_ref()) + } + + fn unix_timestamp_micros( + time_provider: &dyn TimeProvider, + ) -> std::result::Result { + time_provider + .now() + .unix_timestamp_nanos() + .checked_div(1_000) + .and_then(|micros| micros.try_into().ok()) + .ok_or(InvariantError::Invariant) + } } /// Rotation utilities for creating new journals with compacted state @@ -314,7 +338,7 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { high_water_mark_ratio: Option, ) -> anyhow::Result { // Create a new journal - let mut journal = Self::new(buffer, high_water_mark_ratio)?; + let mut journal = Self::new(buffer, high_water_mark_ratio, self.time_provider.clone())?; // Find the maximum timestamp in the state to maintain monotonicity let max_state_timestamp = self.last_timestamp; @@ -339,16 +363,3 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { Ok(journal) } } - -/// Get current timestamp in microseconds since UNIX epoch. -fn current_timestamp() -> anyhow::Result { - SystemTime::now() - .duration_since(UNIX_EPOCH) - .map_err(|_| InvariantError::Invariant.into()) - .map(|d| { - #[allow(clippy::cast_possible_truncation)] - { - d.as_micros() as u64 - } - }) -} diff --git a/bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs b/bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs index f8a90268..a77f9c6c 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs @@ -7,9 +7,11 @@ use super::journal::VersionedJournal; use bd_proto::protos::state::payload::StateKeyValuePair; +use bd_time::TimeProvider; use memmap2::{MmapMut, MmapOptions}; use std::fs::OpenOptions; use std::path::Path; +use std::sync::Arc; /// Memory-mapped implementation of a timestamped journal. /// @@ -21,7 +23,6 @@ use std::path::Path; /// During construction, we unsafely declare mmap's internal buffer as having a static /// lifetime, but it's actually tied to the lifetime of `inner`. This works because /// nothing external holds a reference to the buffer. -#[derive(Debug)] pub struct MemMappedVersionedJournal { // Note: mmap MUST de-init AFTER versioned_kv because mmap uses it. mmap: MmapMut, @@ -79,6 +80,7 @@ impl MemMappedVersionedJournal { file_path: P, size: usize, high_water_mark_ratio: Option, + time_provider: Arc, ) -> anyhow::Result { let file = OpenOptions::new() .read(true) @@ -94,7 +96,7 @@ impl MemMappedVersionedJournal { let (mmap, buffer) = unsafe { Self::create_mmap_buffer(file)? }; - let versioned_kv = VersionedJournal::new(buffer, high_water_mark_ratio)?; + let versioned_kv = VersionedJournal::new(buffer, high_water_mark_ratio, time_provider)?; Ok(Self { mmap, @@ -119,6 +121,7 @@ impl MemMappedVersionedJournal { file_path: P, size: usize, high_water_mark_ratio: Option, + time_provider: Arc, ) -> anyhow::Result { let file = OpenOptions::new().read(true).write(true).open(file_path)?; @@ -129,7 +132,7 @@ impl MemMappedVersionedJournal { let (mmap, buffer) = unsafe { Self::create_mmap_buffer(file)? 
}; - let versioned_kv = VersionedJournal::from_buffer(buffer, high_water_mark_ratio)?; + let versioned_kv = VersionedJournal::from_buffer(buffer, high_water_mark_ratio, time_provider)?; Ok(Self { mmap, diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs index dc3c8392..9350e4d0 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/store.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs @@ -11,7 +11,9 @@ use crate::versioned_kv_journal::journal::VersionedJournal; use crate::versioned_kv_journal::memmapped_journal::MemMappedVersionedJournal; use ahash::AHashMap; use bd_proto::protos::state::payload::{StateKeyValuePair, StateValue}; +use bd_time::TimeProvider; use std::path::{Path, PathBuf}; +use std::sync::Arc; #[derive(Debug, PartialEq, Eq)] pub enum DataLoss { @@ -58,6 +60,7 @@ impl VersionedKVStore { name: &str, buffer_size: usize, high_water_mark_ratio: Option, + time_provider: Arc, ) -> anyhow::Result<(Self, DataLoss)> { let dir = dir_path.as_ref(); @@ -74,22 +77,37 @@ impl VersionedKVStore { let (journal, mut data_loss) = if journal_path.exists() { // Try to open existing journal - MemMappedVersionedJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio) - .map(|j| (j, DataLoss::None)) - .or_else(|_| { - // TODO(snowp): Distinguish between partial and total data loss. - - // Data is corrupt or unreadable, create fresh journal - Ok::<_, anyhow::Error>(( - MemMappedVersionedJournal::new(&journal_path, buffer_size, high_water_mark_ratio)?, - DataLoss::Total, - )) - })? + MemMappedVersionedJournal::from_file( + &journal_path, + buffer_size, + high_water_mark_ratio, + time_provider.clone(), + ) + .map(|j| (j, DataLoss::None)) + .or_else(|_| { + // TODO(snowp): Distinguish between partial and total data loss. + + // Data is corrupt or unreadable, create fresh journal + Ok::<_, anyhow::Error>(( + MemMappedVersionedJournal::new( + &journal_path, + buffer_size, + high_water_mark_ratio, + time_provider, + )?, + DataLoss::Total, + )) + })? } else { // Create new journal ( - MemMappedVersionedJournal::new(&journal_path, buffer_size, high_water_mark_ratio)?, + MemMappedVersionedJournal::new( + &journal_path, + buffer_size, + high_water_mark_ratio, + time_provider, + )?, DataLoss::None, ) }; @@ -136,13 +154,18 @@ impl VersionedKVStore { name: &str, buffer_size: usize, high_water_mark_ratio: Option, + time_provider: Arc, ) -> anyhow::Result<(Self, DataLoss)> { let dir = dir_path.as_ref(); let (journal_path, generation) = file_manager::find_active_journal(dir, name); - let journal = - MemMappedVersionedJournal::from_file(&journal_path, buffer_size, high_water_mark_ratio)?; + let journal = MemMappedVersionedJournal::from_file( + &journal_path, + buffer_size, + high_water_mark_ratio, + time_provider, + )?; let (initial_state, incomplete) = Self::populate_initial_state(&journal); @@ -282,13 +305,13 @@ impl VersionedKVStore { MemMappedVersionedJournal::sync(&self.journal) } - /// Manually trigger journal rotation. + /// Manually trigger journal rotation, returning the path to the new journal file. /// /// This will create a new journal with the current state compacted and archive the old journal. /// The archived journal will be compressed using zlib to reduce storage size. /// Rotation typically happens automatically when the high water mark is reached, but this /// method allows manual control when needed. 
- pub async fn rotate_journal(&mut self) -> anyhow::Result<()> { + pub async fn rotate_journal(&mut self) -> anyhow::Result { // Increment generation counter for new journal let next_generation = self.current_generation + 1; let new_journal_path = self @@ -317,7 +340,7 @@ impl VersionedKVStore { .join(format!("{}.jrn.{old_generation}", self.journal_name)); self.cleanup_archived_journal(&old_journal_path).await; - Ok(()) + Ok(new_journal_path) } /// Clean up after successful rotation (best effort, non-critical). @@ -403,6 +426,11 @@ impl VersionedKVStore { tokio::fs::write(journal_path, &buffer).await?; // Open as memory-mapped journal - MemMappedVersionedJournal::from_file(journal_path, self.buffer_size, self.high_water_mark_ratio) + MemMappedVersionedJournal::from_file( + journal_path, + self.buffer_size, + self.high_water_mark_ratio, + self.journal.time_provider.clone(), + ) } } From 35abc73af822f237c8b6c6433e4416e9dcb5f538 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Sun, 9 Nov 2025 19:21:58 -0800 Subject: [PATCH 55/66] simplify rotation code, use tokio fs where possible --- .../src/tests/versioned_kv_store_test.rs | 36 +++++----- .../tests/versioned_recovery_error_test.rs | 25 ++----- .../src/tests/versioned_recovery_test.rs | 11 +-- .../src/versioned_kv_journal/file_manager.rs | 10 +-- .../src/versioned_kv_journal/store.rs | 71 ++++++++++++++----- 5 files changed, 86 insertions(+), 67 deletions(-) diff --git a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs index a42f3b5c..558e7a8c 100644 --- a/bd-resilient-kv/src/tests/versioned_kv_store_test.rs +++ b/bd-resilient-kv/src/tests/versioned_kv_store_test.rs @@ -24,12 +24,12 @@ struct Setup { } impl Setup { - fn new() -> anyhow::Result { + async fn new() -> anyhow::Result { let temp_dir = TempDir::new()?; let time_provider = Arc::new(TestTimeProvider::new(datetime!(2024-01-01 00:00:00 UTC))); let (store, _) = - VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone())?; + VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone()).await?; Ok(Self { temp_dir, @@ -38,7 +38,7 @@ impl Setup { }) } - fn make_store_from_snapshot_file( + async fn make_store_from_snapshot_file( &self, snapshot_path: &std::path::Path, ) -> anyhow::Result { @@ -57,15 +57,16 @@ impl Setup { 4096, None, self.time_provider.clone(), - )?; + ) + .await?; Ok(store) } } -#[test] -fn empty_store() -> anyhow::Result<()> { - let setup = Setup::new()?; +#[tokio::test] +async fn empty_store() -> anyhow::Result<()> { + let setup = Setup::new().await?; // Should start empty assert!(setup.store.is_empty()); @@ -82,7 +83,8 @@ async fn basic_crud() -> anyhow::Result<()> { let time_provider = Arc::new(TestTimeProvider::new(datetime!(2024-01-01 00:00:00 UTC))); - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider)?; + let (mut store, _) = + VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider).await?; // Insert some values let ts1 = store @@ -128,7 +130,7 @@ async fn test_persistence_and_reload() -> anyhow::Result<()> { // Create store and write some data let (ts1, ts2) = { let (mut store, _) = - VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone())?; + VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone()).await?; let ts1 = store .insert("key1".to_string(), make_string_value("value1")) .await?; @@ -143,7 +145,7 @@ async fn test_persistence_and_reload() -> 
anyhow::Result<()> { // Reopen and verify data persisted { let (store, _) = - VersionedKVStore::open_existing(temp_dir.path(), "test", 4096, None, time_provider)?; + VersionedKVStore::open_existing(temp_dir.path(), "test", 4096, None, time_provider).await?; assert_eq!(store.len(), 2); assert_eq!( store.get_with_timestamp("key1"), @@ -166,7 +168,7 @@ async fn test_persistence_and_reload() -> anyhow::Result<()> { #[tokio::test] async fn test_null_value_is_deletion() -> anyhow::Result<()> { - let mut setup = Setup::new()?; + let mut setup = Setup::new().await?; // Insert a value setup @@ -188,7 +190,7 @@ async fn test_null_value_is_deletion() -> anyhow::Result<()> { #[tokio::test] async fn test_manual_rotation() -> anyhow::Result<()> { - let mut setup = Setup::new()?; + let mut setup = Setup::new().await?; // Insert some data let _ts1 = setup @@ -231,7 +233,7 @@ async fn test_manual_rotation() -> anyhow::Result<()> { assert_eq!(setup.store.get("key3"), Some(&make_string_value("value3"))); // Decompress the archive and load it as a Store to verify that it contains the old state. - let snapshot_store = setup.make_store_from_snapshot_file(&archived_path)?; + let snapshot_store = setup.make_store_from_snapshot_file(&archived_path).await?; assert_eq!( snapshot_store.get("key1"), Some(&make_string_value("value1")) @@ -247,7 +249,7 @@ async fn test_manual_rotation() -> anyhow::Result<()> { #[tokio::test] async fn test_rotation_preserves_state() -> anyhow::Result<()> { - let mut setup = Setup::new()?; + let mut setup = Setup::new().await?; setup .store @@ -282,7 +284,7 @@ async fn test_rotation_preserves_state() -> anyhow::Result<()> { #[tokio::test] async fn test_empty_store_operations() -> anyhow::Result<()> { - let mut setup = Setup::new()?; + let mut setup = Setup::new().await?; // Operations on empty store assert_eq!(setup.store.get("nonexistent"), None); @@ -296,7 +298,7 @@ async fn test_empty_store_operations() -> anyhow::Result<()> { #[tokio::test] async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { - let mut setup = Setup::new()?; + let mut setup = Setup::new().await?; // Insert some keys and capture their timestamps let ts1 = setup @@ -357,7 +359,7 @@ async fn test_timestamp_preservation_during_rotation() -> anyhow::Result<()> { #[tokio::test] async fn test_multiple_rotations() -> anyhow::Result<()> { - let mut setup = Setup::new()?; + let mut setup = Setup::new().await?; let mut rotation_timestamps = Vec::new(); diff --git a/bd-resilient-kv/src/tests/versioned_recovery_error_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_error_test.rs index b6a3bf8a..a9550c04 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_error_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_error_test.rs @@ -84,43 +84,30 @@ async fn test_recovery_with_deletions() -> anyhow::Result<()> { let time_provider = Arc::new(TestTimeProvider::new(datetime!(2024-01-01 00:00:00 UTC))); let (mut store, _) = - VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone())?; + VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone()).await?; - // Insert a key - store + let ts1 = store .insert("key1".to_string(), make_string_value("value1")) .await?; - let ts1 = store - .get_with_timestamp("key1") - .map(|tv| tv.timestamp) - .unwrap(); time_provider.advance(10.milliseconds()); - // Insert another key - store + let ts2 = store .insert("key2".to_string(), make_string_value("value2")) .await?; - let ts2 = store - .get_with_timestamp("key2") - 
.map(|tv| tv.timestamp) - .unwrap(); time_provider.advance(10.milliseconds()); // Delete key1 - store.remove("key1").await?; - // ts3 should be greater than ts2 due to the sleep and time passage - // We can't get the exact deletion timestamp, so we use ts2 + margin - let ts3 = ts2 + 20_000; // Add 20ms in microseconds as a safe margin + let ts3 = store.remove("key1").await?.unwrap(); store.sync()?; // Rotate to create snapshot - let snapshot = store.rotate_journal().await?; + let rotation = store.rotate_journal().await?; // Read the snapshot - let compressed_data = std::fs::read(&snapshot)?; + let compressed_data = std::fs::read(&rotation.snapshot_path)?; let decompressed_data = decompress_zlib(&compressed_data)?; // Use u64::MAX as snapshot timestamp since we're only checking the latest state diff --git a/bd-resilient-kv/src/tests/versioned_recovery_test.rs b/bd-resilient-kv/src/tests/versioned_recovery_test.rs index ec26a382..6805c992 100644 --- a/bd-resilient-kv/src/tests/versioned_recovery_test.rs +++ b/bd-resilient-kv/src/tests/versioned_recovery_test.rs @@ -67,7 +67,7 @@ async fn test_recovery_multiple_journals_with_rotation() -> anyhow::Result<()> { // Create a store with larger buffer to avoid BufferFull errors during test let (mut store, _) = - VersionedKVStore::new(temp_dir.path(), "test", 2048, None, time_provider.clone())?; + VersionedKVStore::new(temp_dir.path(), "test", 2048, None, time_provider.clone()).await?; store .insert("key1".to_string(), make_string_value("value1")) @@ -158,7 +158,8 @@ async fn test_recovery_empty_journal() -> anyhow::Result<()> { // Create an empty store - let (mut store, _) = VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider)?; + let (mut store, _) = + VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider).await?; store.sync()?; // Rotate to create snapshot @@ -189,7 +190,7 @@ async fn test_recovery_with_overwrites() -> anyhow::Result<()> { let (mut store, _) = - VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone())?; + VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone()).await?; store .insert("key".to_string(), make_string_value("1")) .await?; @@ -263,7 +264,7 @@ async fn test_recovery_at_timestamp() -> anyhow::Result<()> { // Create a store and write some timestamped data let (mut store, _) = - VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone())?; + VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone()).await?; store .insert("key1".to_string(), make_string_value("value1")) @@ -353,7 +354,7 @@ async fn test_recovery_at_timestamp_with_rotation() -> anyhow::Result<()> { ))); let (mut store, _) = - VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone())?; + VersionedKVStore::new(temp_dir.path(), "test", 4096, None, time_provider.clone()).await?; // Write some data before rotation store diff --git a/bd-resilient-kv/src/versioned_kv_journal/file_manager.rs b/bd-resilient-kv/src/versioned_kv_journal/file_manager.rs index 591de718..e7ed71aa 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/file_manager.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/file_manager.rs @@ -2,20 +2,16 @@ use std::path::{Path, PathBuf}; /// Find the active journal file by searching for the highest generation number. If we failed /// to read the directory or there are no journal files, we return generation 0. 
-pub fn find_active_journal(dir: &Path, name: &str) -> (PathBuf, u64) {
+pub async fn find_active_journal(dir: &Path, name: &str) -> (PathBuf, u64) {
   // Search for generation-based journals
   let pattern = format!("{name}.jrn.");
   let mut max_gen = 0u64;
 
-  let Ok(entries) = std::fs::read_dir(dir) else {
+  let Ok(mut entries) = tokio::fs::read_dir(dir).await else {
     return (dir.join(format!("{name}.jrn.{max_gen}")), max_gen);
   };
 
-  for entry in entries {
-    let Ok(entry) = entry else {
-      continue;
-    };
-
+  while let Ok(Some(entry)) = entries.next_entry().await {
     let filename = entry.file_name();
     let filename_str = filename.to_string_lossy();
diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs
index 9350e4d0..d4b4734a 100644
--- a/bd-resilient-kv/src/versioned_kv_journal/store.rs
+++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs
@@ -55,7 +55,7 @@ impl VersionedKVStore {
   ///
   /// # Errors
   /// Returns an error if we failed to create or open the journal file.
-  pub fn new<P: AsRef<Path>>(
+  pub async fn new<P: AsRef<Path>>(
     dir_path: P,
     name: &str,
     buffer_size: usize,
@@ -64,7 +64,7 @@ impl VersionedKVStore {
   ) -> anyhow::Result<(Self, DataLoss)> {
     let dir = dir_path.as_ref();
 
-    let (journal_path, generation) = file_manager::find_active_journal(dir, name);
+    let (journal_path, generation) = file_manager::find_active_journal(dir, name).await;
 
     // TODO(snowp): It would be ideal to be able to start with a small buffer and grow it as needed
     // depending on the particular device need. We can embed size information in the journal header
@@ -149,7 +149,7 @@ impl VersionedKVStore {
   ///   - The journal file cannot be opened
   ///   - The journal file contains invalid data
   ///   - Initialization fails
-  pub fn open_existing<P: AsRef<Path>>(
+  pub async fn open_existing<P: AsRef<Path>>(
     dir_path: P,
     name: &str,
     buffer_size: usize,
@@ -158,7 +158,7 @@ impl VersionedKVStore {
   ) -> anyhow::Result<(Self, DataLoss)> {
     let dir = dir_path.as_ref();
 
-    let (journal_path, generation) = file_manager::find_active_journal(dir, name);
+    let (journal_path, generation) = file_manager::find_active_journal(dir, name).await;
 
     let journal = MemMappedVersionedJournal::from_file(
       &journal_path,
@@ -304,14 +304,24 @@ impl VersionedKVStore {
   pub fn sync(&self) -> anyhow::Result<()> {
     MemMappedVersionedJournal::sync(&self.journal)
   }
+}
+
+
+/// Information about a journal rotation. This is used by test code to verify rotation results.
+pub struct Rotation {
+  pub new_journal_path: PathBuf,
+  pub old_journal_path: PathBuf,
+  pub snapshot_path: PathBuf,
+}
 
+impl VersionedKVStore {
   /// Manually trigger journal rotation, returning the path to the new journal file.
   ///
   /// This will create a new journal with the current state compacted and archive the old journal.
   /// The archived journal will be compressed using zlib to reduce storage size.
   /// Rotation typically happens automatically when the high water mark is reached, but this
   /// method allows manual control when needed.
-  pub async fn rotate_journal(&mut self) -> anyhow::Result<PathBuf> {
+  pub async fn rotate_journal(&mut self) -> anyhow::Result<Rotation> {
     // Increment generation counter for new journal
     let next_generation = self.current_generation + 1;
     let new_journal_path = self
@@ -338,15 +348,20 @@ impl VersionedKVStore {
     let old_journal_path = self
       .dir_path
       .join(format!("{}.jrn.{old_generation}", self.journal_name));
-    self.cleanup_archived_journal(&old_journal_path).await;
+    let snapshot_path = self.archive_journal(&old_journal_path).await;
 
-    Ok(new_journal_path)
+    Ok(Rotation {
+      new_journal_path,
+      old_journal_path,
+      snapshot_path,
+    })
   }
 
-  /// Clean up after successful rotation (best effort, non-critical).
+  /// Archives the old journal by compressing it and removing the original.
   ///
-  /// This compresses and removes the old journal. Failures are logged but not propagated.
-  async fn cleanup_archived_journal(&self, old_journal_path: &Path) {
+  /// This is a best-effort operation; failures to compress or delete the old journal
+  /// are logged but do not cause the rotation to fail.
+  async fn archive_journal(&self, old_journal_path: &Path) -> PathBuf {
     // Generate archived path with timestamp
     let rotation_timestamp = self
       .cached_map
@@ -360,16 +375,34 @@ impl VersionedKVStore {
       self.journal_name, rotation_timestamp
     ));
 
-    // Try to compress the old journal
-    match compress_archived_journal(old_journal_path, &archived_path).await {
-      Ok(()) => {
-        // Compression succeeded, remove uncompressed version
-        let _ = tokio::fs::remove_file(old_journal_path).await;
-      },
-      Err(_e) => {
-        // Compression failed - keep the uncompressed version as a fallback
-      },
+    log::debug!(
+      "Archiving journal {} to {}",
+      old_journal_path.display(),
+      archived_path.display()
+    );
+
+    // Try to compress the old journal for longer-term storage.
+    if let Err(e) = compress_archived_journal(old_journal_path, &archived_path).await {
+      log::warn!(
+        "Failed to compress archived journal {}: {}",
+        old_journal_path.display(),
+        e
+      )
    }
+
+    // Remove the uncompressed journal regardless of compression success. If we succeeded we no
+    // longer need it, while if we failed we consider the snapshot lost.
+    let _ignored = tokio::fs::remove_file(old_journal_path)
+      .await
+      .inspect_err(|e| {
+        log::warn!(
+          "Failed to remove old journal {}: {}",
+          old_journal_path.display(),
+          e
+        );
+      });
+
+    archived_path
   }
 
   fn populate_initial_state(

From 1047e02e594575ddc3f75f9975b62d2ca761b38f Mon Sep 17 00:00:00 2001
From: Snow Pettersen
Date: Sun, 9 Nov 2025 20:24:32 -0800
Subject: [PATCH 56/66] move mod

---
 AGENTS.md                                           | 2 +-
 bd-resilient-kv/src/versioned_kv_journal/framing.rs | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 6905513f..424d9453 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -27,7 +27,7 @@
 ## Test File Conventions
 1. Test files should be placed adjacent to the implementation file they're testing
 2. Test files should be named with a `_test.rs` suffix (e.g., `network_quality_test.rs`)
-3. Link test files in the implementation file using:
+3. Link test files in the implementation file using the following pattern at the top of the file, right below the license header and optional module-level docs:
```rust #[cfg(test)] #[path = "./file_name_test.rs"] diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing.rs b/bd-resilient-kv/src/versioned_kv_journal/framing.rs index 351756f3..8fa6402e 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/framing.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/framing.rs @@ -17,6 +17,10 @@ //! - `payload`: Opaque binary data (format determined by caller) //! - `crc32`: CRC32 checksum of (`timestamp_bytes` + payload) +#[cfg(test)] +#[path = "./framing_test.rs"] +mod tests; + use bytes::BufMut; use crc32fast::Hasher; @@ -160,7 +164,3 @@ impl Frame { Ok((Self::new(timestamp_micros, payload), total_len)) } } - -#[cfg(test)] -#[path = "./framing_test.rs"] -mod tests; From afb6d3ee09a9b7a7c5d464b2aa30aa87ecf930d4 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Sun, 9 Nov 2025 20:25:00 -0800 Subject: [PATCH 57/66] clippy --- bd-resilient-kv/src/versioned_kv_journal/store.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs index d4b4734a..42f94b10 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/store.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs @@ -387,7 +387,7 @@ impl VersionedKVStore { "Failed to compress archived journal {}: {}", old_journal_path.display(), e - ) + ); } // Remove the uncompressed regardless of compression success. If we succeeded we no longer need From 8b266bf0ee2fa68361c4b2c3ae065075aac79d7e Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 14 Nov 2025 10:56:59 -0800 Subject: [PATCH 58/66] update api --- api | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api b/api index 9f59c7f4..53334bc3 160000 --- a/api +++ b/api @@ -1 +1 @@ -Subproject commit 9f59c7f4d855a1aaec85a4647524353262f6cd58 +Subproject commit 53334bc3b224c81d1e39bcb5c1dfced7a5302a4a From 89f9a56ff3066730b9e375dab2ac8283ad1c009f Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 14 Nov 2025 17:23:15 -0800 Subject: [PATCH 59/66] use varint for length --- bd-resilient-kv/VERSIONED_FORMAT.md | 10 +++-- .../src/versioned_kv_journal/framing.rs | 39 ++++++++++--------- .../src/versioned_kv_journal/framing_test.rs | 35 +++++++++++++++-- 3 files changed, 58 insertions(+), 26 deletions(-) diff --git a/bd-resilient-kv/VERSIONED_FORMAT.md b/bd-resilient-kv/VERSIONED_FORMAT.md index a2af013a..3bfc2e99 100644 --- a/bd-resilient-kv/VERSIONED_FORMAT.md +++ b/bd-resilient-kv/VERSIONED_FORMAT.md @@ -49,7 +49,7 @@ The byte-level layout of a VERSION 1 journal file: │ VERSIONED JOURNAL ENTRY │ │ (Protobuf-encoded StateKeyValuePair) │ ├─────────────────────────────────────────────────────────────────────────┤ -│ Frame Length (u32) │ 4 bytes │ +│ Frame Length (varint) │ Variable length (1-10 bytes) │ │ Timestamp (varint) │ Variable length (microseconds) │ │ Protobuf Payload │ Variable length │ │ CRC32 │ 4 bytes │ @@ -77,7 +77,7 @@ Each entry in the journal uses a length-prefixed framing format with CRC32 integ | Component | Size | Type | Description | |-----------|------|------|-------------| -| Frame Length | 4 bytes | u32 (little-endian) | Total size of timestamp + protobuf payload + CRC32 | +| Frame Length | Variable | varint | Total size of timestamp + protobuf payload + CRC32 (1-10 bytes) | | Timestamp | Variable | varint | Entry timestamp in microseconds (serves as version) | | Protobuf Payload | Variable | bytes | Serialized StateKeyValuePair message | | CRC32 | 4 bytes | u32 (little-endian) | Checksum of 
timestamp + payload | @@ -118,10 +118,12 @@ Fields: **Size Considerations:** - **Header**: Fixed 17 bytes - **Per Entry**: Varies based on key and value size - - Frame overhead: 8+ bytes (length + CRC) + - Frame length: 1-10 bytes (varint-encoded) - Timestamp: 1-10 bytes (varint-encoded) - Protobuf payload: varies by content - - Typical: 40-200 bytes per entry + - CRC: Fixed 4 bytes + - Typical small entries: 20-50 bytes total + - Typical medium entries: 50-200 bytes total ## Journal Structure diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing.rs b/bd-resilient-kv/src/versioned_kv_journal/framing.rs index 8fa6402e..8800e3b9 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/framing.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/framing.rs @@ -9,10 +9,10 @@ //! //! Per-entry format: //! ```text -//! [length: u32][timestamp_micros: varint][payload: bytes][crc32: u32] +//! [length: varint][timestamp_micros: varint][payload: bytes][crc32: u32] //! ``` //! -//! - `length`: Total length of the frame (timestamp + payload + crc) +//! - `length`: Total length of the frame (timestamp + payload + crc), varint encoded //! - `timestamp_micros`: Microseconds since UNIX epoch (varint encoded) //! - `payload`: Opaque binary data (format determined by caller) //! - `crc32`: CRC32 checksum of (`timestamp_bytes` + payload) @@ -27,7 +27,6 @@ use crc32fast::Hasher; mod varint; const CRC_LEN: usize = 4; -const LENGTH_LEN: usize = 4; /// Frame structure for a journal entry. #[derive(Debug, Clone, PartialEq, Eq)] @@ -59,12 +58,13 @@ impl Frame { /// Calculate the encoded size of this frame. #[must_use] pub fn encoded_size(&self) -> usize { - // Calculate varint size - let varint_size = varint::compute_size(self.timestamp_micros); + let timestamp_varint_size = varint::compute_size(self.timestamp_micros); let payload_size: usize = self.payload.compute_size().try_into().unwrap_or(0); - // length(4) + timestamp_varint + payload + crc(4) - LENGTH_LEN + varint_size + payload_size + CRC_LEN + let frame_content_len = timestamp_varint_size + payload_size + CRC_LEN; + let length_varint_size = varint::compute_size(frame_content_len as u64); + + length_varint_size + frame_content_len } /// Encode this frame into a buffer. @@ -94,10 +94,11 @@ impl Frame { // Frame length = timestamp + payload + crc let frame_len = timestamp_len + payload_bytes.len() + CRC_LEN; - #[allow(clippy::cast_possible_truncation)] - { - cursor.put_u32_le(frame_len as u32); - } + + // Encode frame length as varint + let mut length_buf = [0u8; varint::MAX_SIZE]; + let length_len = varint::encode(frame_len as u64, &mut length_buf); + cursor.put_slice(&length_buf[.. length_len]); cursor.put_slice(×tamp_buf[.. timestamp_len]); cursor.put_slice(&payload_bytes); @@ -116,15 +117,15 @@ impl Frame { /// /// Returns (Frame, `bytes_consumed`) or error if invalid/incomplete. pub fn decode(buf: &[u8]) -> anyhow::Result<(Self, usize)> { - if buf.len() < LENGTH_LEN { - anyhow::bail!("Buffer too small for length field"); - } + // Decode frame length varint + let (frame_len_u64, length_len) = + varint::decode(buf).ok_or_else(|| anyhow::anyhow!("Invalid length varint"))?; - // Read frame length - let frame_len = u32::from_le_bytes(buf[0 .. LENGTH_LEN].try_into()?) 
as usize; + let frame_len = usize::try_from(frame_len_u64) + .map_err(|_| anyhow::anyhow!("Frame length too large: {frame_len_u64}"))?; // Check if we have the complete frame - let total_len = LENGTH_LEN + frame_len; // length field + frame + let total_len = length_len + frame_len; // length varint + frame content if buf.len() < total_len { anyhow::bail!( "Incomplete frame: need {} bytes, have {} bytes", @@ -133,11 +134,11 @@ impl Frame { ); } - let frame_data = &buf[LENGTH_LEN .. total_len]; + let frame_data = &buf[length_len .. total_len]; // Decode timestamp varint let (timestamp_micros, timestamp_len) = - varint::decode(frame_data).ok_or_else(|| anyhow::anyhow!("Invalid varint"))?; + varint::decode(frame_data).ok_or_else(|| anyhow::anyhow!("Invalid timestamp varint"))?; // Extract payload and CRC if frame_data.len() < timestamp_len + CRC_LEN { diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs b/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs index b0651f35..a5b43596 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/framing_test.rs @@ -152,7 +152,7 @@ fn frame_buffer_too_small() { #[test] fn frame_incomplete_length() { - let buf = vec![0x01, 0x02]; // Only 2 bytes (need 4 for length) + let buf = vec![0x80]; // Incomplete varint (has continuation bit but no next byte) let result = Frame::::decode(&buf); assert!(result.is_err()); @@ -160,9 +160,12 @@ fn frame_incomplete_length() { #[test] fn frame_incomplete_data() { - // Frame says it needs 100 bytes but we only provide 20 + // Frame says it needs 100 bytes but we only provide partial data let mut buf = vec![0u8; 20]; - buf[0 .. 4].copy_from_slice(&100u32.to_le_bytes()); + // Encode length varint for 100 bytes + let length_len = varint::encode(100, &mut buf); + // Truncate to simulate incomplete frame + buf.truncate(length_len + 10); let result = Frame::::decode(&buf); assert!(result.is_err()); @@ -206,3 +209,29 @@ fn frame_multiple_frames() { assert_eq!(consumed2, len2); assert_eq!(consumed3, len3); } + +#[test] +fn frame_length_varint_encoding() { + // Test that frame length is properly varint-encoded + // Small frames should use 1 byte for length, larger frames may use more + + // Very small payload (length should fit in 1 byte varint) + let small_frame = Frame::new(0, make_string_value("x")); + let mut buf = vec![0u8; 1024]; + let encoded_len = small_frame.encode(&mut buf).unwrap(); + + // First byte should be the length varint + let (frame_len, length_varint_len) = varint::decode(&buf).unwrap(); + assert_eq!( + length_varint_len, 1, + "Small frame should use 1-byte varint for length" + ); + + // Verify total encoded size matches + assert_eq!(encoded_len as u64, length_varint_len as u64 + frame_len); + + // Verify decoding works + let (decoded, consumed) = Frame::::decode(&buf).unwrap(); + assert_eq!(decoded, small_frame); + assert_eq!(consumed, encoded_len); +} From 2aca5d04ee3546ce07ad9eceee2daa9b3149ffc3 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 14 Nov 2025 17:29:30 -0800 Subject: [PATCH 60/66] add license headers --- bd-resilient-kv/src/versioned_kv_journal/file_manager.rs | 7 +++++++ bd-resilient-kv/src/versioned_kv_journal/framing/varint.rs | 7 +++++++ bd-resilient-kv/src/versioned_kv_journal/mod.rs | 7 +++++++ 3 files changed, 21 insertions(+) diff --git a/bd-resilient-kv/src/versioned_kv_journal/file_manager.rs b/bd-resilient-kv/src/versioned_kv_journal/file_manager.rs index e7ed71aa..337382ef 100644 --- 
a/bd-resilient-kv/src/versioned_kv_journal/file_manager.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/file_manager.rs @@ -1,3 +1,10 @@ +// shared-core - bitdrift's common client/server libraries +// Copyright Bitdrift, Inc. All rights reserved. +// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + use std::path::{Path, PathBuf}; /// Find the active journal file by searching for the highest generation number. If we failed diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing/varint.rs b/bd-resilient-kv/src/versioned_kv_journal/framing/varint.rs index dfc97fbe..26decd90 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/framing/varint.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/framing/varint.rs @@ -1,3 +1,10 @@ +// shared-core - bitdrift's common client/server libraries +// Copyright Bitdrift, Inc. All rights reserved. +// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + /// Maximum varint size for u64 (10 bytes) pub const MAX_SIZE: usize = 10; diff --git a/bd-resilient-kv/src/versioned_kv_journal/mod.rs b/bd-resilient-kv/src/versioned_kv_journal/mod.rs index 5b415d42..486d57d0 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/mod.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/mod.rs @@ -1,3 +1,10 @@ +// shared-core - bitdrift's common client/server libraries +// Copyright Bitdrift, Inc. All rights reserved. +// +// Use of this source code is governed by a source available license that can be found in the +// LICENSE file or at: +// https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt + use bd_proto::protos::state; mod file_manager; From 2be28cd648204936c942ea4e0df7c917d491e2e8 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 14 Nov 2025 20:10:58 -0800 Subject: [PATCH 61/66] propagate partial failure case, remove read() in favor of ctor arg --- .../src/versioned_kv_journal/framing.rs | 8 -- .../src/versioned_kv_journal/journal.rs | 91 +++++++-------- .../versioned_kv_journal/memmapped_journal.rs | 31 ++--- .../src/versioned_kv_journal/store.rs | 106 ++++++++++-------- 4 files changed, 120 insertions(+), 116 deletions(-) diff --git a/bd-resilient-kv/src/versioned_kv_journal/framing.rs b/bd-resilient-kv/src/versioned_kv_journal/framing.rs index 8800e3b9..ee7875d6 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/framing.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/framing.rs @@ -37,14 +37,6 @@ pub struct Frame { pub payload: M, } -impl Frame { - pub fn decode_timestamp(buf: &[u8]) -> anyhow::Result<(u64, usize)> { - let (timestamp_micros, timestamp_len) = - varint::decode(buf).ok_or_else(|| anyhow::anyhow!("Invalid varint"))?; - Ok((timestamp_micros, timestamp_len)) - } -} - impl Frame { /// Create a new frame. #[must_use] diff --git a/bd-resilient-kv/src/versioned_kv_journal/journal.rs b/bd-resilient-kv/src/versioned_kv_journal/journal.rs index 5693fe39..6fbf0292 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/journal.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/journal.rs @@ -10,6 +10,14 @@ use bd_client_common::error::InvariantError; use bd_time::TimeProvider; use std::sync::Arc; +/// Indicates whether partial data loss has occurred. 
Partial data loss is detected when the +/// journal would be parsed from disk, but we were not able to find valid records up to `position` +/// as stored in the header. +pub enum PartialDataLoss { + Yes, + None, +} + /// Timestamped implementation of a journaling system that uses timestamps /// as the version identifier for point-in-time recovery. /// @@ -56,6 +64,12 @@ const HEADER_SIZE: usize = 17; // Minimum buffer size for a valid journal const MIN_BUFFER_SIZE: usize = HEADER_SIZE + 4; +/// Returns by +struct BufferState { + highest_timestamp: u64, + partial_data_loss: PartialDataLoss, +} + /// Write to the version field of a journal buffer. fn write_version_field(buffer: &mut [u8], version: u64) { let version_bytes = version.to_le_bytes(); @@ -163,7 +177,8 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { buffer: &'a mut [u8], high_water_mark_ratio: Option, time_provider: Arc, - ) -> anyhow::Result { + f: impl FnMut(&M, u64), + ) -> anyhow::Result<(Self, PartialDataLoss)> { let buffer_len = validate_buffer_len(buffer)?; let position = read_position(buffer)?; let high_water_mark = calculate_high_water_mark(buffer_len, high_water_mark_ratio)?; @@ -177,30 +192,37 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { } // Find initialization timestamp and highest timestamp in the journal - let highest_timestamp = Self::find_latest_timestamp(buffer, position); - - Ok(Self { - position, - buffer, - high_water_mark, - high_water_mark_triggered: position >= high_water_mark, - last_timestamp: highest_timestamp, - time_provider, - _payload_marker: std::marker::PhantomData, - }) + let buffer_state = Self::iterate_buffer(buffer, position, f); + + Ok(( + Self { + position, + buffer, + high_water_mark, + high_water_mark_triggered: position >= high_water_mark, + last_timestamp: buffer_state.highest_timestamp, + time_provider, + _payload_marker: std::marker::PhantomData, + }, + buffer_state.partial_data_loss, + )) } /// Scan the journal to find the highest timestamp. - fn find_latest_timestamp(buffer: &[u8], position: usize) -> u64 { + fn iterate_buffer(buffer: &[u8], position: usize, mut f: impl FnMut(&M, u64)) -> BufferState { let mut cursor = HEADER_SIZE; - let mut highest_timestamp = 0u64; + let mut state = BufferState { + highest_timestamp: 0, + partial_data_loss: PartialDataLoss::None, + }; while cursor < position { let remaining = &buffer[cursor .. position]; - match Frame::<()>::decode_timestamp(remaining) { - Ok((timestamp_micros, consumed)) => { - highest_timestamp = timestamp_micros; + match Frame::::decode(remaining) { + Ok((frame, consumed)) => { + f(&frame.payload, frame.timestamp_micros); + state.highest_timestamp = frame.timestamp_micros; cursor += consumed; }, Err(_) => { @@ -210,7 +232,7 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { } } - highest_timestamp + state } /// Get the next monotonically increasing timestamp. @@ -266,39 +288,6 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> { self.high_water_mark_triggered } - /// Read and process all complete entries in the journal with their timestamps. - /// - /// # Returns - /// Returns `false` if there are incomplete entries remaining in the journal after reading. - pub fn read(&self, mut f: impl FnMut(&M, u64)) -> bool { - let mut cursor = HEADER_SIZE; - - let mut incomplete = false; - while cursor < self.position { - let remaining = &self.buffer[cursor .. 
self.position]; - - match Frame::::decode(remaining) { - Ok((frame, consumed)) => { - f(&frame.payload, frame.timestamp_micros); - - cursor += consumed; - }, - Err(e) => { - // TODO(snowp): In this case we may want to reset the position to cursor to avoid - // carrying forward partial/corrupted data. This matters as the recovery will bail on - // corrupt data resulting in further writes also being lost. - log::debug!("Failed to decode frame at offset {cursor}: {e}"); - - // Stop on first decode error - incomplete = true; - break; - }, - } - } - - !incomplete - } - /// Get current timestamp in microseconds since UNIX epoch. fn current_timestamp(&self) -> std::result::Result { Self::unix_timestamp_micros(self.time_provider.as_ref()) diff --git a/bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs b/bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs index a77f9c6c..a46e41e8 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs @@ -6,7 +6,7 @@ // https://polyformproject.org/wp-content/uploads/2020/06/PolyForm-Shield-1.0.0.txt use super::journal::VersionedJournal; -use bd_proto::protos::state::payload::StateKeyValuePair; +use crate::versioned_kv_journal::journal::PartialDataLoss; use bd_time::TimeProvider; use memmap2::{MmapMut, MmapOptions}; use std::fs::OpenOptions; @@ -23,27 +23,27 @@ use std::sync::Arc; /// During construction, we unsafely declare mmap's internal buffer as having a static /// lifetime, but it's actually tied to the lifetime of `inner`. This works because /// nothing external holds a reference to the buffer. -pub struct MemMappedVersionedJournal { +pub struct MemMappedVersionedJournal { // Note: mmap MUST de-init AFTER versioned_kv because mmap uses it. mmap: MmapMut, - inner: VersionedJournal<'static, StateKeyValuePair>, + inner: VersionedJournal<'static, M>, } -impl std::ops::Deref for MemMappedVersionedJournal { - type Target = VersionedJournal<'static, StateKeyValuePair>; +impl std::ops::Deref for MemMappedVersionedJournal { + type Target = VersionedJournal<'static, M>; fn deref(&self) -> &Self::Target { &self.inner } } -impl std::ops::DerefMut for MemMappedVersionedJournal { +impl std::ops::DerefMut for MemMappedVersionedJournal { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.inner } } -impl MemMappedVersionedJournal { +impl MemMappedVersionedJournal { /// Create a memory-mapped buffer from a file and convert it to a static lifetime slice. /// /// # Safety @@ -122,7 +122,8 @@ impl MemMappedVersionedJournal { size: usize, high_water_mark_ratio: Option, time_provider: Arc, - ) -> anyhow::Result { + f: impl FnMut(&M, u64), + ) -> anyhow::Result<(Self, PartialDataLoss)> { let file = OpenOptions::new().read(true).write(true).open(file_path)?; let file_len = file.metadata()?.len(); @@ -132,12 +133,16 @@ impl MemMappedVersionedJournal { let (mmap, buffer) = unsafe { Self::create_mmap_buffer(file)? }; - let versioned_kv = VersionedJournal::from_buffer(buffer, high_water_mark_ratio, time_provider)?; + let (versioned_kv, partial_data_loss) = + VersionedJournal::from_buffer(buffer, high_water_mark_ratio, time_provider, f)?; - Ok(Self { - mmap, - inner: versioned_kv, - }) + Ok(( + Self { + mmap, + inner: versioned_kv, + }, + partial_data_loss, + )) } /// Synchronize changes to disk. 
diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs index 42f94b10..7b4ee820 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/store.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs @@ -7,7 +7,7 @@ use crate::versioned_kv_journal::TimestampedValue; use crate::versioned_kv_journal::file_manager::{self, compress_archived_journal}; -use crate::versioned_kv_journal::journal::VersionedJournal; +use crate::versioned_kv_journal::journal::PartialDataLoss; use crate::versioned_kv_journal::memmapped_journal::MemMappedVersionedJournal; use ahash::AHashMap; use bd_proto::protos::state::payload::{StateKeyValuePair, StateValue}; @@ -22,6 +22,15 @@ pub enum DataLoss { None, } +impl From for DataLoss { + fn from(value: PartialDataLoss) -> Self { + match value { + PartialDataLoss::Yes => Self::Partial, + PartialDataLoss::None => Self::None, + } + } +} + /// A persistent key-value store with timestamp tracking. /// /// `VersionedKVStore` provides HashMap-like semantics backed by a timestamped journal that @@ -35,7 +44,7 @@ pub enum DataLoss { /// For detailed information about timestamp semantics, recovery bucketing, and invariants, /// see the `VERSIONED_FORMAT.md` documentation. pub struct VersionedKVStore { - journal: MemMappedVersionedJournal, + journal: MemMappedVersionedJournal, cached_map: AHashMap, dir_path: PathBuf, journal_name: String, @@ -75,18 +84,16 @@ impl VersionedKVStore { journal_path.display() ); - let (journal, mut data_loss) = if journal_path.exists() { + let (journal, initial_state, data_loss) = if journal_path.exists() { // Try to open existing journal - MemMappedVersionedJournal::from_file( + Self::open( &journal_path, buffer_size, high_water_mark_ratio, time_provider.clone(), ) - .map(|j| (j, DataLoss::None)) + .map(|(j, initial_state, data_loss)| (j, initial_state, data_loss.into())) .or_else(|_| { - // TODO(snowp): Distinguish between partial and total data loss. - // Data is corrupt or unreadable, create fresh journal Ok::<_, anyhow::Error>(( MemMappedVersionedJournal::new( @@ -95,6 +102,7 @@ impl VersionedKVStore { high_water_mark_ratio, time_provider, )?, + AHashMap::default(), DataLoss::Total, )) })? 
@@ -108,16 +116,11 @@ impl VersionedKVStore { high_water_mark_ratio, time_provider, )?, + AHashMap::default(), DataLoss::None, ) }; - let (initial_state, incomplete) = Self::populate_initial_state(&journal); - - if incomplete && data_loss == DataLoss::None { - data_loss = DataLoss::Partial; - } - Ok(( Self { journal, @@ -160,15 +163,13 @@ impl VersionedKVStore { let (journal_path, generation) = file_manager::find_active_journal(dir, name).await; - let journal = MemMappedVersionedJournal::from_file( + let (journal, initial_state, data_loss) = Self::open( &journal_path, buffer_size, high_water_mark_ratio, time_provider, )?; - let (initial_state, incomplete) = Self::populate_initial_state(&journal); - Ok(( Self { journal, @@ -179,7 +180,7 @@ impl VersionedKVStore { high_water_mark_ratio, current_generation: generation, }, - if incomplete { + if matches!(data_loss, PartialDataLoss::Yes) { DataLoss::Partial } else { DataLoss::None @@ -187,6 +188,40 @@ impl VersionedKVStore { )) } + fn open( + journal_path: &Path, + buffer_size: usize, + high_water_mark_ratio: Option, + time_provider: Arc, + ) -> anyhow::Result<( + MemMappedVersionedJournal, + AHashMap, + PartialDataLoss, + )> { + let mut initial_state = AHashMap::default(); + let (journal, data_loss) = MemMappedVersionedJournal::::from_file( + &journal_path, + buffer_size, + high_water_mark_ratio, + time_provider, + |entry, timestamp| { + if let Some(value) = entry.value.as_ref() { + initial_state.insert( + entry.key.clone(), + TimestampedValue { + value: value.clone(), + timestamp, + }, + ); + } else { + initial_state.remove(&entry.key); + } + }, + )?; + + Ok((journal, initial_state, data_loss)) + } + /// Get a value by key. /// /// This operation is O(1) as it reads from the in-memory cache. @@ -405,27 +440,6 @@ impl VersionedKVStore { archived_path } - fn populate_initial_state( - journal: &VersionedJournal<'_, StateKeyValuePair>, - ) -> (AHashMap, bool) { - let mut map = AHashMap::new(); - let incomplete = journal.read(|entry, timestamp| { - if let Some(value) = entry.value.as_ref() { - map.insert( - entry.key.clone(), - TimestampedValue { - value: value.clone(), - timestamp, - }, - ); - } else { - map.remove(&entry.key); - } - }); - - (map, incomplete) - } - /// Create a new rotated journal with compacted state. /// /// Note: Rotation cannot fail due to insufficient buffer space. Since rotation creates a new @@ -435,7 +449,7 @@ impl VersionedKVStore { async fn create_rotated_journal( &self, journal_path: &Path, - ) -> anyhow::Result { + ) -> anyhow::Result> { // Create in-memory buffer for new journal let mut buffer = vec![0u8; self.buffer_size]; @@ -459,11 +473,15 @@ impl VersionedKVStore { tokio::fs::write(journal_path, &buffer).await?; // Open as memory-mapped journal - MemMappedVersionedJournal::from_file( - journal_path, - self.buffer_size, - self.high_water_mark_ratio, - self.journal.time_provider.clone(), + Ok( + MemMappedVersionedJournal::from_file( + journal_path, + self.buffer_size, + self.high_water_mark_ratio, + self.journal.time_provider.clone(), + |_, _| {}, + )? 
+      .0,
+    )
   }
 }

From 1d26a05e47d90bb99e342d3bab680a5659ab55d6 Mon Sep 17 00:00:00 2001
From: Snow Pettersen
Date: Fri, 14 Nov 2025 20:11:59 -0800
Subject: [PATCH 62/66] clippy

---
 bd-resilient-kv/src/versioned_kv_journal/store.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs
index 7b4ee820..a8ffeeed 100644
--- a/bd-resilient-kv/src/versioned_kv_journal/store.rs
+++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs
@@ -200,7 +200,7 @@ impl VersionedKVStore {
   )> {
     let mut initial_state = AHashMap::default();
     let (journal, data_loss) = MemMappedVersionedJournal::<StateKeyValuePair>::from_file(
-      &journal_path,
+      journal_path,
       buffer_size,
       high_water_mark_ratio,
       time_provider,

From 300401429e415293a12c56d76f3608b64a40ff89 Mon Sep 17 00:00:00 2001
From: Snow Pettersen
Date: Fri, 14 Nov 2025 20:31:53 -0800
Subject: [PATCH 63/66] improve rotation

Directly create the mmap instead of initializing temp buffer
---
 .../src/versioned_kv_journal/journal.rs       | 72 ++++++-------------
 .../versioned_kv_journal/memmapped_journal.rs |  5 +-
 .../src/versioned_kv_journal/store.rs         | 30 +++-----
 3 files changed, 33 insertions(+), 74 deletions(-)

diff --git a/bd-resilient-kv/src/versioned_kv_journal/journal.rs b/bd-resilient-kv/src/versioned_kv_journal/journal.rs
index 6fbf0292..e4f92793 100644
--- a/bd-resilient-kv/src/versioned_kv_journal/journal.rs
+++ b/bd-resilient-kv/src/versioned_kv_journal/journal.rs
@@ -134,6 +134,7 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> {
   /// # Arguments
   /// * `buffer` - The storage buffer
   /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8
+  /// * `entries` - Iterator of entries to be inserted into the newly created buffer.
   ///
   /// # Errors
   /// Returns an error if the buffer is too small or if `high_water_mark_ratio` is invalid.
@@ -141,24 +142,41 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> {
     buffer: &'a mut [u8],
     high_water_mark_ratio: Option<f64>,
     time_provider: Arc<dyn TimeProvider>,
+    entries: impl IntoIterator<Item = (M, u64)>,
   ) -> anyhow::Result<Self> {
     let buffer_len = validate_buffer_len(buffer)?;
     let high_water_mark = calculate_high_water_mark(buffer_len, high_water_mark_ratio)?;
 
     // Write header
-    let timestamp = Self::unix_timestamp_micros(time_provider.as_ref())?;
-    let position = HEADER_SIZE;
+    let mut position = HEADER_SIZE;
+
+    let mut max_state_timestamp = None;
+
+    // Write all current state with their original timestamps
+    for (entry, timestamp) in entries {
+      max_state_timestamp = Some(timestamp);
+
+      let frame = Frame::new(timestamp, entry);
+
+      // Encode frame
+      let available_space = &mut buffer[position ..];
+      let encoded_len = frame.encode(available_space)?;
+
+      position += encoded_len;
+    }
 
     write_position(buffer, position);
     write_version(buffer);
     buffer[16] = 0; // Reserved byte
 
+    let now = Self::unix_timestamp_micros(time_provider.as_ref())?;
+
     Ok(Self {
       position,
       buffer,
       high_water_mark,
       high_water_mark_triggered: false,
-      last_timestamp: timestamp,
+      last_timestamp: max_state_timestamp.unwrap_or(now),
       time_provider,
       _payload_marker: std::marker::PhantomData,
     })
@@ -304,51 +322,3 @@ impl<'a, M: protobuf::Message> VersionedJournal<'a, M> {
       .ok_or(InvariantError::Invariant)
   }
 }
-
-/// Rotation utilities for creating new journals with compacted state
-impl<'a, M: protobuf::Message> VersionedJournal<'a, M> {
-  /// Create a new journal initialized with the compacted state from a snapshot.
- /// - /// The new journal will have all current entries written with their **original - /// timestamps** to preserve historical accuracy. The journal's monotonic timestamp - /// enforcement will respect the highest timestamp in the provided state. - /// - /// # Arguments - /// * `buffer` - The buffer to write the new journal to - /// * `entries` - Iterator over the entries that should be included in the new journal - /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark - /// - /// # Errors - /// Returns an error if serialization fails or buffer is too small. - pub fn create_rotated_journal( - &self, - buffer: &'a mut [u8], - entries: impl IntoIterator, - high_water_mark_ratio: Option, - ) -> anyhow::Result { - // Create a new journal - let mut journal = Self::new(buffer, high_water_mark_ratio, self.time_provider.clone())?; - - // Find the maximum timestamp in the state to maintain monotonicity - let max_state_timestamp = self.last_timestamp; - - // Write all current state with their original timestamps - for (entry, timestamp) in entries { - // Update last_timestamp to ensure monotonicity is maintained - journal.last_timestamp = std::cmp::max(journal.last_timestamp, timestamp); - - let frame = Frame::new(timestamp, entry); - - // Encode frame - let available_space = &mut journal.buffer[journal.position ..]; - let encoded_len = frame.encode(available_space)?; - - journal.set_position(journal.position + encoded_len); - } - - // Ensure last_timestamp reflects the maximum timestamp we've written - journal.last_timestamp = std::cmp::max(journal.last_timestamp, max_state_timestamp); - - Ok(journal) - } -} diff --git a/bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs b/bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs index a46e41e8..fa459e3f 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/memmapped_journal.rs @@ -73,6 +73,7 @@ impl MemMappedVersionedJournal { /// * `file_path` - Path to the file to use for storage /// * `size` - Minimum size of the file in bytes /// * `high_water_mark_ratio` - Optional ratio (0.0 to 1.0) for high water mark. Default: 0.8 + /// * `entries` - Iterator of entries to be inserted into the newly created buffer. /// /// # Errors /// Returns an error if the file cannot be created/opened or memory-mapped. @@ -81,6 +82,7 @@ impl MemMappedVersionedJournal { size: usize, high_water_mark_ratio: Option, time_provider: Arc, + entries: impl IntoIterator, ) -> anyhow::Result { let file = OpenOptions::new() .read(true) @@ -96,7 +98,8 @@ impl MemMappedVersionedJournal { let (mmap, buffer) = unsafe { Self::create_mmap_buffer(file)? }; - let versioned_kv = VersionedJournal::new(buffer, high_water_mark_ratio, time_provider)?; + let versioned_kv = + VersionedJournal::new(buffer, high_water_mark_ratio, time_provider, entries)?; Ok(Self { mmap, diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs index a8ffeeed..8f658fa8 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/store.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs @@ -101,6 +101,7 @@ impl VersionedKVStore { buffer_size, high_water_mark_ratio, time_provider, + std::iter::empty(), )?, AHashMap::default(), DataLoss::Total, @@ -108,13 +109,13 @@ impl VersionedKVStore { })? 
} else { // Create new journal - ( MemMappedVersionedJournal::new( &journal_path, buffer_size, high_water_mark_ratio, time_provider, + std::iter::empty(), )?, AHashMap::default(), DataLoss::None, @@ -450,12 +451,11 @@ impl VersionedKVStore { &self, journal_path: &Path, ) -> anyhow::Result> { - // Create in-memory buffer for new journal - let mut buffer = vec![0u8; self.buffer_size]; - - // Use VersionedJournal to create rotated journal in memory - let _rotated = self.journal.create_rotated_journal( - &mut buffer, + let rotated = MemMappedVersionedJournal::new( + journal_path, + self.buffer_size, + self.high_water_mark_ratio, + self.journal.time_provider.clone(), self.cached_map.iter().map(|kv| { ( StateKeyValuePair { @@ -466,22 +466,8 @@ impl VersionedKVStore { kv.1.timestamp, ) }), - self.high_water_mark_ratio, )?; - // Write buffer to the new journal path - tokio::fs::write(journal_path, &buffer).await?; - - // Open as memory-mapped journal - Ok( - MemMappedVersionedJournal::from_file( - journal_path, - self.buffer_size, - self.high_water_mark_ratio, - self.journal.time_provider.clone(), - |_, _| {}, - )? - .0, - ) + Ok(rotated) } } From 23bd85387240579acb2634b8f70856ed064a56c5 Mon Sep 17 00:00:00 2001 From: Snow Pettersen Date: Fri, 14 Nov 2025 20:32:20 -0800 Subject: [PATCH 64/66] clippy --- bd-resilient-kv/src/versioned_kv_journal/store.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs index 8f658fa8..a02b3acf 100644 --- a/bd-resilient-kv/src/versioned_kv_journal/store.rs +++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs @@ -369,7 +369,7 @@ impl VersionedKVStore { // rotation. // Create new journal with compacted state - let new_journal = self.create_rotated_journal(&new_journal_path).await?; + let new_journal = self.create_rotated_journal(&new_journal_path)?; // Replace in-memory journal with new one (critical section - but no file ops!) // The old journal file remains at the previous generation number @@ -447,7 +447,7 @@ impl VersionedKVStore { /// journal with the same buffer size and compaction only removes redundant updates (old /// versions of keys), the compacted state is always ≤ the current journal size. If data fits /// during normal operation, it will always fit during rotation. 
-  async fn create_rotated_journal(
+  fn create_rotated_journal(
     &self,
     journal_path: &Path,
   ) -> anyhow::Result<MemMappedVersionedJournal<StateKeyValuePair>> {

From 518fb23dec535eec60891def47f39d9f223f3175 Mon Sep 17 00:00:00 2001
From: Snow Pettersen
Date: Fri, 14 Nov 2025 20:33:06 -0800
Subject: [PATCH 65/66] simplify

---
 bd-resilient-kv/src/versioned_kv_journal/store.rs | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs
index a02b3acf..2e409cf0 100644
--- a/bd-resilient-kv/src/versioned_kv_journal/store.rs
+++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs
@@ -451,7 +451,7 @@ impl VersionedKVStore {
     &self,
     journal_path: &Path,
   ) -> anyhow::Result<MemMappedVersionedJournal<StateKeyValuePair>> {
-    let rotated = MemMappedVersionedJournal::new(
+    MemMappedVersionedJournal::new(
       journal_path,
       self.buffer_size,
       self.high_water_mark_ratio,
@@ -466,8 +466,6 @@ impl VersionedKVStore {
           kv.1.timestamp,
         )
       }),
-    )?;
-
-    Ok(rotated)
+    )
   }
 }

From 6f19078563670fa581217578cfddbc5ead15f822 Mon Sep 17 00:00:00 2001
From: Snow Pettersen
Date: Mon, 17 Nov 2025 09:24:44 -0800
Subject: [PATCH 66/66] refactor

---
 .../src/versioned_kv_journal/store.rs         | 24 ++++++++-----------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/bd-resilient-kv/src/versioned_kv_journal/store.rs b/bd-resilient-kv/src/versioned_kv_journal/store.rs
index 2e409cf0..72402277 100644
--- a/bd-resilient-kv/src/versioned_kv_journal/store.rs
+++ b/bd-resilient-kv/src/versioned_kv_journal/store.rs
@@ -358,27 +358,23 @@ impl VersionedKVStore {
   /// Rotation typically happens automatically when the high water mark is reached, but this
   /// method allows manual control when needed.
   pub async fn rotate_journal(&mut self) -> anyhow::Result<Rotation> {
-    // Increment generation counter for new journal
     let next_generation = self.current_generation + 1;
-    let new_journal_path = self
-      .dir_path
-      .join(format!("{}.jrn.{next_generation}", self.journal_name));
+    let old_generation = self.current_generation;
+    self.current_generation = next_generation;
 
     // TODO(snowp): This part needs fuzzing and more safeguards.
     // TODO(snowp): Consider doing this out of band to split error handling for the insert and
     // rotation.
 
-    // Create new journal with compacted state
-    let new_journal = self.create_rotated_journal(&new_journal_path)?;
-
-    // Replace in-memory journal with new one (critical section - but no file ops!)
-    // The old journal file remains at the previous generation number
-    let old_journal = std::mem::replace(&mut self.journal, new_journal);
-    let old_generation = self.current_generation;
-    self.current_generation = next_generation;
+    // Create new journal with compacted state. This doesn't touch the file containing the old
+    // journal.
+    let new_journal_path = self
+      .dir_path
+      .join(format!("{}.jrn.{next_generation}", self.journal_name));
 
-    // Drop the old journal to release the mmap
-    drop(old_journal);
+    MemMappedVersionedJournal::sync(&self.journal)?;
+    let new_journal = self.create_rotated_journal(&new_journal_path)?;
+    self.journal = new_journal;
 
     // Best-effort cleanup: compress and archive the old journal
     let old_journal_path = self