diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d436f8..4dac227 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,24 @@ All notable changes to this project will be documented in this file. +## [Unreleased] + +### Added +- `OneIo::to_lines_lossy` — lossy line iterator from any `Box` +- `read_lines_lossy` / `OneIo::read_lines_lossy` — line iteration with invalid UTF-8 replaced by `U+FFFD` +- `read_to_string_lossy` / `OneIo::read_to_string_lossy` / `read_to_string_lossy_async` — full-file read with lossy UTF-8 +- `read_to_bytes` / `OneIo::read_to_bytes` / `read_to_bytes_async` — byte-perfect read without UTF-8 validation +- `--strict-utf8` CLI flag for strict UTF-8 validation in the CLI + +### Changed +- CLI now defaults to lossy UTF-8 reading; no longer exits on invalid byte sequences +- Documentation examples updated to use lossy APIs + +### Deprecated +- `read_lines` — use `read_lines_lossy` for lossy text, `read_to_bytes` for byte-perfect whole-file reads, or `get_reader` + `BufReader` for streaming byte processing +- `read_to_string` — use `read_to_string_lossy` or `read_to_bytes` +- `read_to_string_async` — use `read_to_string_lossy_async` or `read_to_bytes_async` + ## v0.22.0 -- 2026-05-01 ### Breaking changes diff --git a/README.md b/README.md index 8a79a30..0c2c322 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,7 @@ Read all content into a string (works with compression and remote files automati ```rust use oneio; -let content = oneio::read_to_string("https://spaces.bgpkit.org/oneio/test_data.txt.gz")?; +let content = oneio::read_to_string_lossy("https://spaces.bgpkit.org/oneio/test_data.txt.gz")?; println!("{}", content); ``` @@ -120,7 +120,7 @@ Read line by line: ```rust use oneio; -let lines = oneio::read_lines("https://spaces.bgpkit.org/oneio/test_data.txt.gz")? +let lines = oneio::read_lines_lossy("https://spaces.bgpkit.org/oneio/test_data.txt.gz")? .map(|line| line.unwrap()) .collect::>(); @@ -151,7 +151,7 @@ writer.write_all(b"Hello, compressed world!")?; drop(writer); // Important: close the writer // Read it back -let content = oneio::read_to_string("output.txt.gz")?; +let content = oneio::read_to_string_lossy("output.txt.gz")?; ``` ### Reusable OneIo Clients @@ -171,8 +171,8 @@ let oneio = OneIo::builder() .build()?; // Reuse the same configuration for multiple requests -let content1 = oneio.read_to_string("https://api.example.com/data1.json")?; -let content2 = oneio.read_to_string("https://api.example.com/data2.json")?; +let content1 = oneio.read_to_string_lossy("https://api.example.com/data1.json")?; +let content2 = oneio.read_to_string_lossy("https://api.example.com/data2.json")?; ``` **Builder Methods:** @@ -237,7 +237,7 @@ use oneio; #[tokio::main] async fn main() -> Result<(), Box> { - let content = oneio::read_to_string_async("https://example.com/data.json.gz").await?; + let content = oneio::read_to_string_lossy_async("https://example.com/data.json.gz").await?; oneio::download_async( "https://example.com/data.csv.gz", @@ -270,7 +270,7 @@ s3_delete("my-bucket", "path/to/copy.txt")?; // Read S3 directly using OneIO let oneio = oneio::OneIo::new()?; -let content = oneio.read_to_string("s3://my-bucket/path/to/file.txt")?; +let content = oneio.read_to_string_lossy("s3://my-bucket/path/to/file.txt")?; // Check existence and get metadata if s3_exists("my-bucket", "path/to/file.txt")? { @@ -327,7 +327,7 @@ fn main() -> Result<(), Box> { oneio::crypto::ensure_default_provider()?; // Now all HTTPS/S3/FTP operations will work - let content = oneio::read_to_string("https://example.com/data.txt")?; + let content = oneio::read_to_string_lossy("https://example.com/data.txt")?; Ok(()) } diff --git a/examples/async_read.rs b/examples/async_read.rs index 777a319..579fd6c 100644 --- a/examples/async_read.rs +++ b/examples/async_read.rs @@ -5,7 +5,7 @@ //! Requires the "async" feature and an async runtime (tokio). use oneio::get_reader_async; -use oneio::read_to_string_async; +use oneio::read_to_string_lossy_async; use tokio::io::{AsyncReadExt, BufReader}; #[tokio::main] @@ -15,9 +15,9 @@ async fn main() -> Result<(), Box> { // You can use a local file or a remote URL let path = "tests/test_data.txt.gz"; - // --- High-level API: read_to_string_async --- + // --- High-level API: read_to_string_lossy_async --- println!("Reading file asynchronously (high-level): {}", path); - let content = read_to_string_async(path).await?; + let content = read_to_string_lossy_async(path).await?; println!("File content (high-level):\n{}", content); // --- Low-level API: get_reader_async --- diff --git a/examples/test_crypto_provider.rs b/examples/test_crypto_provider.rs index de60a34..e711923 100644 --- a/examples/test_crypto_provider.rs +++ b/examples/test_crypto_provider.rs @@ -12,7 +12,7 @@ fn main() -> Result<(), Box> { // Test HTTPS download - crypto provider is already set up println!("\nDownloading test file via HTTPS..."); - let content = oneio::read_to_string("https://spaces.bgpkit.org/oneio/test_data.txt")?; + let content = oneio::read_to_string_lossy("https://spaces.bgpkit.org/oneio/test_data.txt")?; println!("✓ Downloaded content:\n{}", content.trim()); println!("\n✓ All operations completed successfully!"); diff --git a/specs/02-lossy-read/README.md b/specs/02-lossy-read/README.md new file mode 100644 index 0000000..56beb7e --- /dev/null +++ b/specs/02-lossy-read/README.md @@ -0,0 +1,329 @@ +# Spec: Lossy Read Operations + +**Status**: Draft (Revised after MAGI review) +**Author**: Mingwei Zhang +**Created**: 2025-05-12 +**Target Branch**: `dev/lossy-read` + +## 1. Overview + +Add lossy UTF-8 reading variants and byte-perfect helpers, deprecate strict-UTF-8 APIs that fail on real-world data containing non-UTF-8 bytes. + +**Non-goals:** +- Full encoding detection (e.g., CP1252, Shift-JIS) — out of scope +- Changing existing APIs to be lossy by default — backward incompatible +- Async line iteration — not requested, `read_to_string_lossy_async` is sufficient + +**Success criteria:** +- [ ] `read_lines_lossy` no longer aborts on Latin-1 or other non-UTF-8 data +- [ ] CLI no longer exits on non-UTF-8 input +- [ ] Existing strict APIs deprecated with clear migration paths +- [ ] All internal code compiles without deprecation warnings +- [ ] Tests cover lossy replacement, byte-perfect round-trip, and legacy failure mode + +## 2. Current State + +`OneIo::read_lines`, `OneIo::read_to_string`, and `read_to_string_async` use Rust's strict UTF-8 reading (`BufRead::lines()`, `Read::read_to_string()`, `AsyncReadExt::read_to_string()`). Any invalid byte sequence yields `io::Error(InvalidData)` and stops iteration or fails the entire read. + +Real-world impact: The RIPE IRR database (~280 MB) contains Latin-1 `0xf3` bytes in `descr:` fields. `read_lines` fails after ~1700 lines with `"stream did not contain valid UTF-8"`. + +The CLI (`src/bin/oneio.rs`) manually wraps a resolved reader in `BufReader` and calls `.lines()`, so it also dies on the first bad byte with exit code 1. + +There is **no built-in lossy line iterator in `std::io`**. `BufRead::lines()` is strict UTF-8 only, and `BufRead::split(b'\n')` leaves trailing `\r` on CRLF input while also double-allocating per line. A custom iterator is required. + +## 3. Proposed Solution + +### Core type: `impl Iterator` helper + +A private helper using `std::iter::from_fn` that mirrors `std::io::Lines` semantics but replaces invalid UTF-8 with `U+FFFD`. Uses a reusable `Vec` buffer and `read_until` for single-allocation-per-line performance. + +```rust +fn lossy_lines(mut buf: B) -> impl Iterator> { + let mut bytes = Vec::new(); + std::iter::from_fn(move || { + bytes.clear(); + match buf.read_until(b'\n', &mut bytes) { + Ok(0) => None, + Ok(_) => { + // Match BufRead::lines() semantics + if bytes.ends_with(b"\n") { + bytes.pop(); + if bytes.ends_with(b"\r") { + bytes.pop(); + } + } + Some(Ok(String::from_utf8_lossy(&bytes).into_owned())) + } + Err(e) => Some(Err(e)), + } + }) +} +``` + +**Why `impl Iterator` over a public struct or `Box`:** + +| Approach | CRLF handling | Allocations/line | API surface | Verdict | +|----------|---------------|------------------|-------------|---------| +| `BufRead::split(b'\n')` + `from_utf8_lossy` | Broken (leaves `\r`) | 2 (Vec + String) | `Box` | Rejected | +| `Box>>` | Same as above | 2 | Opaque, dynamic dispatch | Rejected | +| **`impl Iterator` via `from_fn`** | Correct | 1 | Minimal — no new public types | **Accepted** | +| `LossyLines` public struct | Correct | 1 | Adds permanent public type | Rejected | + +**Tradeoff:** `impl Iterator` cannot be named in struct fields. In practice, 100% of `read_lines` callers in this codebase use it directly in a `for` loop or `.map().collect()` chain, not stored in structs. If a namable type is needed later, a public struct can be added without breaking existing `impl Iterator` consumers. + +### New public APIs + +| API | Location | Description | +|-----|----------|-------------| +| `OneIo::to_lines_lossy` | `src/client.rs` | Converts `Box` into `impl Iterator>` | +| `OneIo::read_lines_lossy` | `src/client.rs` | `get_reader` + `to_lines_lossy` | +| `OneIo::read_to_string_lossy` | `src/client.rs` | `read_to_end` + `String::from_utf8_lossy` | +| `OneIo::read_to_bytes` | `src/client.rs` | `read_to_end`, returns `Vec` | +| `oneio::read_lines_lossy` | `src/lib.rs` | Module-level convenience | +| `oneio::read_to_string_lossy` | `src/lib.rs` | Module-level convenience | +| `oneio::read_to_bytes` | `src/lib.rs` | Module-level convenience | +| `oneio::read_to_string_lossy_async` | `src/async_reader.rs` | Async `read_to_end` + lossy conversion | +| `oneio::read_to_bytes_async` | `src/async_reader.rs` | Async byte-perfect read | + +### Deprecated APIs + +Attach `#[deprecated(since = "0.23.0", note = "...")]`: + +- `OneIo::read_lines` → *"Use `read_lines_lossy` for lossy text, `read_to_bytes` for byte-perfect whole-file reads, or `get_reader` for byte streaming"* +- `OneIo::read_to_string` → *"Use `read_to_string_lossy` or `read_to_bytes`"* +- `oneio::read_lines` → same as above +- `oneio::read_to_string` → same as above +- `oneio::read_to_string_async` → *"Use `read_to_string_lossy_async` or `read_to_bytes_async`"* + +### CLI change + +Add a `--strict-utf8` flag and default to lossy reading: + +```rust +/// Fail on invalid UTF-8 instead of replacing with U+FFFD +#[clap(long)] +strict_utf8: bool, +``` + +Replace the inline `reader.lines()` loop: + +```rust +// Before +let reader = Box::new(BufReader::new(reader_result?)); +for line in reader.lines() { ... } + +// After +let lines = if cli.strict_utf8 { + Box::new(BufReader::new(reader_result?).lines()) + as Box>> +} else { + oneio.to_lines_lossy(reader_result?) +}; +for line in lines { ... } +``` + +## 4. Breaking Changes + +### Library API + +No breaking changes. All existing functions remain unchanged with identical signatures and behavior. Strict-UTF-8 APIs are **deprecated** with `#[deprecated]` but not removed. + +| API | Change Type | Detail | +|-----|-------------|--------| +| `OneIo::read_lines` | Deprecated (warning only) | Still strict UTF-8, still works | +| `OneIo::read_to_string` | Deprecated (warning only) | Still strict UTF-8, still works | +| `oneio::read_lines` | Deprecated (warning only) | Still strict UTF-8, still works | +| `oneio::read_to_string` | Deprecated (warning only) | Still strict UTF-8, still works | +| `oneio::read_to_string_async` | Deprecated (warning only) | Still strict UTF-8, still works | +| `OneIo::read_lines_lossy` | **New** | Returns `impl Iterator>` | +| `OneIo::read_to_string_lossy` | **New** | Returns `Result` | +| `OneIo::read_to_bytes` | **New** | Returns `Result, OneIoError>` | +| `oneio::read_lines_lossy` | **New** | Returns `impl Iterator>` | +| `oneio::read_to_string_lossy` | **New** | Returns `Result` | +| `oneio::read_to_bytes` | **New** | Returns `Result, OneIoError>` | +| `oneio::read_to_string_lossy_async` | **New** | Async, returns `String` | +| `oneio::read_to_bytes_async` | **New** | Async, returns `Vec` | + +### CLI Binary + +**Behavioral breaking change.** The CLI previously exited with code 1 when encountering invalid UTF-8 in the input stream. After this change, the CLI silently replaces invalid byte sequences with `U+FFFD` and continues processing. + +Impact on users: +- **Shell scripts** using `set -e` or checking `$?` will no longer catch encoding errors via exit code +- **Data pipelines** that relied on `oneio` failing on non-UTF-8 data will now receive lossy output instead of an error +- **Log processing** will now complete on partially-invalid files instead of aborting mid-stream + +This is an intentional change: the CLI is a data pipeline tool, and hard failure on encoding is unexpected for non-programmers. Users who need strict validation can use the new `--strict-utf8` CLI flag. + +## 6. Implementation Plan + +| Phase | Task | Acceptance Criteria | +|-------|------|-------------------| +| 1 | Write spec | This document exists and is reviewed | +| 2 | Add private `lossy_lines` helper | Compiles, handles `\r\n`, `\n`, and no-trailing-newline correctly | +| 3 | Add `OneIo::to_lines_lossy` | Returns `impl Iterator`, used by `read_lines_lossy` and CLI | +| 4 | Add `read_lines_lossy`, `read_to_string_lossy`, `read_to_bytes` | All have module-level wrappers | +| 5 | Add async variants | `read_to_string_lossy_async`, `read_to_bytes_async` | +| 6 | Deprecate strict APIs | `#[deprecated]` on 5 functions, zero internal warnings | +| 7 | Migrate CLI to lossy | CLI uses `to_lines_lossy`, no manual `BufReader` | +| 8 | Migrate tests/examples | All internal callers use new APIs | +| 9 | Add encoding test | Test file with `0xf3` validates lossy and bytes behavior | +| 10 | Update docs/CHANGELOG | `lib.rs` examples, `README.md`, `CHANGELOG.md` updated | + +## 7. Testing Strategy + +### 7.1 Test Fixtures + +All tests create fixtures programmatically (no checked-in binary files) to avoid cross-platform encoding issues: + +```rust +const LATIN1_BYTES: &[u8] = b"valid\nbad: \xf3\nnext\n"; +const CRLF_BYTES: &[u8] = b"line1\r\nline2\r\n"; +const BARE_CR_BYTES: &[u8] = b"line1\rline2\r"; +const NO_NEWLINE_BYTES: &[u8] = b"no newline"; +const EMPTY_BYTES: &[u8] = b""; +const ALL_INVALID_BYTES: &[u8] = b"\xff\xfe\xfd"; +const MIXED_INVALID_BYTES: &[u8] = b"hello\n\xffworld\n"; +const UTF8_WITH_BOM: &[u8] = b"\xef\xbb\xbfhello\nworld\n"; +``` + +### 7.2 Unit Tests (`tests/lossy_lines_tests.rs` or inline in `src/client.rs`) + +Test the private `lossy_lines` helper directly using `std::io::Cursor`: + +| # | Test Name | Input | Expected Result | +|---|-----------|-------|-----------------| +| 1 | `test_lossy_lines_basic` | `b"line1\nline2\n"` | 2 lines: `"line1"`, `"line2"` | +| 2 | `test_lossy_lines_latin1` | `LATIN1_BYTES` | 3 lines: `"valid"`, `"bad: \u{FFFD}"`, `"next"` | +| 3 | `test_lossy_lines_crlf` | `CRLF_BYTES` | 2 lines: `"line1"`, `"line2"` (no trailing `\r`) | +| 4 | `test_lossy_lines_bare_cr` | `BARE_CR_BYTES` | 1 line: `"line1\rline2\r"` (bare `\r` preserved, only `\n` splits) | +| 5 | `test_lossy_lines_no_trailing_newline` | `NO_NEWLINE_BYTES` | 1 line: `"no newline"` | +| 6 | `test_lossy_lines_empty` | `EMPTY_BYTES` | 0 lines | +| 7 | `test_lossy_lines_all_invalid` | `ALL_INVALID_BYTES` | 1 line: `"\u{FFFD}\u{FFFD}\u{FFFD}"` | +| 8 | `test_lossy_lines_mixed_invalid` | `MIXED_INVALID_BYTES` | 2 lines: `"hello"`, `"\u{FFFD}world"` | +| 9 | `test_lossy_lines_utf8_bom` | `UTF8_WITH_BOM` | 2 lines: BOM preserved in first line `"\u{FEFF}hello"` | +| 10 | `test_lossy_lines_single_long_line` | `b"x"` repeated 1MB + `b"\n"` | 1 line, 1MB of `"x"` | + +### 7.3 Integration Tests (`tests/basic_integration.rs` additions) + +Test the public API surface via temp files: + +| # | Test Name | API Under Test | Input | Expected Result | +|---|-----------|----------------|-------|-----------------| +| 11 | `test_read_lines_lossy_latin1` | `oneio::read_lines_lossy` | Temp file with `LATIN1_BYTES` | 3 lines, middle contains `\u{FFFD}` | +| 12 | `test_read_lines_lossy_crlf` | `oneio::read_lines_lossy` | Temp file with `CRLF_BYTES` | 2 lines, no `\r` | +| 13 | `test_read_lines_lossy_empty` | `oneio::read_lines_lossy` | Empty temp file | 0 lines | +| 14 | `test_read_lines_lossy_continuation` | `oneio::read_lines_lossy` | `LATIN1_BYTES` + more valid lines after | All lines yielded, not truncated at bad byte | +| 15 | `test_read_to_string_lossy_latin1` | `oneio::read_to_string_lossy` | Temp file with `LATIN1_BYTES` | String containing `\u{FFFD}` | +| 16 | `test_read_to_string_lossy_valid_utf8` | `oneio::read_to_string_lossy` | Valid UTF-8 file | Exact same content as `read_to_string` | +| 17 | `test_read_to_bytes_roundtrip` | `oneio::read_to_bytes` | Temp file with `LATIN1_BYTES` | Exact `LATIN1_BYTES` returned | +| 18 | `test_read_to_bytes_empty` | `oneio::read_to_bytes` | Empty temp file | Empty `Vec` | +| 19 | `test_read_lines_strict_still_fails` | `oneio::read_lines` | Temp file with `LATIN1_BYTES` | `Err(InvalidData)` on second `next()` | +| 20 | `test_read_to_string_strict_still_fails` | `oneio::read_to_string` | Temp file with `LATIN1_BYTES` | `Err(InvalidData)` | +| 21 | `test_client_read_lines_lossy` | `OneIo::read_lines_lossy` | Via `OneIo::new()` client | Same as test 11 | +| 22 | `test_client_to_lines_lossy` | `OneIo::to_lines_lossy` | Via `get_reader` + `to_lines_lossy` | Same as test 11 | +| 23 | `test_client_read_to_bytes` | `OneIo::read_to_bytes` | Via `OneIo::new()` client | Same as test 17 | + +### 7.4 Async Tests (`tests/async_integration.rs` additions) + +| # | Test Name | API Under Test | Input | Expected Result | +|---|-----------|----------------|-------|-----------------| +| 24 | `test_read_to_string_lossy_async_latin1` | `oneio::read_to_string_lossy_async` | Temp file with `LATIN1_BYTES` | String containing `\u{FFFD}` | +| 25 | `test_read_to_bytes_async_roundtrip` | `oneio::read_to_bytes_async` | Temp file with `LATIN1_BYTES` | Exact `LATIN1_BYTES` returned | +| 26 | `test_read_to_string_async_strict_still_fails` | `oneio::read_to_string_async` | Temp file with `LATIN1_BYTES` | `Err(InvalidData)` | + +### 7.5 CLI Tests + +| # | Test Name | Command | Input | Expected Result | +|---|-----------|---------|-------|-----------------| +| 27 | `cli_lossy_default` | `cargo run -- --stats file_with_0xf3` | File with Latin-1 byte | Prints line count, exits 0 | +| 28 | `cli_lossy_output` | `cargo run -- file_with_0xf3` | File with Latin-1 byte | Prints all lines with `\u{FFFD}` in output, exits 0 | +| 29 | `cli_strict_utf8_fails` | `cargo run -- --strict-utf8 --stats file_with_0xf3` | File with Latin-1 byte | Exits with non-zero code | +| 30 | `cli_strict_utf8_success` | `cargo run -- --strict-utf8 --stats valid_utf8_file` | Valid UTF-8 file | Prints line count, exits 0 | +| 31 | `cli_crlf_handling` | `cargo run -- file_with_crlf` | File with `\r\n` | Output lines do not contain `\r` | +| 32 | `cli_compression_with_lossy` | `cargo run -- file_with_0xf3.gz` | Gzipped file with Latin-1 | Decompresses and prints with `\u{FFFD}`, exits 0 | + +### 7.6 Edge Case & Stress Tests + +| # | Test Name | Scenario | Expected Result | +|---|-----------|----------|-----------------| +| 33 | `test_very_large_file` | 100MB file with scattered invalid bytes | Completes without OOM, all lines yielded | +| 34 | `test_concurrent_reads` | Multiple threads calling `read_lines_lossy` on different files | No data races, all iterators independent | +| 35 | `test_send_trait` | Verify `read_lines_lossy` result can be moved across threads | Compiles (required for `Send` usage) | +| 36 | `test_no_leak_on_early_drop` | Drop iterator after 1 line | No memory leak, file handle released | +| 37 | `test_lines_with_embedded_nul` | `b"hello\x00world\n"` | Line contains embedded NUL, lossy conversion not triggered (NUL is valid UTF-8) | +| 38 | `test_max_line_length` | Single line > BufReader capacity (8KB default) | Correctly reads full line, not truncated | + +### 7.7 Deprecation & Compatibility Tests + +| # | Test Name | Scenario | Expected Result | +|---|-----------|----------|-----------------| +| 39 | `test_deprecated_api_compiles` | Use `#[allow(deprecated)]` on `read_lines` call | Compiles without error | +| 40 | `test_zero_internal_deprecation_warnings` | Build with `--all-features` | No deprecation warnings from library's own code | +| 41 | `test_new_api_no_deprecation` | Use `read_lines_lossy` | No deprecation warning | + +### 7.8 Test Infrastructure + +**Helper function for temp files:** +```rust +fn write_temp(bytes: &[u8]) -> tempfile::NamedTempFile { + let mut file = tempfile::NamedTempFile::new().unwrap(); + file.write_all(bytes).unwrap(); + file.flush().unwrap(); + file +} +``` + +**CI integration:** +- All new tests run with `cargo test --all-features` +- Encoding tests run on Linux, macOS, and Windows CI runners +- CLI tests use shell scripts in `.github/workflows/cli-tests.sh` + +## 8. Risks + +| Risk | Likelihood | Mitigation | +|------|------------|------------| +| Deprecation warnings in external dependents | High | Clear deprecation messages with direct replacement names | +| `read_to_string_lossy` allocates entire file into memory | Low | Same cost as `String::from_utf8_lossy`; document streaming recommendation | +| Test fixture encoding issues on Windows | Low | Write test fixtures in code (hex literals), don't check in binary files | +| `impl Iterator` helper diverges from future `BufRead::lines()` behavior | Low | `lines()` semantics are stable; tests lock expected behavior | + +## 9. Design Discussion: Should We Expect Unicode? + +A central question: is `read_lines` failing on non-UTF-8 actually a bug, or a feature? + +### Arguments for strict UTF-8 (current behavior) + +- **Rust `String` requires UTF-8** — Any non-UTF-8 byte that slips through is technically invalid `String` data +- **Data quality validation** — For JSON, configs, or protocol text, invalid UTF-8 genuinely means corrupted/malformed input +- **Caller expectations** — Some users may rely on `read_lines` as an implicit validation step; silently lossy conversion could hide data corruption +- **HTTP content-type** — Servers often declare `charset=utf-8`; failure on invalid bytes honors that contract + +### Arguments against strict UTF-8 (the issue #72 perspective) + +- **Real-world data is messy** — RIPE IRR (`0xf3` Latin-1), legacy logs (CP1252), mail archives (ISO-8859-1), scraped HTML frequently contain non-UTF-8 bytes +- **No encoding metadata in compressed files** — `.gz`, `.bz2`, `.zst` provide no charset information; assuming UTF-8 is just a guess +- **Failure mode is unhelpful** — Stopping mid-stream with `InvalidData` gives the caller no way to recover or salvage the remaining 99% of valid data +- **OneIO is a general I/O library, not a validator** — Its job is to get bytes from A to B; strictness is the caller's concern if they need it +- **100% of `read_lines` usage in this codebase** — Used in `for` loops or `.map().collect()`; none store the iterator or depend on strictness for validation + +### Conclusion + +For a general-purpose I/O library, **assuming Unicode is not safe**. The default should tolerate non-UTF-8 data (via lossy replacement) because: +1. The failure case (stopping mid-stream) is worse than the replacement case (`U+FFFD` in one field) +2. Callers who need strict validation can opt in via the (now deprecated but functional) `read_lines` or via `--strict-utf8` in the CLI +3. This matches the Rust ecosystem convention: `String::from_utf8_lossy` exists precisely because real-world bytes are not always valid UTF-8 + +The deprecation of strict APIs signals that lossy tolerance is the preferred default for general I/O, while keeping strict paths available for callers who need validation. + +## 10. Decision Log + +| Date | Decision | Rationale | +|------|----------|-----------| +| 2025-05-12 | Add lossy variants | Real-world data (RIPE IRR, logs) contains non-UTF-8 bytes | +| 2025-05-12 | Deprecate rather than change default | Backward compatibility; callers may rely on strict UTF-8 validation | +| 2025-05-12 | No encoding detection dependency | Out of scope; `String::from_utf8_lossy` is the standard Rust lossy conversion | +| 2025-05-12 | CLI defaults to lossy without a flag | CLI is a data pipeline tool, hard failure on encoding is unexpected | +| 2025-05-12 | `impl Iterator` via `from_fn` over public struct or `Box` | Minimal API surface — no new public types, no dynamic dispatch, correct CRLF | +| 2025-05-12 | `read_until` over `split(b'\n')` | `split` leaves `\r` on CRLF and allocates twice per line | +| 2025-05-12 | `--strict-utf8` CLI flag | Provides escape hatch for users who need strict validation in the CLI | diff --git a/specs/README.md b/specs/README.md index a9698f1..9cd91f5 100644 --- a/specs/README.md +++ b/specs/README.md @@ -3,9 +3,10 @@ This directory contains design specifications for oneio features. | # | Spec | Status | -|---|------|--------| +|---|---|------| | 01 | [rusty-s3-migration](rusty-s3-migration/README.md) | Complete | +| 02 | [lossy-read](02-lossy-read/README.md) | Draft | --- -*Last updated: 2025-05-01* +*Last updated: 2025-05-12* diff --git a/src/async_reader.rs b/src/async_reader.rs index 592f043..cf34862 100644 --- a/src/async_reader.rs +++ b/src/async_reader.rs @@ -18,6 +18,10 @@ pub async fn get_reader_async(path: &str) -> Result Result { let mut reader = get_reader_async(path).await?; @@ -26,6 +30,25 @@ pub async fn read_to_string_async(path: &str) -> Result { Ok(content) } +/// Reads the entire content of a file asynchronously into a string, +/// replacing invalid UTF-8 sequences with `U+FFFD`. +#[cfg(feature = "async")] +pub async fn read_to_string_lossy_async(path: &str) -> Result { + let mut reader = get_reader_async(path).await?; + let mut buf = Vec::new(); + reader.read_to_end(&mut buf).await?; + Ok(String::from_utf8_lossy(&buf).into_owned()) +} + +/// Reads the entire content of a file asynchronously into raw bytes. +#[cfg(feature = "async")] +pub async fn read_to_bytes_async(path: &str) -> Result, OneIoError> { + let mut reader = get_reader_async(path).await?; + let mut buf = Vec::new(); + reader.read_to_end(&mut buf).await?; + Ok(buf) +} + /// Downloads a file asynchronously from a URL to a local path #[cfg(feature = "async")] pub async fn download_async(url: &str, path: &str) -> Result<(), OneIoError> { diff --git a/src/bin/oneio.rs b/src/bin/oneio.rs index 0da481a..805cf65 100644 --- a/src/bin/oneio.rs +++ b/src/bin/oneio.rs @@ -59,6 +59,10 @@ struct Cli { #[clap(long)] compression: Option, + /// Fail on invalid UTF-8 instead of replacing with U+FFFD + #[clap(long)] + strict_utf8: bool, + #[clap(subcommand)] command: Option, } @@ -325,19 +329,25 @@ fn main() { oneio.get_reader(path) }; - let reader = Box::new(BufReader::new(match reader_result { + let reader = match reader_result { Ok(r) => r, Err(e) => { eprintln!("cannot open {path}: {e}"); exit(1); } - })); + }; let mut stdout = std::io::stdout(); let mut count_lines = 0usize; let mut count_chars = 0usize; - for line in reader.lines() { + let lines: Box>> = if cli.strict_utf8 { + Box::new(BufReader::new(reader).lines()) + } else { + Box::new(oneio.to_lines_lossy(reader)) + }; + + for line in lines { let line = match line { Ok(l) => l, Err(e) => { diff --git a/src/client.rs b/src/client.rs index 6c63934..67d5518 100644 --- a/src/client.rs +++ b/src/client.rs @@ -12,6 +12,30 @@ use std::fs::File; use std::io::{BufRead, BufReader, BufWriter, Lines, Read, Write}; use std::path::Path; +/// Private helper: lossy UTF-8 line iterator over any `BufRead`. +/// +/// Invalid UTF-8 sequences are replaced with `U+FFFD`. Line splitting matches +/// `BufRead::lines()`: strips terminal `\n` and `\r\n`. +fn lossy_lines(mut buf: B) -> impl Iterator> { + let mut bytes = Vec::new(); + std::iter::from_fn(move || { + bytes.clear(); + match buf.read_until(b'\n', &mut bytes) { + Ok(0) => None, + Ok(_) => { + if bytes.ends_with(b"\n") { + bytes.pop(); + if bytes.ends_with(b"\r") { + bytes.pop(); + } + } + Some(Ok(String::from_utf8_lossy(&bytes).into_owned())) + } + Err(e) => Some(Err(e)), + } + }) +} + /// Reusable OneIO client for applying request configuration across multiple operations. /// /// Use [`OneIo::builder()`] to customize default headers, TLS certificates, and @@ -148,6 +172,7 @@ impl OneIo { } /// Reads the full contents of a file or URL into a string. + #[deprecated(since = "0.23.0", note = "Use read_to_string_lossy or read_to_bytes")] pub fn read_to_string(&self, path: &str) -> Result { let mut reader = self.get_reader(path)?; let mut content = String::new(); @@ -155,6 +180,23 @@ impl OneIo { Ok(content) } + /// Reads the full contents of a file or URL into a string, + /// replacing invalid UTF-8 sequences with `U+FFFD`. + pub fn read_to_string_lossy(&self, path: &str) -> Result { + let mut reader = self.get_reader(path)?; + let mut buf = Vec::new(); + reader.read_to_end(&mut buf)?; + Ok(String::from_utf8_lossy(&buf).into_owned()) + } + + /// Reads the full contents of a file or URL into raw bytes. + pub fn read_to_bytes(&self, path: &str) -> Result, OneIoError> { + let mut reader = self.get_reader(path)?; + let mut buf = Vec::new(); + reader.read_to_end(&mut buf)?; + Ok(buf) + } + /// Reads and deserializes JSON into the requested type. #[cfg(feature = "json")] pub fn read_json_struct(&self, path: &str) -> Result { @@ -164,6 +206,10 @@ impl OneIo { } /// Returns an iterator over lines from the provided path. + #[deprecated( + since = "0.23.0", + note = "Use read_lines_lossy for lossy text, read_to_bytes for byte-perfect whole-file reads, or get_reader for byte streaming" + )] pub fn read_lines( &self, path: &str, @@ -172,6 +218,28 @@ impl OneIo { Ok(BufReader::new(reader).lines()) } + /// Converts any reader into a lossy line iterator. + /// + /// Invalid UTF-8 sequences are replaced with `U+FFFD`. I/O errors from the + /// underlying reader still propagate as `Err`. + pub fn to_lines_lossy( + &self, + reader: Box, + ) -> impl Iterator> + Send { + lossy_lines(BufReader::new(reader)) + } + + /// Like [`read_lines`], but invalid UTF-8 sequences are replaced with + /// `U+FFFD` instead of producing `Err(io::ErrorKind::InvalidData)`. + /// I/O errors from the underlying reader still propagate as `Err`. + pub fn read_lines_lossy( + &self, + path: &str, + ) -> Result> + Send, OneIoError> { + let reader = self.get_reader(path)?; + Ok(self.to_lines_lossy(reader)) + } + /// Determines the raw content length for a local or remote path. pub fn get_content_length(&self, path: &str) -> Result { match crate::get_protocol(path) { diff --git a/src/lib.rs b/src/lib.rs index 2aa65cf..972934f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,7 +16,7 @@ oneio = "0.22" use oneio; // Read a remote compressed file -let content = oneio::read_to_string("https://example.com/data.txt.gz")?; +let content = oneio::read_to_string_lossy("https://example.com/data.txt.gz")?; # Ok(()) # } ``` @@ -59,10 +59,10 @@ oneio = { version = "0.22", default-features = false, features = ["http", "nativ ```rust,no_run # fn main() -> Result<(), Box> { // Read entire file to string -let content = oneio::read_to_string("data.txt")?; +let content = oneio::read_to_string_lossy("data.txt")?; // Read lines -for line in oneio::read_lines("data.txt")? { +for line in oneio::read_lines_lossy("data.txt")? { println!("{}", line?); } @@ -99,8 +99,8 @@ let client = OneIo::builder() .timeout(std::time::Duration::from_secs(30)) .build()?; -let data1 = client.read_to_string("https://api.example.com/1.json")?; -let data2 = client.read_to_string("https://api.example.com/2.json")?; +let data1 = client.read_to_string_lossy("https://api.example.com/1.json")?; +let data2 = client.read_to_string_lossy("https://api.example.com/2.json")?; # Ok(()) # } # #[cfg(not(feature = "http"))] @@ -148,7 +148,7 @@ Enable the `async` feature: ```rust # #[cfg(feature = "async")] # async fn example() -> Result<(), oneio::OneIoError> { -let content = oneio::read_to_string_async("https://example.com/data.txt").await?; +let content = oneio::read_to_string_lossy_async("https://example.com/data.txt").await?; # Ok(()) # } ``` @@ -310,10 +310,23 @@ pub fn exists(path: &str) -> Result { } /// Reads the full contents of a file or URL into a string. +#[deprecated(since = "0.23.0", note = "Use read_to_string_lossy or read_to_bytes")] +#[allow(deprecated)] pub fn read_to_string(path: &str) -> Result { builder::default_oneio()?.read_to_string(path) } +/// Reads the full contents of a file or URL into a string, +/// replacing invalid UTF-8 sequences with `U+FFFD`. +pub fn read_to_string_lossy(path: &str) -> Result { + builder::default_oneio()?.read_to_string_lossy(path) +} + +/// Reads the full contents of a file or URL into raw bytes. +pub fn read_to_bytes(path: &str) -> Result, OneIoError> { + builder::default_oneio()?.read_to_bytes(path) +} + /// Reads and deserializes JSON into the requested type. #[cfg(feature = "json")] pub fn read_json_struct(path: &str) -> Result { @@ -321,12 +334,25 @@ pub fn read_json_struct(path: &str) -> Result Result>>, OneIoError> { builder::default_oneio()?.read_lines(path) } +/// Like [`read_lines`], but invalid UTF-8 sequences are replaced with +/// `U+FFFD` instead of producing `Err(io::ErrorKind::InvalidData)`. +pub fn read_lines_lossy( + path: &str, +) -> Result> + Send, OneIoError> { + builder::default_oneio()?.read_lines_lossy(path) +} + /// Downloads a remote resource to a local path. pub fn download(remote: &str, local: &str) -> Result<(), OneIoError> { builder::default_oneio()?.download(remote, local) @@ -351,11 +377,29 @@ pub async fn get_reader_async( } /// Reads the entire content of a file asynchronously into a string. +#[deprecated( + since = "0.23.0", + note = "Use read_to_string_lossy_async or read_to_bytes_async" +)] +#[allow(deprecated)] #[cfg(feature = "async")] pub async fn read_to_string_async(path: &str) -> Result { async_reader::read_to_string_async(path).await } +/// Reads the entire content of a file asynchronously into a string, +/// replacing invalid UTF-8 sequences with `U+FFFD`. +#[cfg(feature = "async")] +pub async fn read_to_string_lossy_async(path: &str) -> Result { + async_reader::read_to_string_lossy_async(path).await +} + +/// Reads the entire content of a file asynchronously into raw bytes. +#[cfg(feature = "async")] +pub async fn read_to_bytes_async(path: &str) -> Result, OneIoError> { + async_reader::read_to_bytes_async(path).await +} + /// Downloads a file asynchronously from a URL to a local path. #[cfg(feature = "async")] pub async fn download_async(url: &str, path: &str) -> Result<(), OneIoError> { diff --git a/tests/async_integration.rs b/tests/async_integration.rs index 3ad16ef..6fae8f9 100644 --- a/tests/async_integration.rs +++ b/tests/async_integration.rs @@ -81,6 +81,64 @@ async fn async_download_http_to_file() { let _ = std::fs::remove_file(tmp_path); } +fn tmp_path(name: &str) -> std::path::PathBuf { + use std::sync::atomic::{AtomicU64, Ordering}; + static COUNTER: AtomicU64 = AtomicU64::new(0); + let id = COUNTER.fetch_add(1, Ordering::SeqCst); + std::env::temp_dir().join(format!( + "oneio_async_test_{}_{}_{name}", + std::process::id(), + id + )) +} + +#[tokio::test] +async fn async_read_to_string_lossy_latin1() { + let tmp_path = tmp_path("lossy"); + let _ = std::fs::remove_file(&tmp_path); + std::fs::write(&tmp_path, b"valid\nbad: \xf3\nnext\n").unwrap(); + + let content = oneio::read_to_string_lossy_async(tmp_path.to_str().unwrap()) + .await + .unwrap(); + assert!(content.contains('\u{FFFD}')); + assert!(content.contains("valid")); + assert!(content.contains("next")); + + let _ = std::fs::remove_file(&tmp_path); +} + +#[tokio::test] +async fn async_read_to_bytes_roundtrip() { + let tmp_path = tmp_path("bytes"); + let _ = std::fs::remove_file(&tmp_path); + let expected = b"valid\nbad: \xf3\nnext\n"; + std::fs::write(&tmp_path, expected).unwrap(); + + let bytes = oneio::read_to_bytes_async(tmp_path.to_str().unwrap()) + .await + .unwrap(); + assert_eq!(bytes, expected); + + let _ = std::fs::remove_file(&tmp_path); +} + +#[tokio::test] +#[allow(deprecated)] +async fn async_read_to_string_async_strict_still_fails() { + let tmp_path = tmp_path("strict"); + let _ = std::fs::remove_file(&tmp_path); + std::fs::write(&tmp_path, b"valid\nbad: \xf3\nnext\n").unwrap(); + + let result = oneio::read_to_string_async(tmp_path.to_str().unwrap()).await; + assert!( + result.is_err(), + "strict async read_to_string should fail on Latin-1 byte" + ); + + let _ = std::fs::remove_file(&tmp_path); +} + #[cfg(feature = "any_gz")] #[tokio::test] async fn async_download_preserves_compressed_bytes() { diff --git a/tests/basic_integration.rs b/tests/basic_integration.rs index e3ff178..e69d314 100644 --- a/tests/basic_integration.rs +++ b/tests/basic_integration.rs @@ -58,11 +58,11 @@ fn test_read(file_path: &str) { assert_eq!(text.as_str(), TEST_TEXT); assert_eq!( - oneio::read_to_string(file_path).unwrap().as_str(), + oneio::read_to_string_lossy(file_path).unwrap().as_str(), TEST_TEXT ); - let lines = oneio::read_lines(file_path) + let lines = oneio::read_lines_lossy(file_path) .unwrap() .map(|line| line.unwrap()) .collect::>(); @@ -152,8 +152,8 @@ fn test_oneio_builder_reuses_default_headers() { .build() .unwrap(); - let first = oneio.read_to_string(&url).unwrap(); - let second = oneio.read_to_string(&url).unwrap(); + let first = oneio.read_to_string_lossy(&url).unwrap(); + let second = oneio.read_to_string_lossy(&url).unwrap(); assert_eq!(first, TEST_TEXT); assert_eq!(second, TEST_TEXT); diff --git a/tests/lossy_read_tests.rs b/tests/lossy_read_tests.rs new file mode 100644 index 0000000..e009056 --- /dev/null +++ b/tests/lossy_read_tests.rs @@ -0,0 +1,345 @@ +//! Tests for lossy UTF-8 reading and byte-perfect reads. +//! +//! These validate the `read_lines_lossy`, `read_to_string_lossy`, and `read_to_bytes` +//! sync APIs added to handle real-world non-UTF-8 data. + +use std::io::Write; +use std::path::PathBuf; + +const LATIN1_BYTES: &[u8] = b"valid\nbad: \xf3\nnext\n"; +const CRLF_BYTES: &[u8] = b"line1\r\nline2\r\n"; +const BARE_CR_BYTES: &[u8] = b"line1\rline2\r"; +const NO_NEWLINE_BYTES: &[u8] = b"no newline"; +const EMPTY_BYTES: &[u8] = b""; +const ALL_INVALID_BYTES: &[u8] = b"\xff\xfe\xfd"; +const MIXED_INVALID_BYTES: &[u8] = b"hello\n\xffworld\n"; + +/// RAII temp file that deletes on drop. +struct TempFile { + path: PathBuf, +} + +impl TempFile { + fn new(bytes: &[u8]) -> Self { + use std::sync::atomic::{AtomicU64, Ordering}; + static COUNTER: AtomicU64 = AtomicU64::new(0); + let id = COUNTER.fetch_add(1, Ordering::SeqCst); + let path = std::env::temp_dir().join(format!( + "oneio_lossy_test_{}_{}.txt", + std::process::id(), + id + )); + let mut file = std::fs::File::create(&path).unwrap(); + file.write_all(bytes).unwrap(); + file.flush().unwrap(); + Self { path } + } + + fn path(&self) -> &PathBuf { + &self.path + } +} + +impl Drop for TempFile { + fn drop(&mut self) { + let _ = std::fs::remove_file(&self.path); + } +} + +// ───────────────────────────────────────────── +// Unit-style tests via private helper +// ───────────────────────────────────────────── + +#[test] +fn test_lossy_lines_basic() { + let cursor = std::io::Cursor::new(b"line1\nline2\n"); + let lines: Vec = oneio::OneIo::new() + .unwrap() + .to_lines_lossy(Box::new(cursor)) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(lines, vec!["line1", "line2"]); +} + +#[test] +fn test_lossy_lines_latin1() { + let cursor = std::io::Cursor::new(LATIN1_BYTES); + let lines: Vec = oneio::OneIo::new() + .unwrap() + .to_lines_lossy(Box::new(cursor)) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(lines.len(), 3); + assert_eq!(lines[0], "valid"); + assert_eq!(lines[1], "bad: \u{FFFD}"); + assert_eq!(lines[2], "next"); +} + +#[test] +fn test_lossy_lines_crlf() { + let cursor = std::io::Cursor::new(CRLF_BYTES); + let lines: Vec = oneio::OneIo::new() + .unwrap() + .to_lines_lossy(Box::new(cursor)) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(lines.len(), 2); + assert_eq!(lines[0], "line1"); + assert_eq!(lines[1], "line2"); + assert!(!lines[0].ends_with('\r')); +} + +#[test] +fn test_lossy_lines_bare_cr() { + // Files with bare \r and no \n are read as a single line (same as BufRead::lines()) + let cursor = std::io::Cursor::new(BARE_CR_BYTES); + let lines: Vec = oneio::OneIo::new() + .unwrap() + .to_lines_lossy(Box::new(cursor)) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(lines.len(), 1); + assert_eq!(lines[0], "line1\rline2\r"); +} + +#[test] +fn test_lossy_lines_no_trailing_newline() { + let cursor = std::io::Cursor::new(NO_NEWLINE_BYTES); + let lines: Vec = oneio::OneIo::new() + .unwrap() + .to_lines_lossy(Box::new(cursor)) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(lines.len(), 1); + assert_eq!(lines[0], "no newline"); +} + +#[test] +fn test_lossy_lines_empty() { + let cursor = std::io::Cursor::new(EMPTY_BYTES); + let lines: Vec = oneio::OneIo::new() + .unwrap() + .to_lines_lossy(Box::new(cursor)) + .map(|r| r.unwrap()) + .collect(); + assert!(lines.is_empty()); +} + +#[test] +fn test_lossy_lines_all_invalid() { + let cursor = std::io::Cursor::new(ALL_INVALID_BYTES); + let lines: Vec = oneio::OneIo::new() + .unwrap() + .to_lines_lossy(Box::new(cursor)) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(lines.len(), 1); + assert_eq!(lines[0], "\u{FFFD}\u{FFFD}\u{FFFD}"); +} + +#[test] +fn test_lossy_lines_mixed_invalid() { + let cursor = std::io::Cursor::new(MIXED_INVALID_BYTES); + let lines: Vec = oneio::OneIo::new() + .unwrap() + .to_lines_lossy(Box::new(cursor)) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(lines.len(), 2); + assert_eq!(lines[0], "hello"); + assert_eq!(lines[1], "\u{FFFD}world"); +} + +#[test] +fn test_lossy_lines_continuation_past_bad_byte() { + let cursor = std::io::Cursor::new(LATIN1_BYTES); + let lines: Vec = oneio::OneIo::new() + .unwrap() + .to_lines_lossy(Box::new(cursor)) + .map(|r| r.unwrap()) + .collect(); + assert_eq!(lines.len(), 3, "iterator should continue past bad byte"); +} + +// ───────────────────────────────────────────── +// Integration tests via public API +// ───────────────────────────────────────────── + +#[test] +fn test_read_lines_lossy_latin1() { + let file = TempFile::new(LATIN1_BYTES); + let path = file.path().clone(); + let lines: Vec = oneio::read_lines_lossy(path.to_str().unwrap()) + .unwrap() + .map(|r| r.unwrap()) + .collect(); + assert_eq!(lines.len(), 3); + assert_eq!(lines[1], "bad: \u{FFFD}"); +} + +#[test] +fn test_read_lines_lossy_crlf() { + let file = TempFile::new(CRLF_BYTES); + let path = file.path().clone(); + let lines: Vec = oneio::read_lines_lossy(path.to_str().unwrap()) + .unwrap() + .map(|r| r.unwrap()) + .collect(); + assert_eq!(lines[0], "line1"); + assert!(!lines[0].ends_with('\r')); +} + +#[test] +fn test_read_lines_lossy_empty() { + let file = TempFile::new(EMPTY_BYTES); + let path = file.path().clone(); + let lines: Vec = oneio::read_lines_lossy(path.to_str().unwrap()) + .unwrap() + .map(|r| r.unwrap()) + .collect(); + assert!(lines.is_empty()); +} + +#[test] +fn test_read_to_string_lossy_latin1() { + let file = TempFile::new(LATIN1_BYTES); + let path = file.path().clone(); + let content = oneio::read_to_string_lossy(path.to_str().unwrap()).unwrap(); + assert!(content.contains('\u{FFFD}')); + assert!(content.contains("valid")); + assert!(content.contains("next")); +} + +#[test] +fn test_read_to_string_lossy_valid_utf8() { + let valid = b"hello\nworld\n"; + let file = TempFile::new(valid); + let path = file.path().clone(); + let lossy = oneio::read_to_string_lossy(path.to_str().unwrap()).unwrap(); + #[allow(deprecated)] + let strict = oneio::read_to_string(path.to_str().unwrap()).unwrap(); + assert_eq!(lossy, strict, "lossy should match strict on valid UTF-8"); +} + +#[test] +fn test_read_to_bytes_roundtrip() { + let file = TempFile::new(LATIN1_BYTES); + let path = file.path().clone(); + let bytes = oneio::read_to_bytes(path.to_str().unwrap()).unwrap(); + assert_eq!(bytes, LATIN1_BYTES); +} + +#[test] +fn test_read_to_bytes_empty() { + let file = TempFile::new(EMPTY_BYTES); + let path = file.path().clone(); + let bytes = oneio::read_to_bytes(path.to_str().unwrap()).unwrap(); + assert!(bytes.is_empty()); +} + +#[test] +fn test_client_read_lines_lossy() { + let client = oneio::OneIo::new().unwrap(); + let file = TempFile::new(LATIN1_BYTES); + let path = file.path().clone(); + let lines: Vec = client + .read_lines_lossy(path.to_str().unwrap()) + .unwrap() + .map(|r| r.unwrap()) + .collect(); + assert_eq!(lines.len(), 3); + assert_eq!(lines[1], "bad: \u{FFFD}"); +} + +#[test] +fn test_client_to_lines_lossy() { + let client = oneio::OneIo::new().unwrap(); + let file = TempFile::new(LATIN1_BYTES); + let path = file.path().clone(); + let reader = client.get_reader(path.to_str().unwrap()).unwrap(); + let lines: Vec = client.to_lines_lossy(reader).map(|r| r.unwrap()).collect(); + assert_eq!(lines.len(), 3); + assert_eq!(lines[1], "bad: \u{FFFD}"); +} + +#[test] +fn test_client_read_to_bytes() { + let client = oneio::OneIo::new().unwrap(); + let file = TempFile::new(LATIN1_BYTES); + let path = file.path().clone(); + let bytes = client.read_to_bytes(path.to_str().unwrap()).unwrap(); + assert_eq!(bytes, LATIN1_BYTES); +} + +// ───────────────────────────────────────────── +// Legacy strict behavior (deprecated but functional) +// ───────────────────────────────────────────── + +#[test] +#[allow(deprecated)] +fn test_read_lines_strict_still_fails() { + let file = TempFile::new(LATIN1_BYTES); + let path = file.path().clone(); + let mut lines = oneio::read_lines(path.to_str().unwrap()).unwrap(); + assert_eq!(lines.next().unwrap().unwrap(), "valid"); + let second = lines.next().unwrap(); + assert!( + second.is_err(), + "strict read_lines should fail on Latin-1 byte" + ); +} + +#[test] +#[allow(deprecated)] +fn test_read_to_string_strict_still_fails() { + let file = TempFile::new(LATIN1_BYTES); + let path = file.path().clone(); + let result = oneio::read_to_string(path.to_str().unwrap()); + assert!( + result.is_err(), + "strict read_to_string should fail on Latin-1 byte" + ); +} + +// ───────────────────────────────────────────── +// Edge cases +// ───────────────────────────────────────────── + +#[test] +fn test_lines_with_embedded_nul() { + let bytes = b"hello\x00world\n"; + let file = TempFile::new(bytes); + let path = file.path().clone(); + let lines: Vec = oneio::read_lines_lossy(path.to_str().unwrap()) + .unwrap() + .map(|r| r.unwrap()) + .collect(); + assert_eq!(lines.len(), 1); + assert_eq!(lines[0], "hello\x00world"); +} + +#[test] +fn test_send_trait() { + let client = oneio::OneIo::new().unwrap(); + let file = TempFile::new(LATIN1_BYTES); + let path = file.path().clone(); + let lines = client.read_lines_lossy(path.to_str().unwrap()).unwrap(); + // Move to another thread — should compile if Send + let handle = std::thread::spawn(move || { + let collected: Vec = lines.map(|r| r.unwrap()).collect(); + assert_eq!(collected.len(), 3); + }); + handle.join().unwrap(); +} + +#[test] +fn test_no_leak_on_early_drop() { + let client = oneio::OneIo::new().unwrap(); + let file = TempFile::new(LATIN1_BYTES); + let path = file.path().clone(); + let mut lines = client.read_lines_lossy(path.to_str().unwrap()).unwrap(); + // Read only first line, then drop + assert_eq!(lines.next().unwrap().unwrap(), "valid"); + drop(lines); + // If this doesn't hang or panic, the test passes +}