diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 180d113..3b93e76 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -6,14 +6,45 @@ on: workflow_dispatch: jobs: - test-accelerated: - name: Test accelerated (aarch64, x86_64) + test-aarch64: + name: Test aarch64 strategy: matrix: - os: [ubuntu-latest, ubuntu-22.04-arm, ubuntu-24.04-arm, macos-latest] + os: [ubuntu-22.04-arm, ubuntu-24.04-arm, macos-14, macos-15, macos-26, macos-latest, windows-11-arm] rust-toolchain: - "1.81" # minimum for this crate - - "1.89" # when VPCLMULQDQ was stabilized + - "1.89" # when AVX-512 VPCLMULQDQ was stabilized + - "stable" + - "nightly" + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 # not pinning to commit hash since this is a GitHub action, which we trust + - uses: actions-rust-lang/setup-rust-toolchain@9d7e65c320fdb52dcd45ffaa68deb6c02c8754d9 # v1.12.0 + with: + toolchain: ${{ matrix.rust-toolchain }} + components: rustfmt, clippy + cache-key: ${{ matrix.os }}-${{ matrix.rust-toolchain }} + - name: Check + run: cargo check + - name: Architecture check + run: cargo run --bin arch-check + - if: ${{ matrix.rust-toolchain != 'nightly' }} + name: Format + run: cargo fmt -- --check + - if: ${{ matrix.rust-toolchain != 'nightly' }} + name: Clippy + run: cargo clippy + - name: Test + run: cargo test + + test-x86_64: + name: Test x86_64 + strategy: + matrix: + os: [ ubuntu-latest, ubuntu-22.04, ubuntu-24.04, macos-13, macos-15-intel, windows-2022, windows-2025, windows-latest ] + rust-toolchain: + - "1.81" # minimum for this crate + - "1.89" # when AVX-512 VPCLMULQDQ was stabilized - "stable" - "nightly" runs-on: ${{ matrix.os }} @@ -38,14 +69,14 @@ jobs: run: cargo test test-x86: - name: Test accelerated (x86) + name: Test x86 runs-on: ubuntu-latest strategy: matrix: target: [i586-unknown-linux-gnu, i686-unknown-linux-gnu] rust-toolchain: - "1.81" # minimum for this crate - - "1.89" # when VPCLMULQDQ was stabilized + - "1.89" # when AVX-512 VPCLMULQDQ was stabilized - "stable" - "nightly" steps: @@ -71,7 +102,7 @@ jobs: target: [powerpc-unknown-linux-gnu, powerpc64-unknown-linux-gnu] rust-toolchain: - "1.81" # minimum for this crate - - "1.89" # when VPCLMULQDQ was stabilized + - "1.89" # when AVX-512 VPCLMULQDQ was stabilized - "stable" - "nightly" steps: diff --git a/.gitignore b/.gitignore index 24335e6..d97ced5 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ /test/test_*.bin .idea .DS_Store -.git \ No newline at end of file +.git +.vscode \ No newline at end of file diff --git a/.kiro/specs/checksum-benchmark-option/design.md b/.kiro/specs/checksum-benchmark-option/design.md new file mode 100644 index 0000000..26cf154 --- /dev/null +++ b/.kiro/specs/checksum-benchmark-option/design.md @@ -0,0 +1,202 @@ +# Design Document + +## Overview + +This design extends the existing `bin/checksum.rs` tool with benchmark functionality through a new `-b` flag. The benchmark mode will measure CRC performance using either user-provided data (files/strings) or randomly generated data, reporting throughput in GiB/s along with the acceleration target used. + +The design maintains backward compatibility while adding a clean benchmark interface that leverages existing patterns from the `benches/benchmark.rs` implementation. 
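+
+As a concrete illustration, here are a few hypothetical invocations (flag spellings follow the Architecture section below; the binary name is an assumption):
+
+```
+checksum -a CRC-32/ISCSI -b                            # defaults: 1 MiB random data, 10 s
+checksum -a CRC-64/NVME -b --size 4194304              # 4 MiB of random data
+checksum -a CRC-32/ISCSI -b -f big.bin --duration 5.0  # file I/O + checksum stack
+```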
+
+## Architecture
+
+### Command Line Interface
+
+The tool will extend the existing argument parsing to support:
+- `-b`: Enable benchmark mode
+- `--size <bytes>`: Specify data size for random generation (when no file/string provided)
+- `--duration <seconds>`: Benchmark duration as floating-point seconds (default: 10.0)
+- Existing `-a <algorithm>`: CRC algorithm (required in benchmark mode)
+- Existing `-f <file>` or `-s <string>`: Optional data source for benchmarking
+
+### Data Flow
+
+```
+User Input → Argument Parsing → Mode Detection → Benchmark Execution → Results Display
+                                      ↓
+                           [Normal Checksum Mode]
+                                      ↓
+                           [Existing Functionality]
+```
+
+In benchmark mode:
+1. Parse and validate benchmark parameters
+2. Determine data source (file, string, or generated)
+3. For string/generated data, load or generate the test data once; for file data, use the file path directly
+4. Run the benchmark loop for the specified duration using the appropriate checksum function
+5. Calculate and display results
+
+## Components and Interfaces
+
+### Enhanced Config Structure
+
+```rust
+#[derive(Debug)]
+struct Config {
+    algorithm: String,
+    file: Option<String>,
+    string: Option<String>,
+    format: OutputFormat,
+    benchmark: Option<BenchmarkConfig>,
+}
+
+#[derive(Debug)]
+struct BenchmarkConfig {
+    size: Option<u64>,
+    duration: f64,
+}
+```
+
+### Benchmark Execution Module
+
+```rust
+enum BenchmarkData {
+    InMemory(Vec<u8>),
+    File(String),
+}
+
+struct BenchmarkRunner {
+    algorithm: CrcAlgorithm,
+    data: BenchmarkData,
+    duration: f64,
+}
+
+impl BenchmarkRunner {
+    fn new(algorithm: CrcAlgorithm, data: BenchmarkData, duration: f64) -> Self
+    fn run(&self) -> BenchmarkResult
+}
+
+struct BenchmarkResult {
+    iterations: u64,
+    elapsed_seconds: f64,
+    throughput_gibs: f64,
+    time_per_iteration_nanos: f64,
+    acceleration_target: String,
+    data_size: u64,
+}
+```
+
+### Data Generation
+
+The benchmark will reuse the random data generation pattern from `benches/benchmark.rs`:
+
+```rust
+fn generate_random_data(size: usize) -> Vec<u8> {
+    use rand::RngCore; // fill_bytes() comes from the RngCore trait
+    let mut rng = rand::rng();
+    let mut buf = vec![0u8; size];
+    rng.fill_bytes(&mut buf);
+    buf
+}
+```
+
+## Data Models
+
+### Input Data Sources
+
+1. **File Input**: Use the `checksum_file()` function to benchmark the entire file I/O and checksum stack
+2. **String Input**: Use the string bytes directly with the in-memory `checksum()` function
+3. **Generated Data**: Create random data of the specified size using `rand::RngCore::fill_bytes()` and use the in-memory `checksum()` function
+
+### Benchmark Metrics
+
+- **Iterations**: Number of checksum calculations performed
+- **Elapsed Time**: Actual benchmark duration in seconds
+- **Throughput**: Calculated as `(data_size * iterations) / elapsed_time / (1024^3)` GiB/s; see the worked example below
+- **Acceleration Target**: Result from `crc_fast::get_calculator_target(algorithm)`
+
+## Error Handling
+
+### Validation Errors
+
+- Invalid algorithm names (reuse existing validation)
+- Invalid size parameters (non-positive values)
+- Invalid duration parameters (non-positive values)
+- File read errors (reuse existing error handling)
+
+### Runtime Errors
+
+- Memory allocation failures for large data sizes
+- Timer precision issues (fall back to alternative timing methods)
+
+### Error Messages
+
+All errors will follow the existing pattern of displaying the error message followed by usage information.
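+
+## Worked Example: Metric Calculation
+
+To make the Benchmark Metrics above concrete, this minimal sketch derives the reported numbers from the raw loop measurements (the function name is hypothetical; the formulas are the ones defined under Benchmark Metrics):
+
+```rust
+/// Derive throughput (GiB/s) and time per iteration (nanoseconds) from the
+/// raw measurements captured by the timing loop in Implementation Notes.
+fn compute_metrics(iterations: u64, elapsed_seconds: f64, data_size: u64) -> (f64, f64) {
+    let bytes_processed = data_size as f64 * iterations as f64;
+    // GiB/s: bytes per second divided by 1024^3
+    let throughput_gibs = bytes_processed / elapsed_seconds / (1024.0 * 1024.0 * 1024.0);
+    // Average wall-clock cost of a single checksum call
+    let time_per_iteration_nanos = elapsed_seconds * 1e9 / iterations as f64;
+    (throughput_gibs, time_per_iteration_nanos)
+}
+```
+
+For example, 467,661 iterations over 1 MiB in 10.00 seconds works out to about 45.67 GiB/s and roughly 21.4 μs per iteration, matching the sample output under Implementation Notes.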
+
+## Testing Strategy
+
+### Unit Tests
+
+- Argument parsing validation for benchmark flags
+- BenchmarkConfig creation and validation
+- Data generation with various sizes
+- Throughput calculation accuracy
+
+### Integration Tests
+
+- End-to-end benchmark execution with different algorithms
+- File and string input handling in benchmark mode
+- Error handling for invalid parameters
+- Backward compatibility verification
+
+### Performance Validation
+
+- Verify benchmark results are reasonable (within expected ranges)
+- Compare with existing `benches/benchmark.rs` results for consistency
+- Test with various data sizes to ensure linear scaling
+
+## Implementation Notes
+
+### Timing Mechanism
+
+Use `std::time::Instant` for high-precision timing, with different approaches for different data sources:
+
+```rust
+let start = std::time::Instant::now();
+let mut iterations = 0u64;
+
+while start.elapsed().as_secs_f64() < duration {
+    match &self.data {
+        BenchmarkData::InMemory(data) => {
+            std::hint::black_box(checksum(algorithm, data));
+        }
+        BenchmarkData::File(filename) => {
+            std::hint::black_box(checksum_file(algorithm, filename, None).unwrap());
+        }
+    }
+    iterations += 1;
+}
+
+let elapsed = start.elapsed().as_secs_f64();
+```
+
+### Memory Considerations
+
+- Pre-allocate test data once before the benchmark loop
+- Use `std::hint::black_box()` to prevent compiler optimizations
+- Consider memory alignment for optimal performance (optional enhancement)
+
+### Output Format
+
+```
+Algorithm: CRC-32/ISCSI
+Acceleration Target: aarch64-neon-sha3
+Data Size: 1,048,576 bytes (1.0 MiB)
+Duration: 10.00 seconds
+Iterations: 467,661
+Throughput: 45.67 GiB/s
+Time per iteration: 21.4 μs
+```
+
+### Default Values
+
+- **Size**: 1,048,576 bytes (1 MiB)
+- **Duration**: 10.0 seconds
+- **Algorithm**: Must be specified via `-a` flag (no default)
\ No newline at end of file
diff --git a/.kiro/specs/checksum-benchmark-option/requirements.md b/.kiro/specs/checksum-benchmark-option/requirements.md
new file mode 100644
index 0000000..4da3195
--- /dev/null
+++ b/.kiro/specs/checksum-benchmark-option/requirements.md
@@ -0,0 +1,52 @@
+# Requirements Document
+
+## Introduction
+
+This feature adds a simple benchmark option to the existing `bin/checksum.rs` tool via a command-line flag. The benchmark will allow users to test performance across different platforms using a single binary, reporting throughput in GiB/s and the acceleration target used. This enables cross-platform performance comparison without requiring a full development environment checkout.
+
+## Glossary
+
+- **Checksum_Tool**: The existing `bin/checksum.rs` binary application
+- **Benchmark_Mode**: A new operational mode that measures and reports performance metrics
+- **Acceleration_Target**: The hardware-specific optimization path returned by `get_calculator_target()`
+- **Throughput_Metric**: Performance measurement expressed in GiB/s (gibibytes per second)
+- **Test_Data**: Randomly generated byte array used for benchmark measurements
+- **Black_Box**: Rust's `std::hint::black_box()` function that prevents compiler optimizations during benchmarking
+
+## Requirements
+
+### Requirement 1
+
+**User Story:** As a developer, I want to run performance benchmarks from the checksum tool, so that I can compare CRC performance across different hardware platforms without setting up a full development environment.
+
+#### Acceptance Criteria
+
+1. WHEN the user provides a `-b` flag, THE Checksum_Tool SHALL enter Benchmark_Mode
+2.
WHILE in Benchmark_Mode, THE Checksum_Tool SHALL generate Test_Data of the specified size once and reuse it for all iterations +3. THE Checksum_Tool SHALL report Throughput_Metric in GiB/s format +4. THE Checksum_Tool SHALL display the Acceleration_Target used for the benchmark +5. THE Checksum_Tool SHALL use Black_Box to prevent compiler optimizations during measurement + +### Requirement 2 + +**User Story:** As a developer, I want to specify benchmark parameters, so that I can control the test conditions for consistent cross-platform comparisons. + +#### Acceptance Criteria + +1. WHEN the user provides `-a` parameter with `-b` flag, THE Checksum_Tool SHALL use the specified CRC algorithm for benchmarking +2. WHEN the user provides `--size` parameter, THE Checksum_Tool SHALL generate Test_Data of the specified byte size +3. WHEN the user provides `--duration` parameter, THE Checksum_Tool SHALL run the benchmark for the specified number of seconds +4. WHERE no benchmark parameters are provided, THE Checksum_Tool SHALL use default values of 1 MiB for size and 10 seconds for duration +5. THE Checksum_Tool SHALL validate all benchmark parameter values before starting the benchmark + +### Requirement 3 + +**User Story:** As a developer, I want the benchmark to support both file/string input and generated data, so that I can benchmark with specific data or use random data for consistent testing. + +#### Acceptance Criteria + +1. WHEN the user provides `-b` with `-f` or `-s` flags, THE Checksum_Tool SHALL use the file or string content as Test_Data for benchmarking +2. WHEN the user provides `-b` with `--size` parameter but no `-f` or `-s` flags, THE Checksum_Tool SHALL generate random Test_Data of the specified size +3. IF the user provides `-b` without any data source or size specification, THEN THE Checksum_Tool SHALL generate random Test_Data using the default size +4. THE Checksum_Tool SHALL display appropriate usage information when benchmark parameters are invalid +5. THE Checksum_Tool SHALL maintain backward compatibility with existing checksum functionality \ No newline at end of file diff --git a/.kiro/specs/checksum-benchmark-option/tasks.md b/.kiro/specs/checksum-benchmark-option/tasks.md new file mode 100644 index 0000000..492dd08 --- /dev/null +++ b/.kiro/specs/checksum-benchmark-option/tasks.md @@ -0,0 +1,49 @@ +# Implementation Plan + +- [x] 1. Extend command line argument parsing for benchmark options + - Add `-b` flag to enable benchmark mode in the argument parser + - Add `--size` parameter for specifying random data size + - Add `--duration` parameter for benchmark duration (floating-point seconds) + - Update the `Config` struct to include optional `BenchmarkConfig` + - Update usage/help text to include new benchmark options + - _Requirements: 1.1, 2.1, 2.2, 2.3, 2.4_ + +- [x] 2. Implement benchmark data structures and validation + - Create `BenchmarkConfig` struct with size and duration fields + - Create `BenchmarkData` enum to handle in-memory vs file data sources + - Create `BenchmarkRunner` struct with algorithm, data, and duration + - Create `BenchmarkResult` struct with all metrics including time per iteration + - Add validation logic for benchmark parameters (positive values) + - _Requirements: 2.5, 3.4_ + +- [x] 3. 
Implement benchmark execution logic + - Create benchmark runner with timing loop using `std::time::Instant` + - Implement separate execution paths for in-memory data vs file data + - Use `std::hint::black_box()` to prevent compiler optimizations + - Calculate throughput in GiB/s and time per iteration with appropriate units + - Integrate `get_calculator_target()` for acceleration target reporting + - _Requirements: 1.2, 1.3, 1.4, 1.5_ + +- [x] 4. Implement data source handling + - Add random data generation function using `rand::RngCore::fill_bytes()` + - Implement logic to determine data source (file, string, or generated) + - Handle file size detection for throughput calculations + - Create `BenchmarkData` instances based on user input + - _Requirements: 3.1, 3.2, 3.3_ + +- [x] 5. Integrate benchmark mode into main application flow + - Modify main function to detect benchmark mode and route accordingly + - Ensure mutual exclusivity validation between benchmark and normal modes + - Add benchmark result formatting and display + - Update error handling to include benchmark-specific errors + - Maintain backward compatibility with existing functionality + - _Requirements: 3.4, 3.5_ + +- [x] 6. Add comprehensive testing for benchmark functionality + - Write unit tests for argument parsing with benchmark flags + - Test benchmark parameter validation (invalid sizes, durations) + - Test data source selection logic (file vs string vs generated) + - Test benchmark execution with different algorithms + - Verify throughput calculation accuracy + - Test error handling for invalid benchmark configurations + - _Requirements: All requirements_ \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index f6ff87c..e48ecb2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "aho-corasick" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" dependencies = [ "memchr", ] @@ -19,9 +19,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.6.20" +version = "0.6.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", "anstyle-parse", @@ -34,9 +34,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.11" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" [[package]] name = "anstyle-parse" @@ -53,7 +53,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" dependencies = [ - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -64,7 +64,7 @@ checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -75,9 +75,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "bitflags" -version = "2.9.3" +version = "2.10.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34efbcccd345379ca2868b2b2c9d3782e9cc58ba87bc7d79d5b53d9c9ae6f25d" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" [[package]] name = "block-buffer" @@ -102,9 +102,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cbindgen" -version = "0.29.0" +version = "0.29.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "975982cdb7ad6a142be15bdf84aea7ec6a9e5d4d797c004d43185b24cfe4e684" +checksum = "befbfd072a8e81c02f8c507aefce431fe5e7d051f83d48a23ffc9b9fe5a11799" dependencies = [ "clap", "heck", @@ -121,9 +121,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "ciborium" @@ -154,18 +154,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.46" +version = "4.5.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c5e4fcf9c21d2e544ca1ee9d8552de13019a42aa7dbf32747fa7aaf1df76e57" +checksum = "4c26d721170e0295f191a69bd9a1f93efcdb0aff38684b61ab5750468972e5f5" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.46" +version = "4.5.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fecb53a0e6fcfb055f686001bc2e2592fa527efaf38dbe81a6a9563562e57d41" +checksum = "75835f0c7bf681bfd05abe44e965760fea999a5286c6eb2d59883634fd02011a" dependencies = [ "anstream", "anstyle", @@ -175,9 +175,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" +checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "colorchoice" @@ -208,6 +208,7 @@ dependencies = [ "crc", "criterion", "digest", + "indexmap", "rand", "regex", "rustversion", @@ -311,12 +312,12 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -327,9 +328,9 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "generic-array" -version = "0.14.7" +version = "0.14.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2" dependencies = [ "typenum", "version_check", @@ -337,31 +338,32 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", "r-efi", - "wasi", + "wasip2", ] [[package]] name = "half" 
-version = "2.6.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", + "zerocopy", ] [[package]] name = "hashbrown" -version = "0.15.5" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" [[package]] name = "heck" @@ -371,9 +373,9 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "indexmap" -version = "2.11.0" +version = "2.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2481980430f9f78649238835720ddccc57e52df14ffce1c6f37391d61b563e9" +checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" dependencies = [ "equivalent", "hashbrown", @@ -381,9 +383,9 @@ dependencies = [ [[package]] name = "is_terminal_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" @@ -402,9 +404,9 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "js-sys" -version = "0.3.77" +version = "0.3.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" dependencies = [ "once_cell", "wasm-bindgen", @@ -412,27 +414,27 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.175" +version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" [[package]] name = "linux-raw-sys" -version = "0.9.4" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "log" -version = "0.4.27" +version = "0.4.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" [[package]] name = "memchr" -version = "2.7.5" +version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" [[package]] name = "num-traits" @@ -451,9 +453,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "once_cell_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "oorandom" @@ 
-500,18 +502,18 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.101" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.40" +version = "1.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" dependencies = [ "proc-macro2", ] @@ -573,9 +575,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.2" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -585,9 +587,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.10" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -596,21 +598,21 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.6" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "rustix" -version = "1.0.8" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ "bitflags", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -636,18 +638,28 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.219" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -656,23 +668,24 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.143" +version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a" +checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" dependencies = [ "itoa", "memchr", "ryu", "serde", + "serde_core", ] 
[[package]] name = "serde_spanned" -version = "0.6.9" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +checksum = "e24345aa0fe688594e73770a5f6d1b216508b4f93484c0026d521acd30134392" dependencies = [ - "serde", + "serde_core", ] [[package]] @@ -683,9 +696,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.106" +version = "2.0.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" dependencies = [ "proc-macro2", "quote", @@ -694,15 +707,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.21.0" +version = "3.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ "fastrand", "getrandom", "once_cell", "rustix", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -717,56 +730,54 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.23" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +checksum = "f0dc8b1fb61449e27716ec0e1bdf0f6b8f3e8f6b05391e8497b8b6d7804ea6d8" dependencies = [ - "serde", + "indexmap", + "serde_core", "serde_spanned", "toml_datetime", - "toml_edit", + "toml_parser", + "toml_writer", + "winnow", ] [[package]] name = "toml_datetime" -version = "0.6.11" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +checksum = "f2cdb639ebbc97961c51720f858597f7f24c4fc295327923af55b74c3c724533" dependencies = [ - "serde", + "serde_core", ] [[package]] -name = "toml_edit" -version = "0.22.27" +name = "toml_parser" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +checksum = "c0cbe268d35bdb4bb5a56a2de88d0ad0eb70af5384a99d648cd4b3d04039800e" dependencies = [ - "indexmap", - "serde", - "serde_spanned", - "toml_datetime", - "toml_write", "winnow", ] [[package]] -name = "toml_write" -version = "0.1.2" +name = "toml_writer" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" +checksum = "df8b2b54733674ad286d16267dcfc7a71ed5c776e4ac7aa3c3e2561f7c637bf2" [[package]] name = "typenum" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" [[package]] name = "unicode-ident" -version = "1.0.18" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" [[package]] name = "utf8parse" @@ -791,45 +802,32 @@ dependencies = [ ] [[package]] -name = "wasi" -version = "0.14.3+wasi-0.2.4" +name = "wasip2" +version = 
"1.0.1+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a51ae83037bdd272a9e28ce236db8c07016dd0d50c27038b3f407533c030c95" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" dependencies = [ "wit-bindgen", ] [[package]] name = "wasm-bindgen" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60" dependencies = [ "cfg-if", "once_cell", "rustversion", "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" -dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -837,31 +835,31 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc" dependencies = [ + "bumpalo", "proc-macro2", "quote", "syn", - "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76" dependencies = [ "unicode-ident", ] [[package]] name = "web-sys" -version = "0.3.77" +version = "0.3.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +checksum = "3a1f95c0d03a47f4ae1f7a64643a6bb97465d9b740f0fa8f90ea33915c99a9a1" dependencies = [ "js-sys", "wasm-bindgen", @@ -869,18 +867,18 @@ dependencies = [ [[package]] name = "winapi-util" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0978bf7171b3d90bac376700cb56d606feb40f251a475a5d6634613564460b22" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys", + "windows-sys 0.61.2", ] [[package]] name = "windows-link" -version = "0.1.3" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" [[package]] name = "windows-sys" @@ -891,11 +889,20 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-targets" -version = "0.53.3" 
+version = "0.53.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ "windows-link", "windows_aarch64_gnullvm", @@ -910,81 +917,78 @@ dependencies = [ [[package]] name = "windows_aarch64_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" [[package]] name = "windows_aarch64_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" [[package]] name = "windows_i686_gnu" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" [[package]] name = "windows_i686_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" [[package]] name = "windows_i686_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" [[package]] name = "windows_x86_64_gnu" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" [[package]] name = "windows_x86_64_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" [[package]] name = "windows_x86_64_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" -dependencies = [ - "memchr", -] [[package]] name = "wit-bindgen" -version = "0.45.0" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "052283831dbae3d879dc7f51f3d92703a316ca49f91540417d38591826127814" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" [[package]] name = "zerocopy" -version = "0.8.26" +version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" +checksum = 
"0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.26" +version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" +checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 7108c11..128aef2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,9 +22,12 @@ bench = true crc = "3" digest = { version = "0.10", features = ["alloc"] } rand = "0.9" -regex = "1.11" +regex = "1.12" rustversion = "1.0" +# constrain indexmap (transitive) to a version compatible with Rust 1.81.0 +indexmap = { version = ">=2.11.0, <2.12.0", optional = true } + [dev-dependencies] criterion = "0.7" cbindgen = "0.29" diff --git a/README.md b/README.md index 368c157..75e9499 100644 --- a/README.md +++ b/README.md @@ -339,73 +339,62 @@ This is a summary of the performance for the most important and popular CRC chec AKA `crc32c` in many, but not all, implementations. -| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | -|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~49 | ~111 | -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~18 | ~52 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~23 | ~54 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~20 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~19 | ~39 | -| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~17 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~49 | ~99 | -| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~56 | ~94 | +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~52 | ~111 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~24 | ~54 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~21 | ~53 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~11 | ~17 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~49 | ~99 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~56 | ~94 | ### CRC-32/ISO-HDLC (reflected) AKA `crc32` in many, but not all, implementations. 
-| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
-|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:|
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-248xl | avx512-vpclmulqdq | ~24 | ~110 |
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-248xl | sse-pclmulqdq | ~21 | ~28 |
-| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~24 | ~55 |
-| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~12 | ~14 |
-| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~19 | ~39 |
-| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~17 |
-| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~48 | ~98 |
-| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~56 | ~94 |
+| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
+|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:|
+| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~20 | ~88 |
+| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~15 | ~55 |
+| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~21 | ~53 |
+| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~11 | ~17 |
+| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~48 | ~98 |
+| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~56 | ~94 |
 
 ### CRC-64/NVME (reflected)
 
 [AWS S3's recommended checksum option](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html)
 
-| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
-|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:|
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~25 | ~110 |
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~21 | ~28 |
-| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~25 | ~55 |
-| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~14 |
-| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~20 | ~37 |
-| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~16 |
-| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~50 | ~72 |
-| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~52 | ~72 |
+| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
+|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:|
+| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~21 | ~110 |
+| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~16 | ~55 |
+| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~21 | ~41 |
+| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~11 | ~16 |
+| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~50 | ~72 |
+| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~52 | ~72 |
 
 ### CRC-32/BZIP2 (forward)
 
-| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
-|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:|
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl |
avx512-vpclmulqdq | ~23 | ~56 | -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~19 | ~28 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~21 | ~43 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~13 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~16 | ~32 | -| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~9 | ~14 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~41 | ~59 | -| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~47 | ~64 | +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~20 | ~56 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~14 | ~43 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~18 | ~40 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~9 | ~14 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~41 | ~59 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~47 | ~64 | ### CRC-64/ECMA-182 (forward) -| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | -|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~24 | ~56 | -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~19 | ~28 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~21 | ~43 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~13 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~18 | ~31 | -| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~9 | ~14 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~40 | ~59 | -| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~46 | ~61 | - +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~21 | ~56 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~14 | ~43 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~19 | ~40 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~9 | ~14 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~40 | ~59 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~46 | ~61 | ## Other CRC widths @@ -422,6 +411,7 @@ PRs welcome! ## References +* [Catalogue of parametrised CRC algorithms](https://reveng.sourceforge.io/crc-catalogue/all.htm) * [crc32-fast](https://crates.io/crates/crc32fast) Original `CRC-32/ISO-HDLC` (`crc32`) implementation in `Rust`. * [crc64-fast](https://github.com/tikv/crc64fast) Original `CRC-64/XZ` implementation in `Rust`. * [crc64fast-nvme](https://github.com/awesomized/crc64fast-nvme) Original `CRC-64/NVME` implementation in `Rust`. 
diff --git a/benches/benchmark.rs b/benches/benchmark.rs index ecec9d8..82ee13a 100644 --- a/benches/benchmark.rs +++ b/benches/benchmark.rs @@ -31,11 +31,12 @@ pub const SIZES: &[(&str, i32); 2] = &[ ]; // these are the most important algorithms in popular use, with forward/reflected coverage -pub const CRC32_ALGORITHMS: &[CrcAlgorithm; 3] = &[ +pub const CRC32_ALGORITHMS: &[CrcAlgorithm; 4] = &[ // benchmark both CRC-32/ISCSI and CRC-32/ISO-HDLC since they're special flowers with lots of // different acceleration targets. - CrcAlgorithm::Crc32Iscsi, // reflected - CrcAlgorithm::Crc32IsoHdlc, // reflected + CrcAlgorithm::Crc32Autosar, // reflected + CrcAlgorithm::Crc32Iscsi, // reflected, fusion + CrcAlgorithm::Crc32IsoHdlc, // reflected, fusion CrcAlgorithm::Crc32Bzip2, // forward ]; diff --git a/src/algorithm.rs b/src/algorithm.rs index fe1459b..e3a83b9 100644 --- a/src/algorithm.rs +++ b/src/algorithm.rs @@ -53,12 +53,7 @@ fn extract_keys_array(params: CrcParams) -> [u64; 23] { } /// Main entry point that works for both CRC-32 and CRC-64 -#[inline] -#[cfg_attr( - any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse3,sse4.1,pclmulqdq") -)] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] +#[inline(always)] pub unsafe fn update( state: W::Value, bytes: &[u8], diff --git a/src/arch/aarch64.rs b/src/arch/aarch64/aes.rs similarity index 86% rename from src/arch/aarch64.rs rename to src/arch/aarch64/aes.rs index 884ed82..8b9abb7 100644 --- a/src/arch/aarch64.rs +++ b/src/arch/aarch64/aes.rs @@ -1,16 +1,22 @@ // Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. -//! This module provides AArch64-specific implementations of the ArchOps trait. +//! This module provides AArch64-specific implementations of the ArchOps trait for architectures +//! with AES support. 
#![cfg(target_arch = "aarch64")] use crate::traits::ArchOps; use std::arch::aarch64::*; +// Tier-specific ArchOps implementations for AArch64 + +/// Base AArch64 implementation with AES+NEON optimizations +/// NEON is implicit with AES support on AArch64 #[derive(Debug, Copy, Clone)] -pub struct AArch64Ops; +pub struct Aarch64AesOps; -impl ArchOps for AArch64Ops { +// Base implementation for AArch64 AES tier +impl ArchOps for Aarch64AesOps { type Vector = uint8x16_t; #[inline] @@ -56,7 +62,6 @@ impl ArchOps for AArch64Ops { #[target_feature(enable = "neon")] unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { let x = vreinterpretq_u64_u8(vector); - [vgetq_lane_u64(x, 0), vgetq_lane_u64(x, 1)] } @@ -64,7 +69,6 @@ impl ArchOps for AArch64Ops { #[target_feature(enable = "neon")] unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { let x = vreinterpretq_p64_u8(vector); - [vgetq_lane_p64(x, 0), vgetq_lane_p64(x, 1)] } @@ -89,7 +93,6 @@ impl ArchOps for AArch64Ops { #[inline] #[target_feature(enable = "neon")] unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector { - // AArch64 uses vqtbl1q_u8 for byte shuffle vqtbl1q_u8(data, mask) } @@ -101,16 +104,13 @@ impl ArchOps for AArch64Ops { b: Self::Vector, mask: Self::Vector, ) -> Self::Vector { - // AArch64 needs explicit MSB mask creation and uses vbslq_u8 let msb_mask = vcltq_s8(vreinterpretq_s8_u8(mask), vdupq_n_s8(0)); - vbslq_u8(msb_mask, b, a) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 with 0 for shifting vextq_u8(vdupq_n_u8(0), vector, 8) } @@ -123,7 +123,6 @@ impl ArchOps for AArch64Ops { #[inline] #[target_feature(enable = "neon")] unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector { - // Create a mask based on MSB for AArch64 vcltq_s8(vreinterpretq_s8_u8(vector), vdupq_n_s8(0)) } @@ -136,14 +135,12 @@ impl ArchOps for AArch64Ops { #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 4) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 with 0 for shifting vextq_u8(vdupq_n_u8(0), vector, 12) // 16-4=12 } @@ -158,56 +155,48 @@ impl ArchOps for AArch64Ops { // For high=false, place in the low 32 bits of the low 64 bits result = vreinterpretq_u64_u32(vsetq_lane_u32(value, vreinterpretq_u32_u64(result), 0)); } - vreinterpretq_u8_u64(result) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 with 0 for shifting left vextq_u8(vdupq_n_u8(0), vector, 12) // 16-4=12 } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting right vextq_u8(vector, vdupq_n_u8(0), 4) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 8) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_5(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 5) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_6(&self, vector: Self::Vector) -> 
Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 6) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 7) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 12) } @@ -216,7 +205,6 @@ impl ArchOps for AArch64Ops { unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { let low_32 = vgetq_lane_u32(vreinterpretq_u32_u8(vector), 0); let result = vsetq_lane_u32(low_32, vdupq_n_u32(0), 3); - vreinterpretq_u8_u32(result) } @@ -232,7 +220,6 @@ impl ArchOps for AArch64Ops { #[inline] #[target_feature(enable = "aes")] unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { - // Low 64 bits of a, high 64 bits of b let a_low = vgetq_lane_p64(vreinterpretq_p64_u8(a), 1); let b_high = vgetq_lane_p64(vreinterpretq_p64_u8(b), 0); vreinterpretq_u8_p128(vmull_p64(a_low, b_high)) @@ -257,19 +244,6 @@ impl ArchOps for AArch64Ops { } #[inline] - #[cfg(target_feature = "sha3")] - #[target_feature(enable = "sha3")] - unsafe fn xor3_vectors( - &self, - a: Self::Vector, - b: Self::Vector, - c: Self::Vector, - ) -> Self::Vector { - veor3q_u8(a, b, c) - } - - #[inline] - #[cfg(not(target_feature = "sha3"))] #[target_feature(enable = "neon")] unsafe fn xor3_vectors( &self, @@ -277,13 +251,11 @@ impl ArchOps for AArch64Ops { b: Self::Vector, c: Self::Vector, ) -> Self::Vector { - // Fallback for when SHA3 is not available veorq_u8(veorq_u8(a, b), c) } } -impl AArch64Ops { - // Helper methods specific to AArch64 +impl Aarch64AesOps { #[inline] #[target_feature(enable = "neon")] unsafe fn load_key_pair(&self, idx1: u64, idx2: u64) -> uint8x16_t { diff --git a/src/arch/aarch64/aes_sha3.rs b/src/arch/aarch64/aes_sha3.rs new file mode 100644 index 0000000..bc3d365 --- /dev/null +++ b/src/arch/aarch64/aes_sha3.rs @@ -0,0 +1,201 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides AArch64-specific implementations of the ArchOps trait for architectures +//! with AES+SHA3 support. 
+ +#![cfg(target_arch = "aarch64")] + +use crate::arch::aarch64::aes::Aarch64AesOps; +use crate::traits::ArchOps; +use std::arch::aarch64::*; + +/// AArch64 AES+SHA3 tier - delegates to AES tier and overrides XOR3 operations +/// Provides EOR3 instruction for optimal XOR3 performance +#[derive(Debug, Copy, Clone)] +pub struct Aarch64AesSha3Ops(Aarch64AesOps); + +impl Aarch64AesSha3Ops { + #[inline(always)] + pub fn new() -> Self { + Self(Aarch64AesOps) + } +} + +// SHA3 tier implementation - delegates to AES tier and overrides XOR3 +impl ArchOps for Aarch64AesSha3Ops { + type Vector = uint8x16_t; + + // Delegate methods to the base AES implementation + #[inline(always)] + unsafe fn create_vector_from_u64_pair( + &self, + high: u64, + low: u64, + reflected: bool, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair(high, low, reflected) + } + + #[inline(always)] + unsafe fn create_vector_from_u64_pair_non_reflected( + &self, + high: u64, + low: u64, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair_non_reflected(high, low) + } + + #[inline(always)] + unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { + self.0.create_vector_from_u64(value, high) + } + + #[inline(always)] + unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_u64s(vector) + } + + #[inline(always)] + unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_poly64s(vector) + } + + #[inline(always)] + unsafe fn xor_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.xor_vectors(a, b) + } + + #[inline(always)] + unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { + self.0.load_bytes(ptr) + } + + #[inline(always)] + unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { + self.0.load_aligned(ptr) + } + + #[inline(always)] + unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector { + self.0.shuffle_bytes(data, mask) + } + + #[inline(always)] + unsafe fn blend_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + mask: Self::Vector, + ) -> Self::Vector { + self.0.blend_vectors(a, b, mask) + } + + #[inline(always)] + unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_8(vector) + } + + #[inline(always)] + unsafe fn set_all_bytes(&self, value: u8) -> Self::Vector { + self.0.set_all_bytes(value) + } + + #[inline(always)] + unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector { + self.0.create_compare_mask(vector) + } + + #[inline(always)] + unsafe fn and_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.and_vectors(a, b) + } + + #[inline(always)] + unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_32(vector) + } + + #[inline(always)] + unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_32(vector) + } + + #[inline(always)] + unsafe fn create_vector_from_u32(&self, value: u32, high: bool) -> Self::Vector { + self.0.create_vector_from_u32(value, high) + } + + #[inline(always)] + unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_4(vector) + } + + #[inline(always)] + unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_4(vector) + } + + #[inline(always)] + unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_8(vector) + } + + #[inline(always)] + unsafe fn 
shift_right_5(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_5(vector) + } + + #[inline(always)] + unsafe fn shift_right_6(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_6(vector) + } + + #[inline(always)] + unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_7(vector) + } + + #[inline(always)] + unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_12(vector) + } + + #[inline(always)] + unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_12(vector) + } + + #[inline(always)] + unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_00(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_01(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_10(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_11(a, b) + } + + // Override XOR3 to use SHA3 EOR3 instruction when available + #[inline] + #[target_feature(enable = "sha3")] + unsafe fn xor3_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + c: Self::Vector, + ) -> Self::Vector { + // SHA3 tier always uses EOR3 instruction + // Feature detection is handled at the dispatch level + veor3q_u8(a, b, c) + } +} diff --git a/src/arch/aarch64/mod.rs b/src/arch/aarch64/mod.rs new file mode 100644 index 0000000..4f04674 --- /dev/null +++ b/src/arch/aarch64/mod.rs @@ -0,0 +1,8 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides AArch64-specific implementations of the ArchOps trait. + +#![cfg(target_arch = "aarch64")] + +pub mod aes; +pub mod aes_sha3; diff --git a/src/arch/mod.rs b/src/arch/mod.rs index 669ba2c..aef965d 100644 --- a/src/arch/mod.rs +++ b/src/arch/mod.rs @@ -3,167 +3,173 @@ //! This module provides the main entry point for the SIMD CRC calculation. //! //! It dispatches to the appropriate architecture-specific implementation -//! based on the target architecture. 
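A note on the new shape of this module (commentary, not part of the diff): instead of probing CPU features on every call, the dispatch below matches on a cached tier from the new src/feature_detection module, which is not included in this diff. A minimal, self-contained sketch of that detect-once-then-dispatch pattern, with hypothetical names standing in for the crate's ArchOpsInstance/get_arch_ops:

```rust
// Hypothetical sketch only; the real tiers and names live in the crate's
// src/feature_detection module, which this diff does not show.
use std::sync::OnceLock;

#[derive(Debug, Clone, Copy)]
#[allow(dead_code)]
enum Tier {
    Simd,     // stands in for the per-arch accelerated tiers
    Software, // table-based fallback
}

fn detect() -> Tier {
    #[cfg(target_arch = "aarch64")]
    if std::arch::is_aarch64_feature_detected!("aes") {
        return Tier::Simd;
    }
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    if is_x86_feature_detected!("sse4.1") && is_x86_feature_detected!("pclmulqdq") {
        return Tier::Simd;
    }
    Tier::Software
}

fn tier() -> &'static Tier {
    static TIER: OnceLock<Tier> = OnceLock::new();
    TIER.get_or_init(detect) // feature probing happens exactly once
}

fn main() {
    match tier() {
        Tier::Simd => println!("carryless-multiply path"),
        Tier::Software => println!("software fallback"),
    }
}
```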
#[cfg(target_arch = "aarch64")] use std::arch::is_aarch64_feature_detected; -#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] -use crate::algorithm; - use crate::CrcParams; -#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] -use crate::structs::{Width32, Width64}; - #[cfg(target_arch = "aarch64")] -use aarch64::AArch64Ops; +use crate::arch::aarch64::aes::Aarch64AesOps; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use x86::X86Ops; +#[cfg(target_arch = "aarch64")] +use crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops; -#[rustversion::since(1.89)] -#[cfg(target_arch = "x86_64")] -use vpclmulqdq::Vpclmulqdq512Ops; +#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] +use crate::{ + algorithm, + structs::{Width32, Width64}, +}; -mod aarch64; -mod software; -mod vpclmulqdq; -mod x86; +pub mod aarch64; +pub mod software; +pub mod x86; +pub mod x86_64; /// Main entry point that dispatches to the appropriate architecture /// -/// /// # Safety /// May use native CPU features -#[inline] +#[inline(always)] #[cfg(target_arch = "aarch64")] -#[target_feature(enable = "aes")] pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - let ops = AArch64Ops; + use crate::feature_detection::{get_arch_ops, ArchOpsInstance}; + + match get_arch_ops() { + ArchOpsInstance::Aarch64AesSha3(ops) => update_aarch64_aes_sha3(state, bytes, params, *ops), + ArchOpsInstance::Aarch64Aes(ops) => update_aarch64_aes(state, bytes, params, *ops), + ArchOpsInstance::SoftwareFallback => { + if !is_aarch64_feature_detected!("aes") || !is_aarch64_feature_detected!("neon") { + #[cfg(any(not(target_feature = "aes"), not(target_feature = "neon")))] + { + // Use software implementation when no SIMD support is available + return crate::arch::software::update(state, bytes, params); + } + } - match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => algorithm::update::(state as u32, bytes, params, &ops) as u64, - _ => panic!("Unsupported CRC width: {}", params.width), + // This should likely never happen, but just in case + panic!("aarch64 features missing (NEON and/or AES)"); + } } } -#[rustversion::before(1.89)] -#[inline] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] -pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - update_x86_sse(state, bytes, params) -} - -#[rustversion::since(1.89)] #[inline] -#[cfg(target_arch = "x86")] -#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] -pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - update_x86_sse(state, bytes, params) +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "aes")] +unsafe fn update_aarch64_aes( + state: u64, + bytes: &[u8], + params: CrcParams, + ops: Aarch64AesOps, +) -> u64 { + match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, &ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, &ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + } } -#[rustversion::since(1.89)] #[inline] -#[cfg(target_arch = "x86_64")] -#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] -pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - use std::arch::is_x86_feature_detected; - - if bytes.len() >= 256 - && is_x86_feature_detected!("vpclmulqdq") - && is_x86_feature_detected!("avx512f") - && 
is_x86_feature_detected!("avx512vl") - { - let ops = Vpclmulqdq512Ops::new(); - - return match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => algorithm::update::(state as u32, bytes, params, &ops) - as u64, - _ => panic!("Unsupported CRC width: {}", params.width), - }; +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "aes,sha3")] +unsafe fn update_aarch64_aes_sha3( + state: u64, + bytes: &[u8], + params: CrcParams, + ops: Aarch64AesSha3Ops, +) -> u64 { + match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, &ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, &ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), } - - // fallback to the standard x86 SSE implementation - update_x86_sse(state, bytes, params) } -#[inline] -#[cfg(all( - not(target_arch = "x86"), - not(target_arch = "x86_64"), - not(target_arch = "aarch64") -))] +/// Main entry point for x86/x86_64 (Rust 1.89+ which supports AVX-512) +/// +/// # Safety +/// May use native CPU features +#[rustversion::since(1.89)] +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - software::update(state, bytes, params) -} + use crate::feature_detection::{get_arch_ops, ArchOpsInstance}; -#[inline] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] -unsafe fn update_x86_sse(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - let ops = X86Ops; + match get_arch_ops() { + #[cfg(target_arch = "x86_64")] + ArchOpsInstance::X86_64Avx512Vpclmulqdq(ops) => match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + }, + #[cfg(target_arch = "x86_64")] + ArchOpsInstance::X86_64Avx512Pclmulqdq(ops) => match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + }, + ArchOpsInstance::X86SsePclmulqdq(ops) => match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + }, + ArchOpsInstance::SoftwareFallback => { + #[cfg(target_arch = "x86")] + crate::arch::x86_software_update(state, bytes, params); - match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => algorithm::update::(state as u32, bytes, params, &ops) as u64, - _ => panic!("Unsupported CRC width: {}", params.width), + // This should never happen, but just in case + panic!("x86 features missing (SSE4.1 && PCLMULQDQ)"); + } } } +/// Main entry point for x86/x86_64 (Rust < 1.89 with no AVX-512 support) +/// +/// # Safety +/// May use native CPU features #[rustversion::before(1.89)] -pub fn get_target() -> String { - #[cfg(target_arch = "aarch64")] - { - if is_aarch64_feature_detected!("sha3") { - return "aarch64-neon-eor3-pclmulqdq".to_string(); - } +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + use crate::feature_detection::{get_arch_ops, ArchOpsInstance}; - 
"aarch64-neon-pclmulqdq".to_string() + match get_arch_ops() { + ArchOpsInstance::X86SsePclmulqdq(ops) => match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + }, + ArchOpsInstance::SoftwareFallback => x86_software_update(state, bytes, params), } - - #[allow(unreachable_code)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - return "x86-sse-pclmulqdq".to_string(); - - #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))] - return "software-fallback-tables".to_string(); } -#[rustversion::since(1.89)] -pub fn get_target() -> String { - #[cfg(target_arch = "aarch64")] - { - if is_aarch64_feature_detected!("sha3") { - return "aarch64-neon-eor3-pclmulqdq".to_string(); - } - - "aarch64-neon-pclmulqdq".to_string() - } - - #[cfg(target_arch = "x86_64")] - { - if is_x86_feature_detected!("vpclmulqdq") - && is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("avx512vl") +#[inline(always)] +#[allow(unused)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn x86_software_update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + if !is_x86_feature_detected!("sse4.1") || !is_x86_feature_detected!("pclmulqdq") { + #[cfg(all( + target_arch = "x86", + any(not(target_feature = "sse4.1"), not(target_feature = "pclmulqdq")) + ))] { - return "x86_64-avx512-vpclmulqdq".to_string(); - } - - if is_x86_feature_detected!("avx2") { - return "x86_64-avx2-pclmulqdq".to_string(); + // Use software implementation when no SIMD support is available + crate::arch::software::update(state, bytes, params); } } - #[allow(unreachable_code)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - return "x86-sse-pclmulqdq".to_string(); + // This should never happen, but just in case + panic!("x86 features missing (SSE4.1 && PCLMULQDQ)"); +} - #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))] - return "software-fallback-tables".to_string(); +#[inline] +#[cfg(all( + not(target_arch = "x86"), + not(target_arch = "x86_64"), + not(target_arch = "aarch64") +))] +pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + crate::arch::software::update(state, bytes, params) } #[cfg(test)] diff --git a/src/arch/software.rs b/src/arch/software.rs index 4739a1b..9cac286 100644 --- a/src/arch/software.rs +++ b/src/arch/software.rs @@ -1,66 +1,102 @@ // Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. //! This module contains a software fallback for unsupported architectures. - -#![cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] +//! +//! Software fallback is conditionally compiled based on target architecture: +//! - Always included for non-SIMD architectures (not x86/x86_64/aarch64) +//! - Included for x86 when SSE4.1/PCLMULQDQ may not be available +//! - Included for aarch64 for runtime fallback when AES is not detected +//! 
- Excluded for x86_64, where SSE4.1/PCLMULQDQ are assumed to be available at runtime (but included for testing) + +#![cfg(any( + // Non-aarch64/x86/x86_64 architectures always need software fallback + not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")), + // x86 may not have SSE4.1/PCLMULQDQ support + all(target_arch = "x86", any(not(target_feature = "sse4.1"), not(target_feature = "pclmulqdq"))), + // aarch64 needs software fallback for runtime detection when AES is not available... + // NEON doesn't guarantee AES, so for rare outlier CPUs this might not work 100%... + all(target_arch = "aarch64", not(target_feature = "aes")), + // Include for testing on all architectures + test +))] use crate::consts::CRC_64_NVME; use crate::CrcAlgorithm; use crate::CrcParams; use crc::{Algorithm, Table}; +#[allow(unused)] const RUST_CRC32_AIXM: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_AIXM); +#[allow(unused)] const RUST_CRC32_AUTOSAR: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_AUTOSAR); +#[allow(unused)] const RUST_CRC32_BASE91_D: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_BASE91_D); +#[allow(unused)] const RUST_CRC32_BZIP2: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_BZIP2); +#[allow(unused)] const RUST_CRC32_CD_ROM_EDC: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_CD_ROM_EDC); +#[allow(unused)] const RUST_CRC32_CKSUM: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_CKSUM); +#[allow(unused)] const RUST_CRC32_ISCSI: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_ISCSI); +#[allow(unused)] const RUST_CRC32_ISO_HDLC: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_ISO_HDLC); +#[allow(unused)] const RUST_CRC32_JAMCRC: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_JAMCRC); +#[allow(unused)] const RUST_CRC32_MEF: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_MEF); +#[allow(unused)] const RUST_CRC32_MPEG_2: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_MPEG_2); +#[allow(unused)] const RUST_CRC32_XFER: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_XFER); +#[allow(unused)] const RUST_CRC64_ECMA_182: crc::Crc<u64, Table<16>> = crc::Crc::<u64, Table<16>>::new(&crc::CRC_64_ECMA_182); +#[allow(unused)] const RUST_CRC64_GO_ISO: crc::Crc<u64, Table<16>> = crc::Crc::<u64, Table<16>>::new(&crc::CRC_64_GO_ISO); +#[allow(unused)] const RUST_CRC64_MS: crc::Crc<u64, Table<16>> = crc::Crc::<u64, Table<16>>::new(&crc::CRC_64_MS); +#[allow(unused)] const RUST_CRC64_NVME: crc::Crc<u64, Table<16>> = crc::Crc::<u64, Table<16>>::new(&CRC_64_NVME); +#[allow(unused)] const RUST_CRC64_REDIS: crc::Crc<u64, Table<16>> = crc::Crc::<u64, Table<16>>::new(&crc::CRC_64_REDIS); +#[allow(unused)] const RUST_CRC64_WE: crc::Crc<u64, Table<16>> = crc::Crc::<u64, Table<16>>::new(&crc::CRC_64_WE); +#[allow(unused)] const RUST_CRC64_XZ: crc::Crc<u64, Table<16>> = crc::Crc::<u64, Table<16>>::new(&crc::CRC_64_XZ); +#[allow(unused)] // Dispatch function that handles the generic case pub(crate) fn update(state: u64, data: &[u8], params: CrcParams) -> u64 { match params.width { @@ -80,7 +116,7 @@ pub(crate) fn update(state: u64, data: &[u8], params: CrcParams) -> u64 { CrcAlgorithm::Crc32Xfer => RUST_CRC32_XFER, CrcAlgorithm::Crc32Custom => { let algorithm: Algorithm<u32> = Algorithm { - width: params.width as u8, + width: params.width, poly: params.poly as u32, init: params.init as u32, refin: params.refin, @@ -110,13 +146,13 @@ pub(crate) fn update(state: u64, data: &[u8], params: CrcParams) -> u64 { CrcAlgorithm::Crc64Xz => RUST_CRC64_XZ, CrcAlgorithm::Crc64Custom => { let algorithm: Algorithm<u64> = Algorithm { - width: params.width as u8, - poly: params.poly as u64, - init: params.init as u64, + width: params.width, + poly: params.poly, + init: params.init, refin: params.refin, refout: params.refout, - xorout: params.xorout as u64, - check: params.check as u64, + xorout: params.xorout, + check: 
params.check, residue: 0x0000000000000000, // unused in this context }; diff --git a/src/arch/x86/mod.rs b/src/arch/x86/mod.rs new file mode 100644 index 0000000..bead067 --- /dev/null +++ b/src/arch/x86/mod.rs @@ -0,0 +1,7 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides x86-specific implementations of the ArchOps trait. + +#![cfg(any(target_arch = "x86", target_arch = "x86_64"))] + +pub mod sse; diff --git a/src/arch/x86.rs b/src/arch/x86/sse.rs similarity index 85% rename from src/arch/x86.rs rename to src/arch/x86/sse.rs index 08a8b4a..adaf8b8 100644 --- a/src/arch/x86.rs +++ b/src/arch/x86/sse.rs @@ -4,7 +4,7 @@ //! //! This module is designed to work with both x86 and x86_64 architectures. //! -//! It uses the SSE2 and SSE4.1 instruction sets for SIMD operations. +//! It uses the SSE4.1 instruction set for SIMD operations. #![cfg(any(target_arch = "x86", target_arch = "x86_64"))] @@ -16,10 +16,15 @@ use std::arch::x86_64::*; use crate::traits::ArchOps; +// Tier-specific ArchOps implementations for x86/x86_64 + +/// Base x86/x86_64 SSE+PCLMULQDQ implementation - baseline performance for both architectures +/// Uses SSE4.1 and PCLMULQDQ instructions #[derive(Debug, Copy, Clone)] -pub struct X86Ops; +pub struct X86SsePclmulqdqOps; -impl ArchOps for X86Ops { +// Base implementation for x86/x86_64 SSE+PCLMULQDQ tier +impl ArchOps for X86SsePclmulqdqOps { type Vector = __m128i; #[inline] @@ -52,7 +57,6 @@ impl ArchOps for X86Ops { #[inline] #[target_feature(enable = "sse4.1")] unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { - // x86 uses custom helper self.create_u64_vector(value, high) } @@ -78,21 +82,18 @@ impl ArchOps for X86Ops { #[inline] #[target_feature(enable = "sse2")] unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { - // x86 requires cast to __m128i* _mm_loadu_si128(ptr as *const __m128i) } #[inline] #[target_feature(enable = "sse2")] unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { - // x86 requires cast to __m128i* _mm_loadu_si128(ptr as *const __m128i) } #[inline] #[target_feature(enable = "ssse3")] unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector { - // x86 uses specific SSSE3 instruction _mm_shuffle_epi8(data, mask) } @@ -104,14 +105,12 @@ impl ArchOps for X86Ops { b: Self::Vector, mask: Self::Vector, ) -> Self::Vector { - // x86 has native blend that uses MSB automatically _mm_blendv_epi8(a, b, mask) } #[inline] #[target_feature(enable = "sse2")] unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { - // x86 has a dedicated shift instruction _mm_slli_si128(vector, 8) } @@ -227,23 +226,6 @@ impl ArchOps for X86Ops { _mm_clmulepi64_si128(a, b, 0x11) } - #[rustversion::since(1.89)] - #[inline] - #[target_feature(enable = "sse4.1")] - unsafe fn xor3_vectors( - &self, - a: Self::Vector, - b: Self::Vector, - c: Self::Vector, - ) -> Self::Vector { - if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { - return self.xor3_vectors_avx512(a, b, c); - } - - self.xor3_vectors_sse(a, b, c) - } - - #[rustversion::before(1.89)] #[inline] #[target_feature(enable = "sse4.1")] unsafe fn xor3_vectors( &self, a: Self::Vector, b: Self::Vector, c: Self::Vector, ) -> Self::Vector { - self.xor3_vectors_sse(a, b, c) + // SSE tier always uses two XOR operations + _mm_xor_si128(_mm_xor_si128(a, b), c) } } -impl X86Ops { - // Helper methods specific to x86/x86_64 +impl 
X86SsePclmulqdqOps { #[inline] #[target_feature(enable = "sse2")] unsafe fn set_epi64x(&self, e1: u64, e0: u64) -> __m128i { @@ -318,20 +300,4 @@ impl X86Ops { lo | (hi << 32) } } - - #[rustversion::since(1.89)] - #[inline] - #[target_feature(enable = "avx512f,avx512vl")] - unsafe fn xor3_vectors_avx512(&self, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - _mm_ternarylogic_epi64( - a, b, c, 0x96, // XOR3 - ) - } - - #[inline] - #[target_feature(enable = "sse4.1")] - unsafe fn xor3_vectors_sse(&self, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - // x86 doesn't have native XOR3 in SSE, use two XORs - _mm_xor_si128(_mm_xor_si128(a, b), c) - } } diff --git a/src/arch/x86_64/avx512.rs b/src/arch/x86_64/avx512.rs new file mode 100644 index 0000000..d26ec2f --- /dev/null +++ b/src/arch/x86_64/avx512.rs @@ -0,0 +1,220 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides x86_64-specific implementations of the ArchOps trait. +//! +//! It uses the AVX-512 instruction sets for SIMD operations. + +#![cfg(target_arch = "x86_64")] + +#[rustversion::since(1.89)] +use std::arch::x86_64::*; + +#[rustversion::since(1.89)] +use crate::arch::x86::sse::X86SsePclmulqdqOps; +#[rustversion::since(1.89)] +use crate::traits::ArchOps; + +/// x86_64-only AVX512+PCLMULQDQ tier - delegates to SSE tier and overrides XOR3 operations +/// Uses AVX512 ternary logic for XOR3 operations with PCLMULQDQ +#[rustversion::since(1.89)] +#[derive(Debug, Copy, Clone)] +pub struct X86_64Avx512PclmulqdqOps(X86SsePclmulqdqOps); + +#[rustversion::since(1.89)] +impl X86_64Avx512PclmulqdqOps { + #[inline(always)] + pub fn new() -> Self { + Self(X86SsePclmulqdqOps) + } +} + +#[rustversion::since(1.89)] +impl ArchOps for X86_64Avx512PclmulqdqOps { + type Vector = __m128i; + + // Delegate all methods to the base SSE implementation + #[inline(always)] + unsafe fn create_vector_from_u64_pair( + &self, + high: u64, + low: u64, + reflected: bool, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair(high, low, reflected) + } + + #[inline(always)] + unsafe fn create_vector_from_u64_pair_non_reflected( + &self, + high: u64, + low: u64, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair_non_reflected(high, low) + } + + #[inline(always)] + unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { + self.0.create_vector_from_u64(value, high) + } + + #[inline(always)] + unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_u64s(vector) + } + + #[inline(always)] + unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_poly64s(vector) + } + + #[inline(always)] + unsafe fn xor_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.xor_vectors(a, b) + } + + #[inline(always)] + unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { + self.0.load_bytes(ptr) + } + + #[inline(always)] + unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { + self.0.load_aligned(ptr) + } + + #[inline(always)] + unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector { + self.0.shuffle_bytes(data, mask) + } + + #[inline(always)] + unsafe fn blend_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + mask: Self::Vector, + ) -> Self::Vector { + self.0.blend_vectors(a, b, mask) + } + + #[inline(always)] + unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_8(vector) + } + + #[inline(always)] + unsafe fn 
set_all_bytes(&self, value: u8) -> Self::Vector { + self.0.set_all_bytes(value) + } + + #[inline(always)] + unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector { + self.0.create_compare_mask(vector) + } + + #[inline(always)] + unsafe fn and_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.and_vectors(a, b) + } + + #[inline(always)] + unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_32(vector) + } + + #[inline(always)] + unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_32(vector) + } + + #[inline(always)] + unsafe fn create_vector_from_u32(&self, value: u32, high: bool) -> Self::Vector { + self.0.create_vector_from_u32(value, high) + } + + #[inline(always)] + unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_4(vector) + } + + #[inline(always)] + unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_4(vector) + } + + #[inline(always)] + unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_8(vector) + } + + #[inline(always)] + unsafe fn shift_right_5(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_5(vector) + } + + #[inline(always)] + unsafe fn shift_right_6(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_6(vector) + } + + #[inline(always)] + unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_7(vector) + } + + #[inline(always)] + unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_12(vector) + } + + #[inline(always)] + unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_12(vector) + } + + #[inline(always)] + unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_00(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_01(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_10(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_11(a, b) + } + + #[rustversion::since(1.89)] + #[inline] + #[target_feature(enable = "avx512vl")] + unsafe fn xor3_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + c: Self::Vector, + ) -> Self::Vector { + // AVX512 tier always uses ternary logic + _mm_ternarylogic_epi64(a, b, c, 0x96) // XOR3 operation + } + + // Fallback for older Rust versions + #[rustversion::before(1.89)] + #[inline(always)] + unsafe fn xor3_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + c: Self::Vector, + ) -> Self::Vector { + // Rust < 1.89 doesn't have _mm_ternarylogic_epi64, fall back to SSE + self.0.xor3_vectors(a, b, c) + } +} diff --git a/src/arch/x86_64/avx512_vpclmulqdq.rs b/src/arch/x86_64/avx512_vpclmulqdq.rs new file mode 100644 index 0000000..881ca7e --- /dev/null +++ b/src/arch/x86_64/avx512_vpclmulqdq.rs @@ -0,0 +1,630 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides AVX-512 and VPCLMULQDQ-specific implementations of the ArchOps trait. +//! +//! It performs folding using 4 x ZMM registers of 512-bits each. 
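One piece of background worth noting (an aside, not part of the diff): the 0x96 immediate that accompanies every ternary-logic intrinsic in this file is simply the eight-entry truth table of a three-way XOR, indexed by the bit pattern (a << 2) | (b << 1) | c. A quick self-contained check:

```rust
// Illustrative check (not from this diff): derive the ternary-logic
// immediate for XOR3 from its truth table; it must come out to 0x96.
fn main() {
    let mut imm: u8 = 0;
    for idx in 0..8u8 {
        let (a, b, c) = ((idx >> 2) & 1, (idx >> 1) & 1, idx & 1);
        imm |= (a ^ b ^ c) << idx; // bit `idx` holds the output for inputs a, b, c
    }
    assert_eq!(imm, 0x96);
    println!("XOR3 ternary-logic immediate = {imm:#04x}");
}
```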
+ +#![cfg(target_arch = "x86_64")] + +#[rustversion::since(1.89)] +use crate::arch::x86::sse::X86SsePclmulqdqOps; + +#[rustversion::since(1.89)] +use crate::enums::Reflector; + +#[rustversion::since(1.89)] +use crate::structs::CrcState; + +#[rustversion::since(1.89)] +use crate::traits::{ArchOps, EnhancedCrcWidth}; + +#[rustversion::since(1.89)] +use std::arch::x86_64::*; + +#[rustversion::since(1.89)] +use std::ops::BitXor; + +/// Implements the ArchOps trait using AVX-512 and VPCLMULQDQ instructions on 512-bit ZMM registers. +/// Delegates to X86SsePclmulqdqOps for standard 128-bit operations +#[rustversion::since(1.89)] +#[derive(Debug, Copy, Clone)] +pub struct X86_64Avx512VpclmulqdqOps(X86SsePclmulqdqOps); + +#[rustversion::since(1.89)] +impl X86_64Avx512VpclmulqdqOps { + #[inline(always)] + pub fn new() -> Self { + Self(X86SsePclmulqdqOps) + } +} + +// Wrapper for __m512i to make it easier to work with +#[rustversion::since(1.89)] +#[derive(Debug, Copy, Clone)] +struct Simd512(__m512i); + +#[rustversion::since(1.89)] +impl Simd512 { + #[inline] + #[target_feature(enable = "avx512f")] + #[allow(clippy::too_many_arguments)] + unsafe fn new(x7: u64, x6: u64, x5: u64, x4: u64, x3: u64, x2: u64, x1: u64, x0: u64) -> Self { + Self(_mm512_set_epi64( + x7 as i64, x6 as i64, x5 as i64, x4 as i64, x3 as i64, x2 as i64, x1 as i64, x0 as i64, + )) + } + + #[inline] + #[target_feature(enable = "avx512vl,vpclmulqdq")] + unsafe fn fold_64(&self, coeff: &Self, new_data: &Self) -> Self { + // Use 512-bit ternary logic XOR3 with carryless multiplication + Self(_mm512_ternarylogic_epi64( + _mm512_clmulepi64_epi128(self.0, coeff.0, 0), // Low parts + _mm512_clmulepi64_epi128(self.0, coeff.0, 17), // High parts + new_data.0, + 0x96, // XOR3 operation + )) + } + + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn extract_u64s(&self) -> [u64; 8] { + let mut result = [0u64; 8]; + _mm512_storeu_si512(result.as_mut_ptr().cast(), self.0); + + result + } + + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn load_from_ptr(ptr: *const u8) -> Self { + Self(_mm512_loadu_si512(ptr as *const __m512i)) + } + + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn to_128i_extract<const INDEX: i32>(self) -> __m128i { + _mm512_extracti32x4_epi32(self.0, INDEX) + } + + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn xor(&self, other: &Self) -> Self { + Self(_mm512_xor_si512(self.0, other.0)) + } + + #[inline] + #[target_feature(enable = "avx512f")] + #[allow(unused)] + unsafe fn print_hex(&self, prefix: &str) { + let values = self.extract_u64s(); + println!( + "{}={:#016x}_{:016x}_{:016x}_{:016x}_{:016x}_{:016x}_{:016x}_{:016x}", + prefix, + values[7], + values[6], + values[5], + values[4], + values[3], + values[2], + values[1], + values[0] + ); + } +} + +#[rustversion::since(1.89)] +impl X86_64Avx512VpclmulqdqOps { + /// Process aligned blocks using VPCLMULQDQ with 4 x 512-bit registers + /// + /// Note that #[inline(always)] loses the inlining performance boost here, despite no native + /// target_features being used directly, which is surprising given how Rust's docs describe it. 
+ #[inline] + #[target_feature(enable = "avx512vl,avx512bw,vpclmulqdq")] + unsafe fn process_blocks<W: EnhancedCrcWidth>( + &self, + state: &mut CrcState<<Self as ArchOps>::Vector>, + first: &[__m128i; 8], + rest: &[[__m128i; 8]], + keys: [u64; 23], + reflected: bool, + ) -> W::Value + where + W::Value: Copy + BitXor<Output = W::Value>, + { + let state_u64s = self.extract_u64s(state.value); + + let positioned_state = if reflected { + Simd512::new(0, 0, 0, 0, 0, 0, 0, state_u64s[0]) + } else { + Simd512::new(state_u64s[1], 0, 0, 0, 0, 0, 0, 0) + }; + + let reflector = create_reflector512(reflected); + + // Load first 256 bytes (2nd half is rest[0] since these are 128-byte blocks) + let first_ptr = first.as_ptr() as *const u8; + let first_rest_ptr = rest[0].as_ptr() as *const u8; + + let mut x = [ + reflect_bytes512(&reflector, Simd512::load_from_ptr(first_ptr)), + reflect_bytes512(&reflector, Simd512::load_from_ptr(first_ptr.add(64))), + reflect_bytes512(&reflector, Simd512::load_from_ptr(first_rest_ptr)), + reflect_bytes512(&reflector, Simd512::load_from_ptr(first_rest_ptr.add(64))), + ]; + + x[0] = positioned_state.xor(&x[0]); + + let coeff = self.create_avx512_256byte_coefficient(keys, reflected); + + let remaining_rest = &rest[1..]; + let pair_count = remaining_rest.len() / 2; + + for i in 0..pair_count { + let block1_ptr = remaining_rest[i * 2].as_ptr() as *const u8; + let block2_ptr = remaining_rest[i * 2 + 1].as_ptr() as *const u8; + + x[0] = x[0].fold_64( + &coeff, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(block1_ptr)), + ); + x[1] = x[1].fold_64( + &coeff, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(block1_ptr.add(64))), + ); + x[2] = x[2].fold_64( + &coeff, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(block2_ptr)), + ); + x[3] = x[3].fold_64( + &coeff, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(block2_ptr.add(64))), + ); + } + + let processed_pairs = pair_count * 2; + let remaining_single_count = remaining_rest.len() - processed_pairs; + + if remaining_single_count > 0 { + // We have 1 unprocessed block (128 bytes) + // Fold 4×512 down to 2×512 and process the remaining block with 2-register mode + let folded_2reg = self.fold_from_4x512_to_2x256(x, keys, reflected); + let coeff_2reg = self.create_avx512_128byte_coefficient(keys, reflected); + + let last_block_ptr = remaining_rest[processed_pairs].as_ptr() as *const u8; + + let final_x = [ + folded_2reg[0].fold_64( + &coeff_2reg, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(last_block_ptr)), + ), + folded_2reg[1].fold_64( + &coeff_2reg, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(last_block_ptr.add(64))), + ), + ]; + + let folded = self.fold_from_2x512_to_1x128(final_x, keys, reflected); + + return W::perform_final_reduction(folded, reflected, keys, self); + } + + // All blocks processed in pairs - fold from 4 x 512-bit to 1 x 128-bit + let folded = self.fold_from_4x512_to_1x128(x, keys, reflected); + + W::perform_final_reduction(folded, reflected, keys, self) + } + + /// Create a folding coefficient for AVX-512 for 128-byte folding distances + #[inline(always)] + unsafe fn create_avx512_128byte_coefficient( + &self, + keys: [u64; 23], + reflected: bool, + ) -> Simd512 { + let (k1, k2) = if reflected { + (keys[3], keys[4]) + } else { + (keys[4], keys[3]) + }; + + // Replicate the coefficient pair + Simd512::new(k1, k2, k1, k2, k1, k2, k1, k2) + } + + /// Create a folding coefficient for AVX-512 for 256-byte folding distances + #[inline(always)] + unsafe fn create_avx512_256byte_coefficient( + &self, + keys: [u64; 23], + 
reflected: bool, + ) -> Simd512 { + let (k1, k2) = if reflected { + (keys[21], keys[22]) + } else { + (keys[22], keys[21]) + }; + + // Replicate the coefficient pair + Simd512::new(k1, k2, k1, k2, k1, k2, k1, k2) + } + + /// Fold from 4 x 512-bit to 1 x 128-bit + #[inline(always)] + unsafe fn fold_from_4x512_to_1x128( + &self, + x: [Simd512; 4], + keys: [u64; 23], + reflected: bool, + ) -> __m128i { + // Step 1: Fold 4 x 512-bit to 2 x 512-bit + let x2 = self.fold_from_4x512_to_2x256(x, keys, reflected); + + // Step 2: Fold 2 x 512-bit to 1 x 128-bit + self.fold_from_2x512_to_1x128(x2, keys, reflected) + } + + /// Fold from 4 x 512-bit to 2 x 512-bit + #[inline(always)] + unsafe fn fold_from_4x512_to_2x256( + &self, + x: [Simd512; 4], + keys: [u64; 23], + reflected: bool, + ) -> [Simd512; 2] { + // This folds registers that are 128 bytes apart (x[0] with x[2], x[1] with x[3]) + let coeff = self.create_avx512_128byte_coefficient(keys, reflected); + + // Fold pairs: + // x[0] (bytes 0-63) + x[2] (bytes 128-191) → result[0] + // x[1] (bytes 64-127) + x[3] (bytes 192-255) → result[1] + [x[0].fold_64(&coeff, &x[2]), x[1].fold_64(&coeff, &x[3])] + } + + /// Fold from 2 x 512-bit to 1 x 128-bit + #[inline(always)] + unsafe fn fold_from_2x512_to_1x128( + &self, + x: [Simd512; 2], + keys: [u64; 23], + reflected: bool, + ) -> __m128i { + // Create the fold coefficients for different distances + let fold_coefficients = [ + self.create_vector_from_u64_pair(keys[10], keys[9], reflected), // 112 bytes + self.create_vector_from_u64_pair(keys[12], keys[11], reflected), // 96 bytes + self.create_vector_from_u64_pair(keys[14], keys[13], reflected), // 80 bytes + self.create_vector_from_u64_pair(keys[16], keys[15], reflected), // 64 bytes + self.create_vector_from_u64_pair(keys[18], keys[17], reflected), // 48 bytes + self.create_vector_from_u64_pair(keys[20], keys[19], reflected), // 32 bytes + self.create_vector_from_u64_pair(keys[2], keys[1], reflected), // 16 bytes + ]; + + // Extract the 8 x 128-bit vectors from the 2 x 512-bit vectors (this is faster than + // using 256-bit intrinsics for 1KiB payloads) + let v128 = if reflected { + [ + x[0].to_128i_extract::<0>(), // 256-x0.low + x[0].to_128i_extract::<1>(), // 256-x0.high + x[0].to_128i_extract::<2>(), // 256-x1.low + x[0].to_128i_extract::<3>(), // 256-x1.high + x[1].to_128i_extract::<0>(), // 256-x2.low + x[1].to_128i_extract::<1>(), // 256-x2.high + x[1].to_128i_extract::<2>(), // 256-x3.low + x[1].to_128i_extract::<3>(), // 256-x3.high + ] + } else { + [ + x[0].to_128i_extract::<3>(), // 256-x1.high + x[0].to_128i_extract::<2>(), // 256-x1.low + x[0].to_128i_extract::<1>(), // 256-x0.high + x[0].to_128i_extract::<0>(), // 256-x0.low + x[1].to_128i_extract::<3>(), // 256-x3.high + x[1].to_128i_extract::<2>(), // 256-x3.low + x[1].to_128i_extract::<1>(), // 256-x2.high + x[1].to_128i_extract::<0>(), // 256-x2.low + ] + }; + + // Fold the 8 xmm registers to 1 xmm register + let mut res = v128[7]; + + for (i, &coeff) in fold_coefficients.iter().enumerate() { + let folded_h = self.carryless_mul_00(v128[i], coeff); + let folded_l = self.carryless_mul_11(v128[i], coeff); + res = self.xor3_vectors(folded_h, folded_l, res); + } + + res + } +} + +// 512-bit version of the Reflector +#[rustversion::since(1.89)] +#[derive(Clone, Copy)] +enum Reflector512 { + NoReflector, + ForwardReflector { smask: Simd512 }, +} + +// Function to create the appropriate reflector based on CRC parameters +#[rustversion::since(1.89)] +#[inline(always)] +unsafe fn 
create_reflector512(reflected: bool) -> Reflector512 { + if reflected { + Reflector512::NoReflector + } else { + // Load shuffle mask + let smask = Simd512::new( + 0x08090a0b0c0d0e0f, + 0x0001020304050607, + 0x08090a0b0c0d0e0f, + 0x0001020304050607, + 0x08090a0b0c0d0e0f, + 0x0001020304050607, + 0x08090a0b0c0d0e0f, + 0x0001020304050607, + ); + Reflector512::ForwardReflector { smask } + } +} + +// Function to apply reflection to a 512-bit vector +#[rustversion::since(1.89)] +#[inline(always)] +unsafe fn reflect_bytes512(reflector: &Reflector512, data: Simd512) -> Simd512 { + match reflector { + Reflector512::NoReflector => data, + Reflector512::ForwardReflector { smask } => shuffle_bytes512(data, *smask), + } +} + +// pre-compute the reverse indices for 512-bit shuffling +#[rustversion::since(1.89)] +static REVERSE_INDICES_512: __m512i = + unsafe { std::mem::transmute([7u64, 6u64, 5u64, 4u64, 3u64, 2u64, 1u64, 0u64]) }; + +// Implement a 512-bit byte shuffle function +#[rustversion::since(1.89)] +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +unsafe fn shuffle_bytes512(data: Simd512, mask: Simd512) -> Simd512 { + Simd512(_mm512_permutexvar_epi64( + // Reverse the order using 512-bit permutation + REVERSE_INDICES_512, // reverse indices + _mm512_shuffle_epi8(data.0, mask.0), // shuffled data + )) +} + +// Delegate all ArchOps methods to the inner X86SsePclmulqdqOps instance +#[rustversion::since(1.89)] +impl ArchOps for X86_64Avx512VpclmulqdqOps { + type Vector = __m128i; + + #[inline(always)] + unsafe fn process_enhanced_simd_blocks<W: EnhancedCrcWidth>( + &self, + state: &mut CrcState<Self::Vector>, + first: &[Self::Vector; 8], + rest: &[[Self::Vector; 8]], + _reflector: &Reflector, + keys: [u64; 23], + ) -> bool + where + Self::Vector: Copy, + { + // Update the state with the result + *state = W::create_state( + self.process_blocks::<W>(state, first, rest, keys, state.reflected), + state.reflected, + self, + ); + + // Return true to indicate we handled it + true + } + + // Delegate all other methods to X86SsePclmulqdqOps + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn create_vector_from_u64_pair( + &self, + high: u64, + low: u64, + reflected: bool, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair(high, low, reflected) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn create_vector_from_u64_pair_non_reflected( + &self, + high: u64, + low: u64, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair_non_reflected(high, low) + } + + #[inline] + #[target_feature(enable = "sse4.1")] + unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { + self.0.create_vector_from_u64(value, high) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_u64s(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_poly64s(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn xor_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.xor_vectors(a, b) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { + self.0.load_bytes(ptr) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { + self.0.load_aligned(ptr) + } + + #[inline] + #[target_feature(enable = "ssse3")] + unsafe fn shuffle_bytes(&self, data: Self::Vector, 
mask: Self::Vector) -> Self::Vector { + self.0.shuffle_bytes(data, mask) + } + + #[inline] + #[target_feature(enable = "sse4.1")] + unsafe fn blend_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + mask: Self::Vector, + ) -> Self::Vector { + self.0.blend_vectors(a, b, mask) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_8(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn set_all_bytes(&self, value: u8) -> Self::Vector { + self.0.set_all_bytes(value) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector { + self.0.create_compare_mask(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn and_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.and_vectors(a, b) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_32(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_32(vector) + } + + #[inline] + #[target_feature(enable = "sse4.1")] + unsafe fn create_vector_from_u32(&self, value: u32, high: bool) -> Self::Vector { + self.0.create_vector_from_u32(value, high) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_4(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_4(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_8(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_5(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_5(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_6(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_6(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_7(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_12(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_12(vector) + } + + #[inline] + #[target_feature(enable = "pclmulqdq")] + unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_00(a, b) + } + + #[inline] + #[target_feature(enable = "pclmulqdq")] + unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_01(a, b) + } + + #[inline] + #[target_feature(enable = "pclmulqdq")] + unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_10(a, b) + } + + #[inline] + #[target_feature(enable = "pclmulqdq")] + unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_11(a, b) + } + + #[inline] + #[target_feature(enable = "avx512vl")] + unsafe fn xor3_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + 
c: Self::Vector, + ) -> Self::Vector { + // VPCLMULQDQ implementation always uses AVX512 ternary logic + // Feature detection is handled at the dispatch level + _mm_ternarylogic_epi64( + a, b, c, 0x96, // XOR3 operation + ) + } +} diff --git a/src/arch/x86_64/mod.rs b/src/arch/x86_64/mod.rs new file mode 100644 index 0000000..4d1422d --- /dev/null +++ b/src/arch/x86_64/mod.rs @@ -0,0 +1,8 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides x86_64-specific implementations of the ArchOps trait. + +#![cfg(target_arch = "x86_64")] + +pub mod avx512; +pub mod avx512_vpclmulqdq; diff --git a/src/bin/arch-check.rs b/src/bin/arch-check.rs index b1dc3ef..023b2a0 100644 --- a/src/bin/arch-check.rs +++ b/src/bin/arch-check.rs @@ -1,3 +1,5 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + //! This is a simple program that checks if the target architecture supports certain features. #[cfg(target_arch = "aarch64")] diff --git a/src/bin/checksum.rs b/src/bin/checksum.rs index dd02b08..6c017bb 100644 --- a/src/bin/checksum.rs +++ b/src/bin/checksum.rs @@ -1,6 +1,9 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + //! This is a simple program to calculate a checksum from the command line use crc_fast::{checksum, checksum_file, CrcAlgorithm}; +use rand::RngCore; use std::env; use std::process::ExitCode; use std::str::FromStr; @@ -11,6 +14,36 @@ struct Config { file: Option<String>, string: Option<String>, format: OutputFormat, + benchmark: Option<BenchmarkConfig>, } +#[derive(Debug)] +struct BenchmarkConfig { + size: Option<usize>, + duration: f64, +} + +#[derive(Debug)] +enum BenchmarkData { + InMemory(Vec<u8>), + File(String), +} + +#[derive(Debug)] +struct BenchmarkRunner { + algorithm: CrcAlgorithm, + data: BenchmarkData, + duration: f64, +} + +#[derive(Debug)] +struct BenchmarkResult { + iterations: u64, + elapsed_seconds: f64, + throughput_gibs: f64, + time_per_iteration_nanos: f64, + acceleration_target: String, + data_size: u64, } #[derive(Debug, Clone)] @@ -19,20 +52,144 @@ enum OutputFormat { Decimal, } +impl BenchmarkConfig { + fn validate(&self) -> Result<(), String> { + if self.duration <= 0.0 { + return Err("Duration must be greater than 0".to_string()); + } + + if let Some(size) = self.size { + if size == 0 { + return Err("Size must be greater than 0".to_string()); + } + } + + Ok(()) + } +} + +impl BenchmarkRunner { + fn new(algorithm: CrcAlgorithm, data: BenchmarkData, duration: f64) -> Self { + Self { + algorithm, + data, + duration, + } + } + + fn run(&self) -> Result<BenchmarkResult, String> { + use std::time::Instant; + + let start = Instant::now(); + let mut iterations = 0u64; + + while start.elapsed().as_secs_f64() < self.duration { + match &self.data { + BenchmarkData::InMemory(data) => { + std::hint::black_box(checksum(self.algorithm, data)); + } + BenchmarkData::File(filename) => { + match checksum_file(self.algorithm, filename, None) { + Ok(result) => { + std::hint::black_box(result); + } + Err(e) => { + return Err(format!("Failed to read file during benchmark: {}", e)); + } + } + } + } + iterations += 1; + } + + let elapsed = start.elapsed().as_secs_f64(); + let data_size = match &self.data { + BenchmarkData::InMemory(data) => data.len() as u64, + BenchmarkData::File(filename) => std::fs::metadata(filename) + .map(|m| m.len()) + .map_err(|e| format!("Failed to get file size: {}", e))?, + }; + + let acceleration_target = crc_fast::get_calculator_target(self.algorithm); + + Ok(BenchmarkResult::new( + iterations, + elapsed, + acceleration_target, + 
data_size, + )) + } +} + +impl BenchmarkResult { + fn new( + iterations: u64, + elapsed_seconds: f64, + acceleration_target: String, + data_size: u64, + ) -> Self { + let throughput_gibs = if elapsed_seconds > 0.0 { + (data_size as f64 * iterations as f64) / elapsed_seconds / (1024.0 * 1024.0 * 1024.0) + } else { + 0.0 + }; + + let time_per_iteration_nanos = if iterations > 0 { + elapsed_seconds * 1_000_000_000.0 / iterations as f64 + } else { + 0.0 + }; + + Self { + iterations, + elapsed_seconds, + throughput_gibs, + time_per_iteration_nanos, + acceleration_target, + data_size, + } + } +} + +fn generate_random_data(size: usize) -> Result<Vec<u8>, String> { + // Check for reasonable size limits to prevent memory issues + if size > 1_073_741_824 { + // 1 GiB limit + return Err("Data size too large (maximum 1 GiB)".to_string()); + } + + // Use vec! macro to avoid clippy warning about slow initialization + let mut buf = vec![0u8; size]; + let mut rng = rand::rng(); + rng.fill_bytes(&mut buf); + Ok(buf) +} + fn print_usage() { println!("Usage: checksum -a algorithm [-f file] [-s string] [--format hex|decimal]"); + println!( + " checksum -a algorithm -b [--size bytes] [--duration seconds] [-f file] [-s string]" + ); println!(); println!("Example: checksum -a CRC-32/ISCSI -f myfile.txt"); println!("Example: checksum -a CRC-64/NVME -s 'Hello, world!' --format decimal"); + println!("Example: checksum -a CRC-32/ISCSI -b --size 1048576 --duration 5.0"); println!(); println!("Options:"); println!(" -a algorithm Specify the checksum algorithm (required)"); println!(" -f file Calculate checksum for the specified file"); + println!(" -h, --help Show this help message"); println!(" -s string Calculate checksum for the specified string"); println!(" --format hex|decimal Output format (default: hex)"); - println!(" -h, --help Show this help message"); println!(); - println!("Note: Either -f or -s must be provided, but not both."); + println!("Benchmarking:"); + println!(" -b Enable benchmark mode"); + println!(" --duration seconds Benchmark duration in seconds (default: 10.0)"); + println!(" --size bytes Data size for random generation in benchmark mode (default: 1048576 [1MiB])"); + println!(); + println!(); + println!("Note: In normal mode, either -f or -s must be provided, but not both."); + println!(" In benchmark mode (-b), -f or -s are optional for using specific data."); } fn parse_args() -> Result<Config, String> { @@ -51,6 +208,9 @@ fn parse_args() -> Result<Config, String> { let mut file: Option<String> = None; let mut string: Option<String> = None; let mut format = OutputFormat::Hex; // Default to hex + let mut benchmark_mode = false; + let mut benchmark_size: Option<usize> = None; + let mut benchmark_duration = 10.0; // Default duration let mut i = 1; // Skip program name while i < args.len() { @@ -98,6 +258,30 @@ fn parse_args() -> Result<Config, String> { } i += 2; } + "-b" => { + benchmark_mode = true; + i += 1; + } + "--size" => { + if i + 1 >= args.len() { + return Err("Missing size value after --size flag".to_string()); + } + benchmark_size = Some( + args[i + 1] + .parse::<usize>() + .map_err(|_| format!("Invalid size value: {}", args[i + 1]))?, + ); + i += 2; + } + "--duration" => { + if i + 1 >= args.len() { + return Err("Missing duration value after --duration flag".to_string()); + } + benchmark_duration = args[i + 1] + .parse::<f64>() + .map_err(|_| format!("Invalid duration value: {}", args[i + 1]))?; + i += 2; + } arg => { return Err(format!("Unknown argument: {}", arg)); } @@ -107,15 +291,40 @@ fn parse_args() -> Result<Config, String> { // Validate required arguments let algorithm = 
algorithm.ok_or("Algorithm (-a) is required")?; - if file.is_none() && string.is_none() { - return Err("Either -f (file) or -s (string) must be provided".to_string()); + // Validate mutual exclusivity between benchmark and normal modes + if !benchmark_mode && (benchmark_size.is_some() || benchmark_duration != 10.0) { + return Err("--size and --duration can only be used with -b flag".to_string()); + } + + // Create benchmark config if in benchmark mode + let benchmark = if benchmark_mode { + let config = BenchmarkConfig { + size: benchmark_size, + duration: benchmark_duration, + }; + config.validate()?; + Some(config) + } else { + None + }; + + // Validate input requirements based on mode + if benchmark.is_none() { + // Normal mode: require either file or string input + if file.is_none() && string.is_none() { + return Err( + "Either -f (file) or -s (string) must be provided in normal mode".to_string(), + ); + } } + // Benchmark mode: file and string are optional (will use generated data if neither provided) Ok(Config { algorithm, file, string, format, + benchmark, }) } @@ -123,6 +332,11 @@ fn calculate_checksum(config: &Config) -> Result<(), String> { let algorithm = CrcAlgorithm::from_str(&config.algorithm) .map_err(|_| format!("Invalid algorithm: {}", config.algorithm))?; + // Check if benchmark mode is enabled + if let Some(benchmark_config) = &config.benchmark { + return run_benchmark(config, benchmark_config, algorithm); + } + let checksum = if let Some(ref filename) = config.file { checksum_file(algorithm, filename, None).unwrap() } else if let Some(ref text) = config.string { @@ -139,6 +353,91 @@ fn calculate_checksum(config: &Config) -> Result<(), String> { Ok(()) } +fn run_benchmark( + config: &Config, + benchmark_config: &BenchmarkConfig, + algorithm: CrcAlgorithm, +) -> Result<(), String> { + // Determine data source and create BenchmarkData + let data = if let Some(ref filename) = config.file { + // Validate file exists before benchmarking + if !std::path::Path::new(filename).exists() { + return Err(format!("File not found: {}", filename)); + } + BenchmarkData::File(filename.clone()) + } else if let Some(ref text) = config.string { + BenchmarkData::InMemory(text.as_bytes().to_vec()) + } else { + // Generate random data with specified size or default (1 MiB) + let size = benchmark_config.size.unwrap_or(1_048_576); + let random_data = generate_random_data(size)?; + BenchmarkData::InMemory(random_data) + }; + + // Create and run benchmark + let runner = BenchmarkRunner::new(algorithm, data, benchmark_config.duration); + let result = runner.run()?; + + // Display results with algorithm name + display_benchmark_results(&result, &config.algorithm); + + Ok(()) +} + +// Format numbers with comma separators for better readability +fn format_number_with_commas(n: u64) -> String { + let s = n.to_string(); + let mut result = String::new(); + let chars: Vec = s.chars().collect(); + + for (i, ch) in chars.iter().enumerate() { + if i > 0 && (chars.len() - i) % 3 == 0 { + result.push(','); + } + result.push(*ch); + } + + result +} + +fn display_benchmark_results(result: &BenchmarkResult, algorithm_name: &str) { + println!("Algorithm: {}", algorithm_name); + println!("Acceleration Target: {}", result.acceleration_target); + + // Format data size with appropriate units + let (size_value, size_unit) = if result.data_size >= 1_048_576 { + (result.data_size as f64 / 1_048_576.0, "MiB") + } else if result.data_size >= 1024 { + (result.data_size as f64 / 1024.0, "KiB") + } else { + (result.data_size 
as f64, "bytes") + }; + + println!( + "Data Size: {} bytes ({:.1} {})", + format_number_with_commas(result.data_size), + size_value, + size_unit + ); + println!("Duration: {:.2} seconds", result.elapsed_seconds); + println!( + "Iterations: {}", + format_number_with_commas(result.iterations) + ); + println!("Throughput: {:.2} GiB/s", result.throughput_gibs); + + // Format time per iteration with appropriate units + let (time_value, time_unit) = if result.time_per_iteration_nanos >= 1_000_000.0 { + (result.time_per_iteration_nanos / 1_000_000.0, "ms") + } else if result.time_per_iteration_nanos >= 1_000.0 { + (result.time_per_iteration_nanos / 1_000.0, "μs") + } else { + (result.time_per_iteration_nanos, "ns") + }; + + println!("Time per iteration: {:.1} {}", time_value, time_unit); +} + fn main() -> ExitCode { match parse_args() { Ok(config) => { @@ -162,3 +461,261 @@ fn main() -> ExitCode { ExitCode::SUCCESS } + +#[cfg(test)] +mod tests { + use super::*; + use std::str::FromStr; + + #[test] + fn test_benchmark_config_validation_valid() { + let config = BenchmarkConfig { + size: Some(1024), + duration: 5.0, + }; + assert!(config.validate().is_ok()); + } + + #[test] + fn test_benchmark_config_validation_zero_duration() { + let config = BenchmarkConfig { + size: Some(1024), + duration: 0.0, + }; + assert!(config.validate().is_err()); + assert_eq!( + config.validate().unwrap_err(), + "Duration must be greater than 0" + ); + } + + #[test] + fn test_benchmark_config_validation_negative_duration() { + let config = BenchmarkConfig { + size: Some(1024), + duration: -1.0, + }; + assert!(config.validate().is_err()); + assert_eq!( + config.validate().unwrap_err(), + "Duration must be greater than 0" + ); + } + + #[test] + fn test_benchmark_config_validation_zero_size() { + let config = BenchmarkConfig { + size: Some(0), + duration: 5.0, + }; + assert!(config.validate().is_err()); + assert_eq!( + config.validate().unwrap_err(), + "Size must be greater than 0" + ); + } + + #[test] + fn test_benchmark_config_validation_none_size() { + let config = BenchmarkConfig { + size: None, + duration: 5.0, + }; + assert!(config.validate().is_ok()); + } + + #[test] + fn test_generate_random_data_valid_size() { + let data = generate_random_data(1024).unwrap(); + assert_eq!(data.len(), 1024); + } + + #[test] + fn test_generate_random_data_large_size() { + let result = generate_random_data(1_073_741_825); // 1 GiB + 1 + assert!(result.is_err()); + assert_eq!(result.unwrap_err(), "Data size too large (maximum 1 GiB)"); + } + + #[test] + fn test_benchmark_result_calculation() { + let result = BenchmarkResult::new( + 1000, // iterations + 2.0, // elapsed_seconds + "test".to_string(), // acceleration_target + 1024, // data_size + ); + + assert_eq!(result.iterations, 1000); + assert_eq!(result.elapsed_seconds, 2.0); + assert_eq!(result.data_size, 1024); + + // Throughput should be (1024 * 1000) / 2.0 / (1024^3) GiB/s + let expected_throughput = (1024.0 * 1000.0) / 2.0 / (1024.0 * 1024.0 * 1024.0); + assert!((result.throughput_gibs - expected_throughput).abs() < 1e-10); + + // Time per iteration should be 2.0 * 1e9 / 1000 nanoseconds + let expected_time_per_iter = 2.0 * 1_000_000_000.0 / 1000.0; + assert!((result.time_per_iteration_nanos - expected_time_per_iter).abs() < 1e-6); + } + + #[test] + fn test_benchmark_runner_creation() { + let algorithm = CrcAlgorithm::from_str("CRC-32/ISCSI").unwrap(); + let data = BenchmarkData::InMemory(vec![1, 2, 3, 4]); + let runner = BenchmarkRunner::new(algorithm, data, 1.0); + + 
assert_eq!(runner.duration, 1.0); + assert_eq!(runner.algorithm, algorithm); + } + + #[test] + fn test_benchmark_runner_execution_in_memory() { + let algorithm = CrcAlgorithm::from_str("CRC-32/ISCSI").unwrap(); + let data = BenchmarkData::InMemory(vec![1, 2, 3, 4]); + let runner = BenchmarkRunner::new(algorithm, data, 0.1); // Short duration for test + + let result = runner.run().unwrap(); + assert!(result.iterations > 0); + assert!(result.elapsed_seconds > 0.0); + assert_eq!(result.data_size, 4); + assert!(result.throughput_gibs >= 0.0); + } + + #[test] + fn test_parse_args_benchmark_mode() { + // We can't easily mock std::env::args in unit tests, so we'll test the parsing logic + // by creating a Config directly and validating its structure + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: None, + string: None, + format: OutputFormat::Hex, + benchmark: Some(BenchmarkConfig { + size: Some(1024), + duration: 5.0, + }), + }; + + assert!(config.benchmark.is_some()); + let benchmark_config = config.benchmark.unwrap(); + assert_eq!(benchmark_config.size, Some(1024)); + assert_eq!(benchmark_config.duration, 5.0); + } + + #[test] + fn test_parse_args_normal_mode() { + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: Some("test.txt".to_string()), + string: None, + format: OutputFormat::Hex, + benchmark: None, + }; + + assert!(config.benchmark.is_none()); + assert_eq!(config.file, Some("test.txt".to_string())); + } + + #[test] + fn test_data_source_selection_file() { + // Test that file input creates File variant + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: Some("test.txt".to_string()), + string: None, + format: OutputFormat::Hex, + benchmark: Some(BenchmarkConfig { + size: None, + duration: 1.0, + }), + }; + + // This would be tested in the run_benchmark function + // We can verify the config structure is correct for file input + assert!(config.file.is_some()); + assert!(config.string.is_none()); + } + + #[test] + fn test_data_source_selection_string() { + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: None, + string: Some("test data".to_string()), + format: OutputFormat::Hex, + benchmark: Some(BenchmarkConfig { + size: None, + duration: 1.0, + }), + }; + + assert!(config.file.is_none()); + assert!(config.string.is_some()); + } + + #[test] + fn test_data_source_selection_generated() { + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: None, + string: None, + format: OutputFormat::Hex, + benchmark: Some(BenchmarkConfig { + size: Some(1024), + duration: 1.0, + }), + }; + + // When neither file nor string is provided, generated data should be used + assert!(config.file.is_none()); + assert!(config.string.is_none()); + assert_eq!(config.benchmark.as_ref().unwrap().size, Some(1024)); + } + + #[test] + fn test_throughput_calculation_accuracy() { + // Test with known values to verify calculation accuracy + let data_size = 1_048_576u64; // 1 MiB + let iterations = 1000u64; + let elapsed_seconds = 1.0; + + let result = + BenchmarkResult::new(iterations, elapsed_seconds, "test".to_string(), data_size); + + // Expected throughput: (1 MiB * 1000) / 1 second = 1000 MiB/s = ~0.9537 GiB/s + let expected_gibs = + (data_size as f64 * iterations as f64) / elapsed_seconds / (1024.0 * 1024.0 * 1024.0); + assert!((result.throughput_gibs - expected_gibs).abs() < 1e-10); + } + + #[test] + fn test_output_format_variants() { + let hex_format = OutputFormat::Hex; + let decimal_format = 
OutputFormat::Decimal;
+
+        // Test that both variants can be created and are different
+        assert!(
+            matches!(hex_format, OutputFormat::Hex),
+            "hex_format should be OutputFormat::Hex"
+        );
+
+        assert!(
+            matches!(decimal_format, OutputFormat::Decimal),
+            "decimal_format should be OutputFormat::Decimal"
+        );
+    }
+
+    #[test]
+    fn test_format_number_with_commas() {
+        assert_eq!(format_number_with_commas(0), "0");
+        assert_eq!(format_number_with_commas(123), "123");
+        assert_eq!(format_number_with_commas(1234), "1,234");
+        assert_eq!(format_number_with_commas(12345), "12,345");
+        assert_eq!(format_number_with_commas(123456), "123,456");
+        assert_eq!(format_number_with_commas(1234567), "1,234,567");
+        assert_eq!(format_number_with_commas(12345678), "12,345,678");
+        assert_eq!(format_number_with_commas(123456789), "123,456,789");
+        assert_eq!(format_number_with_commas(1000000000), "1,000,000,000");
+    }
+}
diff --git a/src/bin/get-custom-params.rs b/src/bin/get-custom-params.rs
index ac5df47..d730788 100644
--- a/src/bin/get-custom-params.rs
+++ b/src/bin/get-custom-params.rs
@@ -1,3 +1,5 @@
+// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0.
+
 //! This is a simple program to get custom CRC parameters from the command line.
 
 use std::env;
diff --git a/src/crc32/fusion/aarch64.rs b/src/crc32/fusion/aarch64.rs
deleted file mode 100644
index c9f0207..0000000
--- a/src/crc32/fusion/aarch64.rs
+++ /dev/null
@@ -1,1073 +0,0 @@
-//! Provides CRC-32/ISCSI and CRC-32/ISO-HDLC calculations using a fusion of native CLMUL
-//! instructions and native CRC calculation instructions on aarch64.
-//!
-//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/
-//!
-//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/
-//! with the help of Claude.ai using:
-//!
-//! ./generate -i neon -p crc32c -a v12e_v1
-//! ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3
-//! ./generate -i neon -p crc32 -a v12e_v1
-//!
-//! Modified as necessary for this Rust implementation.
-//!
-//! MIT licensed.
- -#![cfg(target_arch = "aarch64")] - -use std::arch::aarch64::*; - -/// Safe wrapper for CRC32 iSCSI calculation -#[inline] -#[cfg(target_feature = "sha3")] -pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { - unsafe { - const LARGE_BUFFER_THRESHOLD: usize = 1024; - - // Select implementation based on buffer size - if data.len() <= LARGE_BUFFER_THRESHOLD { - crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) - } else { - crc32_iscsi_eor3_v9s3x2e_s3(crc, data.as_ptr(), data.len()) - } - } -} - -#[inline] -#[cfg(not(target_feature = "sha3"))] -pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { - unsafe { crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) } -} - -/// Safe wrapper for CRC32 ISO-HDLC calculation -#[inline] -#[cfg(target_feature = "sha3")] -pub fn crc32_iso_hdlc(crc: u32, data: &[u8]) -> u32 { - unsafe { - const LARGE_BUFFER_THRESHOLD: usize = 1024; - - // Select implementation based on buffer size - if data.len() <= LARGE_BUFFER_THRESHOLD { - crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) - } else { - crc32_iso_hdlc_eor3_v9s3x2e_s3(crc, data.as_ptr(), data.len()) - } - } -} - -#[inline] -#[cfg(not(target_feature = "sha3"))] -pub fn crc32_iso_hdlc(crc: u32, data: &[u8]) -> u32 { - unsafe { crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) } -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn clmul_lo_eor3(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { - // Polynomial multiply low parts - convert u128 result to uint64x2_t - let result = vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)); - vreinterpretq_u64_p128(result) -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn clmul_hi_eor3(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { - // Polynomial multiply high parts - convert u128 result to uint64x2_t - let result = vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)); - vreinterpretq_u64_p128(result) -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn clmul_scalar(a: u32, b: u32) -> uint64x2_t { - // Polynomial multiply scalars - convert u128 result to uint64x2_t - let result = vmull_p64(a as u64, b as u64); - vreinterpretq_u64_p128(result) -} - -// x^n mod P, in log(n) time -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn xnmodp_crc32_iscsi(mut n: u64) -> u32 { - let mut stack = !1u64; - let mut acc: u32; - let mut low: u32; - - while n > 191 { - stack = (stack << 1) + (n & 1); - n = (n >> 1) - 16; - } - stack = !stack; - acc = 0x80000000u32 >> (n & 31); - n >>= 5; - - while n > 0 { - // ARM CRC32 instruction - acc = unsafe { __crc32cw(acc, 0) }; - n -= 1; - } - - while { - low = (stack & 1) as u32; - stack >>= 1; - stack != 0 - } { - unsafe { - // Convert to polynomial type and square it - let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); - let squared = vmull_p8(x, x); - let y = vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); - acc = __crc32cd(0, y << low); - } - } - acc -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn crc_shift_iscsi(crc: u32, nbytes: usize) -> uint64x2_t { - clmul_scalar(crc, xnmodp_crc32_iscsi((nbytes * 8 - 33) as u64)) -} - -// x^n mod P, in log(n) time -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn xnmodp_iso_hdlc(mut n: u64) -> u32 { - let mut stack = !1u64; - let mut acc: u32; - let mut low: u32; - - while n > 191 { - stack = (stack << 1) + (n & 1); - n = (n >> 1) - 16; - } 
- stack = !stack; - acc = 0x80000000u32 >> (n & 31); - n >>= 5; - - while n > 0 { - // ARM CRC32 instruction (ISO-HDLC uses standard CRC32, not CRC32C) - acc = unsafe { __crc32w(acc, 0) }; - n -= 1; - } - - while { - low = (stack & 1) as u32; - stack >>= 1; - stack != 0 - } { - unsafe { - // Convert to polynomial type and square it - let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); - let squared = vmull_p8(x, x); - let y = vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); - acc = __crc32d(0, y << low); - } - } - acc -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn crc_shift_iso_hdlc(crc: u32, nbytes: usize) -> uint64x2_t { - clmul_scalar(crc, xnmodp_iso_hdlc((nbytes * 8 - 33) as u64)) -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i neon_eor3 -p crc32c -a v9s3x2e_s3 -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes,sha3")] -unsafe fn crc32_iscsi_eor3_v9s3x2e_s3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = __crc32cb(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 192 { - let end = buf.add(len); - let blk = len / 192; - let klen = blk * 16; - let buf2 = buf.add(klen * 3); - let limit = buf.add(klen).sub(32); - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // First vector chunk - let mut x0 = vld1q_u64(buf2 as *const u64); - let mut x1 = vld1q_u64(buf2.add(16) as *const u64); - let mut x2 = vld1q_u64(buf2.add(32) as *const u64); - let mut x3 = vld1q_u64(buf2.add(48) as *const u64); - let mut x4 = vld1q_u64(buf2.add(64) as *const u64); - let mut x5 = vld1q_u64(buf2.add(80) as *const u64); - let mut x6 = vld1q_u64(buf2.add(96) as *const u64); - let mut x7 = vld1q_u64(buf2.add(112) as *const u64); - let mut x8 = vld1q_u64(buf2.add(128) as *const u64); - - let k_vals: [u64; 2] = [0x7e908048, 0xc96cfdc0]; - let mut k = vld1q_u64(k_vals.as_ptr()); - let mut buf2 = buf2.add(144); - - // Main loop - while buf <= limit { - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y1 = clmul_lo_eor3(x1, k); - x1 = clmul_hi_eor3(x1, k); - let y2 = clmul_lo_eor3(x2, k); - x2 = clmul_hi_eor3(x2, k); - let y3 = clmul_lo_eor3(x3, k); - x3 = clmul_hi_eor3(x3, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - let y5 = clmul_lo_eor3(x5, k); - x5 = clmul_hi_eor3(x5, k); - let y6 = clmul_lo_eor3(x6, k); - x6 = clmul_hi_eor3(x6, k); - let y7 = clmul_lo_eor3(x7, k); - x7 = clmul_hi_eor3(x7, k); - let y8 = clmul_lo_eor3(x8, k); - x8 = clmul_hi_eor3(x8, k); - - x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); - x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); - x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); - x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); - x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); - x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); - x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); - x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); - x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); - - crc0 = __crc32cd(crc0, *(buf as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); - crc2 = 
__crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - - buf = buf.add(16); - buf2 = buf2.add(144); - } - - // Reduce x0 ... x8 to just x0 - let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x1); - x1 = x2; - x2 = x3; - x3 = x4; - x4 = x5; - x5 = x6; - x6 = x7; - x7 = x8; - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y2 = clmul_lo_eor3(x2, k); - x2 = clmul_hi_eor3(x2, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - let y6 = clmul_lo_eor3(x6, k); - x6 = clmul_hi_eor3(x6, k); - - x0 = veor3q_u64(x0, y0, x1); - x2 = veor3q_u64(x2, y2, x3); - x4 = veor3q_u64(x4, y4, x5); - x6 = veor3q_u64(x6, y6, x7); - - let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - - x0 = veor3q_u64(x0, y0, x2); - x4 = veor3q_u64(x4, y4, x6); - - let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x4); - - // Final scalar chunk - crc0 = __crc32cd(crc0, *(buf as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - - let vc0 = crc_shift_iscsi(crc0, klen * 2 + blk * 144); - let vc1 = crc_shift_iscsi(crc1, klen + blk * 144); - let vc2 = crc_shift_iscsi(crc2, blk * 144); - let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1)); - - buf = buf2; - len = end.offset_from(buf) as usize; - } - - if len >= 32 { - let klen = ((len - 8) / 24) * 8; - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // Main loop - loop { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); - buf = buf.add(8); - len -= 24; - if len < 32 { - break; - } - } - - let vc0 = crc_shift_iscsi(crc0, klen * 2 + 8); - let vc1 = crc_shift_iscsi(crc1, klen + 8); - let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); - - // Final 8 bytes - buf = buf.add(klen * 2); - crc0 = crc2; - crc0 = __crc32cd(crc0, *(buf as *const u64) ^ vc); - buf = buf.add(8); - len -= 8; - } - - while len >= 8 { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - while len > 0 { - crc0 = __crc32cb(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -#[inline] -#[target_feature(enable = "aes")] -unsafe fn clmul_lo_e(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { - // Polynomial multiply low parts and XOR with c - let mul_result = vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)); - let mul_vec = vreinterpretq_u64_p128(mul_result); - veorq_u64(mul_vec, c) -} - -#[inline] -#[target_feature(enable = "aes")] -unsafe fn clmul_hi_e(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { - // 
Polynomial multiply high parts and XOR with c - let mul_result = vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)); - let mul_vec = vreinterpretq_u64_p128(mul_result); - veorq_u64(mul_vec, c) -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i neon -p crc32c -a v12e_v1 -#[inline] -#[target_feature(enable = "aes")] -unsafe fn crc32_iscsi_v12e_v1(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = __crc32cb(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 192 { - let end = buf.add(len); - let limit = buf.add(len - 192); - - // First vector chunk - let mut x0 = vld1q_u64(buf as *const u64); - let mut x1 = vld1q_u64(buf.add(16) as *const u64); - let mut x2 = vld1q_u64(buf.add(32) as *const u64); - let mut x3 = vld1q_u64(buf.add(48) as *const u64); - let mut x4 = vld1q_u64(buf.add(64) as *const u64); - let mut x5 = vld1q_u64(buf.add(80) as *const u64); - let mut x6 = vld1q_u64(buf.add(96) as *const u64); - let mut x7 = vld1q_u64(buf.add(112) as *const u64); - let mut x8 = vld1q_u64(buf.add(128) as *const u64); - let mut x9 = vld1q_u64(buf.add(144) as *const u64); - let mut x10 = vld1q_u64(buf.add(160) as *const u64); - let mut x11 = vld1q_u64(buf.add(176) as *const u64); - - let k_vals: [u64; 2] = [0xa87ab8a8, 0xab7aff2a]; - let mut k = vld1q_u64(k_vals.as_ptr()); - - // Create CRC vector and XOR with first vector - let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); - x0 = veorq_u64(crc_vec, x0); - buf = buf.add(192); - - // Main loop - while buf <= limit { - let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); - x0 = clmul_hi_e(x0, k, y0); - let y1 = clmul_lo_e(x1, k, vld1q_u64(buf.add(16) as *const u64)); - x1 = clmul_hi_e(x1, k, y1); - let y2 = clmul_lo_e(x2, k, vld1q_u64(buf.add(32) as *const u64)); - x2 = clmul_hi_e(x2, k, y2); - let y3 = clmul_lo_e(x3, k, vld1q_u64(buf.add(48) as *const u64)); - x3 = clmul_hi_e(x3, k, y3); - let y4 = clmul_lo_e(x4, k, vld1q_u64(buf.add(64) as *const u64)); - x4 = clmul_hi_e(x4, k, y4); - let y5 = clmul_lo_e(x5, k, vld1q_u64(buf.add(80) as *const u64)); - x5 = clmul_hi_e(x5, k, y5); - let y6 = clmul_lo_e(x6, k, vld1q_u64(buf.add(96) as *const u64)); - x6 = clmul_hi_e(x6, k, y6); - let y7 = clmul_lo_e(x7, k, vld1q_u64(buf.add(112) as *const u64)); - x7 = clmul_hi_e(x7, k, y7); - let y8 = clmul_lo_e(x8, k, vld1q_u64(buf.add(128) as *const u64)); - x8 = clmul_hi_e(x8, k, y8); - let y9 = clmul_lo_e(x9, k, vld1q_u64(buf.add(144) as *const u64)); - x9 = clmul_hi_e(x9, k, y9); - let y10 = clmul_lo_e(x10, k, vld1q_u64(buf.add(160) as *const u64)); - x10 = clmul_hi_e(x10, k, y10); - let y11 = clmul_lo_e(x11, k, vld1q_u64(buf.add(176) as *const u64)); - x11 = clmul_hi_e(x11, k, y11); - buf = buf.add(192); - } - - // Reduce x0 ... 
x11 to just x0 - let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x1); - x0 = clmul_hi_e(x0, k, y0); - let y2 = clmul_lo_e(x2, k, x3); - x2 = clmul_hi_e(x2, k, y2); - let y4 = clmul_lo_e(x4, k, x5); - x4 = clmul_hi_e(x4, k, y4); - let y6 = clmul_lo_e(x6, k, x7); - x6 = clmul_hi_e(x6, k, y6); - let y8 = clmul_lo_e(x8, k, x9); - x8 = clmul_hi_e(x8, k, y8); - let y10 = clmul_lo_e(x10, k, x11); - x10 = clmul_hi_e(x10, k, y10); - - let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x2); - x0 = clmul_hi_e(x0, k, y0); - let y4 = clmul_lo_e(x4, k, x6); - x4 = clmul_hi_e(x4, k, y4); - let y8 = clmul_lo_e(x8, k, x10); - x8 = clmul_hi_e(x8, k, y8); - - let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x4); - x0 = clmul_hi_e(x0, k, y0); - x4 = x8; - let y0 = clmul_lo_e(x0, k, x4); - x0 = clmul_hi_e(x0, k, y0); - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - len = end.offset_from(buf) as usize; - } - - if len >= 16 { - // First vector chunk - let mut x0 = vld1q_u64(buf as *const u64); - - let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; - let k = vld1q_u64(k_vals.as_ptr()); - - // Create CRC vector and XOR with first vector - let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); - x0 = veorq_u64(crc_vec, x0); - buf = buf.add(16); - len -= 16; - - // Main loop - while len >= 16 { - let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); - x0 = clmul_hi_e(x0, k, y0); - buf = buf.add(16); - len -= 16; - } - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - } - - while len >= 8 { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - while len > 0 { - crc0 = __crc32cb(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes,sha3")] -unsafe fn crc32_iso_hdlc_eor3_v9s3x2e_s3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = __crc32b(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = __crc32d(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 192 { - let end = buf.add(len); - let blk = len / 192; - let klen = blk * 16; - let buf2 = buf.add(klen * 3); - let limit = buf.add(klen).sub(32); - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // First vector chunk - let mut x0 = vld1q_u64(buf2 as *const u64); - let mut x1 = vld1q_u64(buf2.add(16) as *const u64); - let mut x2 = vld1q_u64(buf2.add(32) as *const u64); - let mut x3 = vld1q_u64(buf2.add(48) as *const u64); - let mut x4 = vld1q_u64(buf2.add(64) as *const u64); - let mut x5 = vld1q_u64(buf2.add(80) as *const u64); - let mut x6 = vld1q_u64(buf2.add(96) as *const u64); - let mut x7 = vld1q_u64(buf2.add(112) as *const u64); - let mut x8 = vld1q_u64(buf2.add(128) as *const u64); - - // ISO-HDLC specific constants - let k_vals: [u64; 2] = [0x26b70c3d, 0x3f41287a]; - let mut k = 
vld1q_u64(k_vals.as_ptr()); - let mut buf2 = buf2.add(144); - - // Main loop - while buf <= limit { - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y1 = clmul_lo_eor3(x1, k); - x1 = clmul_hi_eor3(x1, k); - let y2 = clmul_lo_eor3(x2, k); - x2 = clmul_hi_eor3(x2, k); - let y3 = clmul_lo_eor3(x3, k); - x3 = clmul_hi_eor3(x3, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - let y5 = clmul_lo_eor3(x5, k); - x5 = clmul_hi_eor3(x5, k); - let y6 = clmul_lo_eor3(x6, k); - x6 = clmul_hi_eor3(x6, k); - let y7 = clmul_lo_eor3(x7, k); - x7 = clmul_hi_eor3(x7, k); - let y8 = clmul_lo_eor3(x8, k); - x8 = clmul_hi_eor3(x8, k); - - x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); - x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); - x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); - x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); - x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); - x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); - x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); - x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); - x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); - - crc0 = __crc32d(crc0, *(buf as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - - buf = buf.add(16); - buf2 = buf2.add(144); - } - - // Reduce x0 ... x8 to just x0 - let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x1); - x1 = x2; - x2 = x3; - x3 = x4; - x4 = x5; - x5 = x6; - x6 = x7; - x7 = x8; - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y2 = clmul_lo_eor3(x2, k); - x2 = clmul_hi_eor3(x2, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - let y6 = clmul_lo_eor3(x6, k); - x6 = clmul_hi_eor3(x6, k); - - x0 = veor3q_u64(x0, y0, x1); - x2 = veor3q_u64(x2, y2, x3); - x4 = veor3q_u64(x4, y4, x5); - x6 = veor3q_u64(x6, y6, x7); - - let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - - x0 = veor3q_u64(x0, y0, x2); - x4 = veor3q_u64(x4, y4, x6); - - let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x4); - - // Final scalar chunk - crc0 = __crc32d(crc0, *(buf as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - - let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + blk * 144); - let vc1 = crc_shift_iso_hdlc(crc1, klen + blk * 144); - let vc2 = crc_shift_iso_hdlc(crc2, blk * 144); - let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); - - buf = buf2; - len = 
end.offset_from(buf) as usize; - } - - if len >= 32 { - let klen = ((len - 8) / 24) * 8; - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // Main loop - loop { - crc0 = __crc32d(crc0, *(buf as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); - buf = buf.add(8); - len -= 24; - if len < 32 { - break; - } - } - - let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + 8); - let vc1 = crc_shift_iso_hdlc(crc1, klen + 8); - let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); - - // Final 8 bytes - buf = buf.add(klen * 2); - crc0 = crc2; - crc0 = __crc32d(crc0, *(buf as *const u64) ^ vc); - buf = buf.add(8); - len -= 8; - } - - while len >= 8 { - crc0 = __crc32d(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - while len > 0 { - crc0 = __crc32b(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i neon -p crc32 -a v12e_v1 -#[inline] -#[target_feature(enable = "aes")] -unsafe fn crc32_iso_hdlc_v12e_v1(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = __crc32b(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = __crc32d(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 192 { - let end = buf.add(len); - let limit = buf.add(len - 192); - - // First vector chunk - let mut x0 = vld1q_u64(buf as *const u64); - let mut x1 = vld1q_u64(buf.add(16) as *const u64); - let mut x2 = vld1q_u64(buf.add(32) as *const u64); - let mut x3 = vld1q_u64(buf.add(48) as *const u64); - let mut x4 = vld1q_u64(buf.add(64) as *const u64); - let mut x5 = vld1q_u64(buf.add(80) as *const u64); - let mut x6 = vld1q_u64(buf.add(96) as *const u64); - let mut x7 = vld1q_u64(buf.add(112) as *const u64); - let mut x8 = vld1q_u64(buf.add(128) as *const u64); - let mut x9 = vld1q_u64(buf.add(144) as *const u64); - let mut x10 = vld1q_u64(buf.add(160) as *const u64); - let mut x11 = vld1q_u64(buf.add(176) as *const u64); - - // ISO-HDLC specific constants for small implementation - let k_vals: [u64; 2] = [0x596c8d81, 0xf5e48c85]; - let mut k = vld1q_u64(k_vals.as_ptr()); - - // Create CRC vector and XOR with first vector - let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); - x0 = veorq_u64(crc_vec, x0); - buf = buf.add(192); - - // Main loop - while buf <= limit { - let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); - x0 = clmul_hi_e(x0, k, y0); - let y1 = clmul_lo_e(x1, k, vld1q_u64(buf.add(16) as *const u64)); - x1 = clmul_hi_e(x1, k, y1); - let y2 = clmul_lo_e(x2, k, vld1q_u64(buf.add(32) as *const u64)); - x2 = clmul_hi_e(x2, k, y2); - let y3 = clmul_lo_e(x3, k, vld1q_u64(buf.add(48) as *const u64)); - x3 = clmul_hi_e(x3, k, y3); - let y4 = clmul_lo_e(x4, k, vld1q_u64(buf.add(64) as *const u64)); - x4 = clmul_hi_e(x4, k, y4); - let y5 = clmul_lo_e(x5, k, vld1q_u64(buf.add(80) as *const u64)); - x5 = clmul_hi_e(x5, k, y5); - let y6 = clmul_lo_e(x6, k, vld1q_u64(buf.add(96) as *const u64)); - x6 = clmul_hi_e(x6, k, y6); - let y7 = clmul_lo_e(x7, k, vld1q_u64(buf.add(112) as *const u64)); - x7 = clmul_hi_e(x7, k, y7); - let y8 = clmul_lo_e(x8, k, vld1q_u64(buf.add(128) as *const u64)); - x8 = clmul_hi_e(x8, k, y8); - let y9 = clmul_lo_e(x9, k, vld1q_u64(buf.add(144) as *const u64)); - x9 = 
clmul_hi_e(x9, k, y9); - let y10 = clmul_lo_e(x10, k, vld1q_u64(buf.add(160) as *const u64)); - x10 = clmul_hi_e(x10, k, y10); - let y11 = clmul_lo_e(x11, k, vld1q_u64(buf.add(176) as *const u64)); - x11 = clmul_hi_e(x11, k, y11); - buf = buf.add(192); - } - - // Reduce x0 ... x11 to just x0 - let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x1); - x0 = clmul_hi_e(x0, k, y0); - let y2 = clmul_lo_e(x2, k, x3); - x2 = clmul_hi_e(x2, k, y2); - let y4 = clmul_lo_e(x4, k, x5); - x4 = clmul_hi_e(x4, k, y4); - let y6 = clmul_lo_e(x6, k, x7); - x6 = clmul_hi_e(x6, k, y6); - let y8 = clmul_lo_e(x8, k, x9); - x8 = clmul_hi_e(x8, k, y8); - let y10 = clmul_lo_e(x10, k, x11); - x10 = clmul_hi_e(x10, k, y10); - - let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x2); - x0 = clmul_hi_e(x0, k, y0); - let y4 = clmul_lo_e(x4, k, x6); - x4 = clmul_hi_e(x4, k, y4); - let y8 = clmul_lo_e(x8, k, x10); - x8 = clmul_hi_e(x8, k, y8); - - let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x4); - x0 = clmul_hi_e(x0, k, y0); - x4 = x8; - let y0 = clmul_lo_e(x0, k, x4); - x0 = clmul_hi_e(x0, k, y0); - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - len = end.offset_from(buf) as usize; - } - - if len >= 16 { - // First vector chunk - let mut x0 = vld1q_u64(buf as *const u64); - - let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; - let k = vld1q_u64(k_vals.as_ptr()); - - // Create CRC vector and XOR with first vector - let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); - x0 = veorq_u64(crc_vec, x0); - buf = buf.add(16); - len -= 16; - - // Main loop - while len >= 16 { - let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); - x0 = clmul_hi_e(x0, k, y0); - buf = buf.add(16); - len -= 16; - } - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - } - - while len >= 8 { - crc0 = __crc32d(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - while len > 0 { - crc0 = __crc32b(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::test::consts::TEST_CHECK_STRING; - use crc::{Crc, Table}; - use rand::{rng, Rng}; - - const RUST_CRC32_ISO_HDLC: Crc> = - Crc::>::new(&crc::CRC_32_ISO_HDLC); - - const RUST_CRC32_ISCSI: Crc> = Crc::>::new(&crc::CRC_32_ISCSI); - - #[test] - fn test_crc32_iso_hdlc_check() { - assert_eq!( - crc32_iso_hdlc(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, - 0xcbf43926 - ); - } - - #[test] - fn test_crc32_iso_hdlc_small_all_lengths() { - for len in 1..=255 { - crc32_iso_hdlc_random(len) - } - } - - #[test] - fn test_crc32_iso_hdlc_medium_lengths() { - // Test each length from 256 to 1024, which should fold and include handling remainders - for len in 256..=1024 { - crc32_iso_hdlc_random(len) - } - } - - #[test] - fn test_crc32_iso_hdlc_large_lengths() { - // Test 1 MiB just before, at, and just after the folding boundaries - for len in 1048575..1048577 { - crc32_iso_hdlc_random(len) - } - } - - #[test] - fn test_crc32_iscsi_check() { - assert_eq!( - crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, - 0xe3069283 - ); - } - - #[test] - fn test_crc32_iscsi_small_all_lengths() { - for len in 1..=255 { - crc32_iscsi_random(len); - } - 
} - - #[test] - fn test_crc32_iscsi_medium_lengths() { - // Test each length from 256 to 1024, which should fold and include handling remainders - for len in 256..=1024 { - crc32_iscsi_random(len); - } - } - - #[test] - fn test_crc32_iscsi_large_lengths() { - // Test 1 MiB just before, at, and just after the folding boundaries - for len in 1048575..1048577 { - crc32_iscsi_random(len); - } - } - - #[cfg(target_feature = "sha3")] - fn crc32_iso_hdlc_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISO_HDLC.checksum(&data); - - assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iso_hdlc_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - - assert_eq!( - crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } - - #[cfg(not(target_feature = "sha3"))] - fn crc32_iso_hdlc_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISO_HDLC.checksum(&data); - - assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } - - #[cfg(target_feature = "sha3")] - fn crc32_iscsi_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISCSI.checksum(&data); - - assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iscsi_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - - assert_eq!( - crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } - - #[cfg(not(target_feature = "sha3"))] - fn crc32_iscsi_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISCSI.checksum(&data); - - assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } -} diff --git a/src/crc32/fusion/aarch64/iscsi/crc_pmull.rs b/src/crc32/fusion/aarch64/iscsi/crc_pmull.rs new file mode 100644 index 0000000..1bb2473 --- /dev/null +++ b/src/crc32/fusion/aarch64/iscsi/crc_pmull.rs @@ -0,0 +1,179 @@ +//! Provides CRC-32/ISCSI calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "aarch64")] + +use crate::crc32::fusion::aarch64::{clmul_hi_and_xor, clmul_lo_and_xor}; +use std::arch::aarch64::{ + __crc32cb, __crc32cd, veorq_u64, vgetq_lane_u64, vld1q_u64, vmovq_n_u64, vsetq_lane_u64, +}; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i neon -p crc32c -a v12e_v1 +/// +/// Modified as necessary for this Rust implementation. 
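+///
+/// A minimal usage sketch: the caller must first confirm that the `crc` and
+/// `aes` target features are available (see the `#[target_feature]` attribute
+/// below), otherwise calling this function is undefined behavior. The expected
+/// result is the standard CRC-32/ISCSI check value for "123456789":
+///
+/// ```ignore
+/// let data = b"123456789";
+/// // CRC-32/ISCSI uses 0xffffffff as both the initial value and the final XOR.
+/// let crc = unsafe { crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) } ^ 0xffffffff;
+/// assert_eq!(crc, 0xe3069283);
+/// ```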
+#[inline] +#[target_feature(enable = "crc,aes")] +pub(crate) unsafe fn crc32_iscsi_v12e_v1(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let limit = buf.add(len - 192); + + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + let mut x1 = vld1q_u64(buf.add(16) as *const u64); + let mut x2 = vld1q_u64(buf.add(32) as *const u64); + let mut x3 = vld1q_u64(buf.add(48) as *const u64); + let mut x4 = vld1q_u64(buf.add(64) as *const u64); + let mut x5 = vld1q_u64(buf.add(80) as *const u64); + let mut x6 = vld1q_u64(buf.add(96) as *const u64); + let mut x7 = vld1q_u64(buf.add(112) as *const u64); + let mut x8 = vld1q_u64(buf.add(128) as *const u64); + let mut x9 = vld1q_u64(buf.add(144) as *const u64); + let mut x10 = vld1q_u64(buf.add(160) as *const u64); + let mut x11 = vld1q_u64(buf.add(176) as *const u64); + + let k_vals: [u64; 2] = [0xa87ab8a8, 0xab7aff2a]; + let mut k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(192); + + // Main loop + while buf <= limit { + let y0 = clmul_lo_and_xor(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_and_xor(x0, k, y0); + let y1 = clmul_lo_and_xor(x1, k, vld1q_u64(buf.add(16) as *const u64)); + x1 = clmul_hi_and_xor(x1, k, y1); + let y2 = clmul_lo_and_xor(x2, k, vld1q_u64(buf.add(32) as *const u64)); + x2 = clmul_hi_and_xor(x2, k, y2); + let y3 = clmul_lo_and_xor(x3, k, vld1q_u64(buf.add(48) as *const u64)); + x3 = clmul_hi_and_xor(x3, k, y3); + let y4 = clmul_lo_and_xor(x4, k, vld1q_u64(buf.add(64) as *const u64)); + x4 = clmul_hi_and_xor(x4, k, y4); + let y5 = clmul_lo_and_xor(x5, k, vld1q_u64(buf.add(80) as *const u64)); + x5 = clmul_hi_and_xor(x5, k, y5); + let y6 = clmul_lo_and_xor(x6, k, vld1q_u64(buf.add(96) as *const u64)); + x6 = clmul_hi_and_xor(x6, k, y6); + let y7 = clmul_lo_and_xor(x7, k, vld1q_u64(buf.add(112) as *const u64)); + x7 = clmul_hi_and_xor(x7, k, y7); + let y8 = clmul_lo_and_xor(x8, k, vld1q_u64(buf.add(128) as *const u64)); + x8 = clmul_hi_and_xor(x8, k, y8); + let y9 = clmul_lo_and_xor(x9, k, vld1q_u64(buf.add(144) as *const u64)); + x9 = clmul_hi_and_xor(x9, k, y9); + let y10 = clmul_lo_and_xor(x10, k, vld1q_u64(buf.add(160) as *const u64)); + x10 = clmul_hi_and_xor(x10, k, y10); + let y11 = clmul_lo_and_xor(x11, k, vld1q_u64(buf.add(176) as *const u64)); + x11 = clmul_hi_and_xor(x11, k, y11); + buf = buf.add(192); + } + + // Reduce x0 ... 
x11 to just x0 + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x1); + x0 = clmul_hi_and_xor(x0, k, y0); + let y2 = clmul_lo_and_xor(x2, k, x3); + x2 = clmul_hi_and_xor(x2, k, y2); + let y4 = clmul_lo_and_xor(x4, k, x5); + x4 = clmul_hi_and_xor(x4, k, y4); + let y6 = clmul_lo_and_xor(x6, k, x7); + x6 = clmul_hi_and_xor(x6, k, y6); + let y8 = clmul_lo_and_xor(x8, k, x9); + x8 = clmul_hi_and_xor(x8, k, y8); + let y10 = clmul_lo_and_xor(x10, k, x11); + x10 = clmul_hi_and_xor(x10, k, y10); + + let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x2); + x0 = clmul_hi_and_xor(x0, k, y0); + let y4 = clmul_lo_and_xor(x4, k, x6); + x4 = clmul_hi_and_xor(x4, k, y4); + let y8 = clmul_lo_and_xor(x8, k, x10); + x8 = clmul_hi_and_xor(x8, k, y8); + + let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x4); + x0 = clmul_hi_and_xor(x0, k, y0); + x4 = x8; + let y0 = clmul_lo_and_xor(x0, k, x4); + x0 = clmul_hi_and_xor(x0, k, y0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); + len = end.offset_from(buf) as usize; + } + + if len >= 16 { + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + let k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(16); + len -= 16; + + // Main loop + while len >= 16 { + let y0 = clmul_lo_and_xor(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_and_xor(x0, k, y0); + buf = buf.add(16); + len -= 16; + } + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); + } + + while len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/aarch64/iscsi/crc_pmull_sha3.rs b/src/crc32/fusion/aarch64/iscsi/crc_pmull_sha3.rs new file mode 100644 index 0000000..137cb5c --- /dev/null +++ b/src/crc32/fusion/aarch64/iscsi/crc_pmull_sha3.rs @@ -0,0 +1,266 @@ +//! Provides CRC-32/ISCSI calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "aarch64")] + +use crate::crc32::fusion::aarch64::{clmul_hi, clmul_lo, clmul_scalar}; +use std::arch::aarch64::{ + __crc32cb, __crc32cd, __crc32cw, uint64x2_t, veor3q_u64, veorq_u64, vgetq_lane_u64, vld1q_u64, + vmov_n_u64, vmull_p8, vreinterpret_p8_u64, vreinterpretq_u64_p16, +}; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i neon_eor3 -p crc32c -a v9s3x2e_s3 +/// +/// Modified as necessary for this Rust implementation. 
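+///
+/// This variant interleaves three scalar CRC streams (`crc0`..`crc2`) with nine
+/// 16-byte PMULL folding accumulators combined via the SHA3 EOR3 instruction,
+/// then merges the streams with `crc_shift_iscsi`, a carry-less multiply by
+/// x^n mod P. A minimal usage sketch: the caller must first confirm the `crc`,
+/// `aes`, and `sha3` target features, otherwise the call is undefined behavior:
+///
+/// ```ignore
+/// let data = b"123456789";
+/// // Same convention as the PMULL-only variant: init and final XOR are 0xffffffff.
+/// let crc =
+///     unsafe { crc32_iscsi_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) } ^ 0xffffffff;
+/// assert_eq!(crc, 0xe3069283);
+/// ```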
+#[inline] +#[target_feature(enable = "crc,aes,sha3")] +pub(crate) unsafe fn crc32_iscsi_eor3_v9s3x2e_s3( + mut crc0: u32, + mut buf: *const u8, + mut len: usize, +) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let blk = len / 192; + let klen = blk * 16; + let buf2 = buf.add(klen * 3); + let limit = buf.add(klen).sub(32); + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk + let mut x0 = vld1q_u64(buf2 as *const u64); + let mut x1 = vld1q_u64(buf2.add(16) as *const u64); + let mut x2 = vld1q_u64(buf2.add(32) as *const u64); + let mut x3 = vld1q_u64(buf2.add(48) as *const u64); + let mut x4 = vld1q_u64(buf2.add(64) as *const u64); + let mut x5 = vld1q_u64(buf2.add(80) as *const u64); + let mut x6 = vld1q_u64(buf2.add(96) as *const u64); + let mut x7 = vld1q_u64(buf2.add(112) as *const u64); + let mut x8 = vld1q_u64(buf2.add(128) as *const u64); + + let k_vals: [u64; 2] = [0x7e908048, 0xc96cfdc0]; + let mut k = vld1q_u64(k_vals.as_ptr()); + let mut buf2 = buf2.add(144); + + // Main loop + while buf <= limit { + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y1 = clmul_lo(x1, k); + x1 = clmul_hi(x1, k); + let y2 = clmul_lo(x2, k); + x2 = clmul_hi(x2, k); + let y3 = clmul_lo(x3, k); + x3 = clmul_hi(x3, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + let y5 = clmul_lo(x5, k); + x5 = clmul_hi(x5, k); + let y6 = clmul_lo(x6, k); + x6 = clmul_hi(x6, k); + let y7 = clmul_lo(x7, k); + x7 = clmul_hi(x7, k); + let y8 = clmul_lo(x8, k); + x8 = clmul_hi(x8, k); + + x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); + x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); + x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); + x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); + x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); + x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); + x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); + x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); + x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); + + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + buf = buf.add(16); + buf2 = buf2.add(144); + } + + // Reduce x0 ... 
x8 to just x0 + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, x1); + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + x7 = x8; + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y2 = clmul_lo(x2, k); + x2 = clmul_hi(x2, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + let y6 = clmul_lo(x6, k); + x6 = clmul_hi(x6, k); + + x0 = veor3q_u64(x0, y0, x1); + x2 = veor3q_u64(x2, y2, x3); + x4 = veor3q_u64(x4, y4, x5); + x6 = veor3q_u64(x6, y6, x7); + + let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + + x0 = veor3q_u64(x0, y0, x2); + x4 = veor3q_u64(x4, y4, x6); + + let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, x4); + + // Final scalar chunk + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + let vc0 = crc_shift_iscsi(crc0, klen * 2 + blk * 144); + let vc1 = crc_shift_iscsi(crc1, klen + blk * 144); + let vc2 = crc_shift_iscsi(crc2, blk * 144); + let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1)); + + buf = buf2; + len = end.offset_from(buf) as usize; + } + + if len >= 32 { + let klen = ((len - 8) / 24) * 8; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // Main loop + loop { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + buf = buf.add(8); + len -= 24; + if len < 32 { + break; + } + } + + let vc0 = crc_shift_iscsi(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi(crc1, klen + 8); + let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = __crc32cd(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + while len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn crc_shift_iscsi(crc: u32, nbytes: usize) -> uint64x2_t { + clmul_scalar(crc, xnmodp_crc32_iscsi((nbytes * 8 - 33) as u64)) +} + +// x^n mod P, in log(n) time +#[inline] +#[target_feature(enable = "crc,aes")] +unsafe fn xnmodp_crc32_iscsi(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) - 16; + } + stack = !stack; + acc = 0x80000000u32 >> (n & 31); + n >>= 5; + + while n > 0 { + // ARM CRC32 instruction + acc = __crc32cw(acc, 0); + n -= 1; + } + + while { + low = (stack & 1) as u32; + stack >>= 1; + stack != 0 + } { + // Convert to polynomial type and square it + let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); + let squared = vmull_p8(x, x); + let y = 
vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); + acc = __crc32cd(0, y << low); + } + acc +} diff --git a/src/crc32/fusion/aarch64/iscsi/mod.rs b/src/crc32/fusion/aarch64/iscsi/mod.rs new file mode 100644 index 0000000..653ea8a --- /dev/null +++ b/src/crc32/fusion/aarch64/iscsi/mod.rs @@ -0,0 +1,9 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides aarch64-specific implementations of CRC-32/ISCSI calculations using +//! fusion techniques. + +#![cfg(target_arch = "aarch64")] + +pub(crate) mod crc_pmull; +pub(crate) mod crc_pmull_sha3; diff --git a/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull.rs b/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull.rs new file mode 100644 index 0000000..b57a959 --- /dev/null +++ b/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull.rs @@ -0,0 +1,184 @@ +//! Provides CRC-32/ISO-HDLC calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "aarch64")] + +use crate::crc32::fusion::aarch64::{clmul_hi_and_xor, clmul_lo_and_xor}; +use std::arch::aarch64::{ + __crc32b, __crc32d, veorq_u64, vgetq_lane_u64, vld1q_u64, vmovq_n_u64, vsetq_lane_u64, +}; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i neon -p crc32 -a v12e_v1 +/// +/// Modified as necessary for this Rust implementation. +#[inline] +#[target_feature(enable = "crc,aes")] +pub(crate) unsafe fn crc32_iso_hdlc_v12e_v1( + mut crc0: u32, + mut buf: *const u8, + mut len: usize, +) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let limit = buf.add(len - 192); + + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + let mut x1 = vld1q_u64(buf.add(16) as *const u64); + let mut x2 = vld1q_u64(buf.add(32) as *const u64); + let mut x3 = vld1q_u64(buf.add(48) as *const u64); + let mut x4 = vld1q_u64(buf.add(64) as *const u64); + let mut x5 = vld1q_u64(buf.add(80) as *const u64); + let mut x6 = vld1q_u64(buf.add(96) as *const u64); + let mut x7 = vld1q_u64(buf.add(112) as *const u64); + let mut x8 = vld1q_u64(buf.add(128) as *const u64); + let mut x9 = vld1q_u64(buf.add(144) as *const u64); + let mut x10 = vld1q_u64(buf.add(160) as *const u64); + let mut x11 = vld1q_u64(buf.add(176) as *const u64); + + // ISO-HDLC specific constants for small implementation + let k_vals: [u64; 2] = [0x596c8d81, 0xf5e48c85]; + let mut k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(192); + + // Main loop + while buf <= limit { + let y0 = clmul_lo_and_xor(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_and_xor(x0, k, y0); + let y1 = clmul_lo_and_xor(x1, k, vld1q_u64(buf.add(16) as *const u64)); + x1 = clmul_hi_and_xor(x1, k, y1); + let y2 = 
clmul_lo_and_xor(x2, k, vld1q_u64(buf.add(32) as *const u64)); + x2 = clmul_hi_and_xor(x2, k, y2); + let y3 = clmul_lo_and_xor(x3, k, vld1q_u64(buf.add(48) as *const u64)); + x3 = clmul_hi_and_xor(x3, k, y3); + let y4 = clmul_lo_and_xor(x4, k, vld1q_u64(buf.add(64) as *const u64)); + x4 = clmul_hi_and_xor(x4, k, y4); + let y5 = clmul_lo_and_xor(x5, k, vld1q_u64(buf.add(80) as *const u64)); + x5 = clmul_hi_and_xor(x5, k, y5); + let y6 = clmul_lo_and_xor(x6, k, vld1q_u64(buf.add(96) as *const u64)); + x6 = clmul_hi_and_xor(x6, k, y6); + let y7 = clmul_lo_and_xor(x7, k, vld1q_u64(buf.add(112) as *const u64)); + x7 = clmul_hi_and_xor(x7, k, y7); + let y8 = clmul_lo_and_xor(x8, k, vld1q_u64(buf.add(128) as *const u64)); + x8 = clmul_hi_and_xor(x8, k, y8); + let y9 = clmul_lo_and_xor(x9, k, vld1q_u64(buf.add(144) as *const u64)); + x9 = clmul_hi_and_xor(x9, k, y9); + let y10 = clmul_lo_and_xor(x10, k, vld1q_u64(buf.add(160) as *const u64)); + x10 = clmul_hi_and_xor(x10, k, y10); + let y11 = clmul_lo_and_xor(x11, k, vld1q_u64(buf.add(176) as *const u64)); + x11 = clmul_hi_and_xor(x11, k, y11); + buf = buf.add(192); + } + + // Reduce x0 ... x11 to just x0 + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x1); + x0 = clmul_hi_and_xor(x0, k, y0); + let y2 = clmul_lo_and_xor(x2, k, x3); + x2 = clmul_hi_and_xor(x2, k, y2); + let y4 = clmul_lo_and_xor(x4, k, x5); + x4 = clmul_hi_and_xor(x4, k, y4); + let y6 = clmul_lo_and_xor(x6, k, x7); + x6 = clmul_hi_and_xor(x6, k, y6); + let y8 = clmul_lo_and_xor(x8, k, x9); + x8 = clmul_hi_and_xor(x8, k, y8); + let y10 = clmul_lo_and_xor(x10, k, x11); + x10 = clmul_hi_and_xor(x10, k, y10); + + let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x2); + x0 = clmul_hi_and_xor(x0, k, y0); + let y4 = clmul_lo_and_xor(x4, k, x6); + x4 = clmul_hi_and_xor(x4, k, y4); + let y8 = clmul_lo_and_xor(x8, k, x10); + x8 = clmul_hi_and_xor(x8, k, y8); + + let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x4); + x0 = clmul_hi_and_xor(x0, k, y0); + x4 = x8; + let y0 = clmul_lo_and_xor(x0, k, x4); + x0 = clmul_hi_and_xor(x0, k, y0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); + len = end.offset_from(buf) as usize; + } + + if len >= 16 { + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + let k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(16); + len -= 16; + + // Main loop + while len >= 16 { + let y0 = clmul_lo_and_xor(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_and_xor(x0, k, y0); + buf = buf.add(16); + len -= 16; + } + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); + } + + while len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull_sha3.rs b/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull_sha3.rs new file mode 100644 index 0000000..3483774 --- /dev/null +++ 
b/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull_sha3.rs @@ -0,0 +1,267 @@ +//! Provides CRC-32/ISO-HDLC calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "aarch64")] + +use crate::crc32::fusion::aarch64::{clmul_hi, clmul_lo, clmul_scalar}; +use std::arch::aarch64::{ + __crc32b, __crc32d, __crc32w, uint64x2_t, veor3q_u64, veorq_u64, vgetq_lane_u64, vld1q_u64, + vmov_n_u64, vmull_p8, vreinterpret_p8_u64, vreinterpretq_u64_p16, +}; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 +/// +/// Modified as necessary for this Rust implementation. +#[inline] +#[target_feature(enable = "crc,aes,sha3")] +pub(crate) unsafe fn crc32_iso_hdlc_eor3_v9s3x2e_s3( + mut crc0: u32, + mut buf: *const u8, + mut len: usize, +) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let blk = len / 192; + let klen = blk * 16; + let buf2 = buf.add(klen * 3); + let limit = buf.add(klen).sub(32); + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk + let mut x0 = vld1q_u64(buf2 as *const u64); + let mut x1 = vld1q_u64(buf2.add(16) as *const u64); + let mut x2 = vld1q_u64(buf2.add(32) as *const u64); + let mut x3 = vld1q_u64(buf2.add(48) as *const u64); + let mut x4 = vld1q_u64(buf2.add(64) as *const u64); + let mut x5 = vld1q_u64(buf2.add(80) as *const u64); + let mut x6 = vld1q_u64(buf2.add(96) as *const u64); + let mut x7 = vld1q_u64(buf2.add(112) as *const u64); + let mut x8 = vld1q_u64(buf2.add(128) as *const u64); + + // ISO-HDLC specific constants + let k_vals: [u64; 2] = [0x26b70c3d, 0x3f41287a]; + let mut k = vld1q_u64(k_vals.as_ptr()); + let mut buf2 = buf2.add(144); + + // Main loop + while buf <= limit { + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y1 = clmul_lo(x1, k); + x1 = clmul_hi(x1, k); + let y2 = clmul_lo(x2, k); + x2 = clmul_hi(x2, k); + let y3 = clmul_lo(x3, k); + x3 = clmul_hi(x3, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + let y5 = clmul_lo(x5, k); + x5 = clmul_hi(x5, k); + let y6 = clmul_lo(x6, k); + x6 = clmul_hi(x6, k); + let y7 = clmul_lo(x7, k); + x7 = clmul_hi(x7, k); + let y8 = clmul_lo(x8, k); + x8 = clmul_hi(x8, k); + + x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); + x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); + x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); + x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); + x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); + x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); + x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); + x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); + x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); + + crc0 = 
__crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + buf = buf.add(16); + buf2 = buf2.add(144); + } + + // Reduce x0 ... x8 to just x0 + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, x1); + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + x7 = x8; + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y2 = clmul_lo(x2, k); + x2 = clmul_hi(x2, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + let y6 = clmul_lo(x6, k); + x6 = clmul_hi(x6, k); + + x0 = veor3q_u64(x0, y0, x1); + x2 = veor3q_u64(x2, y2, x3); + x4 = veor3q_u64(x4, y4, x5); + x6 = veor3q_u64(x6, y6, x7); + + let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + + x0 = veor3q_u64(x0, y0, x2); + x4 = veor3q_u64(x4, y4, x6); + + let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, x4); + + // Final scalar chunk + crc0 = __crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + blk * 144); + let vc1 = crc_shift_iso_hdlc(crc1, klen + blk * 144); + let vc2 = crc_shift_iso_hdlc(crc2, blk * 144); + let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); + + buf = buf2; + len = end.offset_from(buf) as usize; + } + + if len >= 32 { + let klen = ((len - 8) / 24) * 8; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // Main loop + loop { + crc0 = __crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + buf = buf.add(8); + len -= 24; + if len < 32 { + break; + } + } + + let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + 8); + let vc1 = crc_shift_iso_hdlc(crc1, klen + 8); + let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = __crc32d(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + while len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn crc_shift_iso_hdlc(crc: u32, nbytes: usize) -> uint64x2_t { + clmul_scalar(crc, xnmodp_iso_hdlc((nbytes * 8 - 33) as u64)) +} + +// x^n mod P, in log(n) time +#[inline] +#[target_feature(enable = "crc,aes")] +unsafe fn xnmodp_iso_hdlc(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) 
- 16; + } + stack = !stack; + acc = 0x80000000u32 >> (n & 31); + n >>= 5; + + while n > 0 { + // ARM CRC32 instruction (ISO-HDLC uses standard CRC32, not CRC32C) + acc = __crc32w(acc, 0); + n -= 1; + } + + while { + low = (stack & 1) as u32; + stack >>= 1; + stack != 0 + } { + // Convert to polynomial type and square it + let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); + let squared = vmull_p8(x, x); + let y = vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); + acc = __crc32d(0, y << low); + } + acc +} diff --git a/src/crc32/fusion/aarch64/iso_hdlc/mod.rs b/src/crc32/fusion/aarch64/iso_hdlc/mod.rs new file mode 100644 index 0000000..2f3db58 --- /dev/null +++ b/src/crc32/fusion/aarch64/iso_hdlc/mod.rs @@ -0,0 +1,9 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides aarch64-specific implementations of CRC-32/ISO-HDLC calculations using +//! fusion techniques. + +#![cfg(target_arch = "aarch64")] + +pub(crate) mod crc_pmull; +pub(crate) mod crc_pmull_sha3; diff --git a/src/crc32/fusion/aarch64/mod.rs b/src/crc32/fusion/aarch64/mod.rs new file mode 100644 index 0000000..de15ab4 --- /dev/null +++ b/src/crc32/fusion/aarch64/mod.rs @@ -0,0 +1,275 @@ +//! Provides CRC-32/ISCSI and CRC-32/ISO-HDLC calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "aarch64")] + +mod iscsi; +mod iso_hdlc; + +use std::arch::aarch64::*; +use std::arch::is_aarch64_feature_detected; + +use iscsi::crc_pmull::crc32_iscsi_v12e_v1; +use iscsi::crc_pmull_sha3::crc32_iscsi_eor3_v9s3x2e_s3; +use iso_hdlc::crc_pmull::crc32_iso_hdlc_v12e_v1; +use iso_hdlc::crc_pmull_sha3::crc32_iso_hdlc_eor3_v9s3x2e_s3; + +#[inline(always)] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + if is_aarch64_feature_detected!("sha3") { + unsafe { crc32_iscsi_aes_sha3(crc, data) } + } else { + unsafe { crc32_iscsi_aes(crc, data) } + } +} + +#[inline(always)] +pub fn crc32_iso_hdlc(crc: u32, data: &[u8]) -> u32 { + if is_aarch64_feature_detected!("sha3") { + unsafe { crc32_iso_hdlc_aes_sha3(crc, data) } + } else { + unsafe { crc32_iso_hdlc_aes(crc, data) } + } +} + +/// Safe wrapper for CRC32 iSCSI calculation +#[inline] +#[target_feature(enable = "crc,aes,sha3")] +unsafe fn crc32_iscsi_aes_sha3(crc: u32, data: &[u8]) -> u32 { + unsafe { + const LARGE_BUFFER_THRESHOLD: usize = 1024; + + // Select implementation based on buffer size + if data.len() <= LARGE_BUFFER_THRESHOLD { + crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) + } else { + crc32_iscsi_eor3_v9s3x2e_s3(crc, data.as_ptr(), data.len()) + } + } +} + +#[inline] +#[target_feature(enable = "crc,aes")] +unsafe fn crc32_iscsi_aes(crc: u32, data: &[u8]) -> u32 { + unsafe { crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) } +} + +/// Safe wrapper for CRC32 ISO-HDLC calculation +#[inline] +#[target_feature(enable = "crc,aes,sha3")] +unsafe fn crc32_iso_hdlc_aes_sha3(crc: u32, data: &[u8]) -> u32 { + unsafe { + const LARGE_BUFFER_THRESHOLD: usize = 1024; + + // Select implementation based on buffer size + if data.len() <= LARGE_BUFFER_THRESHOLD { + crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) + } else { + crc32_iso_hdlc_eor3_v9s3x2e_s3(crc, 
data.as_ptr(), data.len())
+        }
+    }
+}
+
+#[inline]
+#[target_feature(enable = "crc,aes")]
+unsafe fn crc32_iso_hdlc_aes(crc: u32, data: &[u8]) -> u32 {
+    unsafe { crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) }
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+unsafe fn clmul_lo(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+    // Polynomial multiply low parts - convert u128 result to uint64x2_t
+    let result = vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0));
+    vreinterpretq_u64_p128(result)
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+unsafe fn clmul_hi(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+    // Polynomial multiply high parts - convert u128 result to uint64x2_t
+    let result = vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1));
+    vreinterpretq_u64_p128(result)
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+unsafe fn clmul_scalar(a: u32, b: u32) -> uint64x2_t {
+    // Polynomial multiply scalars - convert u128 result to uint64x2_t
+    let result = vmull_p64(a as u64, b as u64);
+    vreinterpretq_u64_p128(result)
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+unsafe fn clmul_lo_and_xor(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
+    veorq_u64(clmul_lo(a, b), c)
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+unsafe fn clmul_hi_and_xor(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
+    veorq_u64(clmul_hi(a, b), c)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::test::consts::TEST_CHECK_STRING;
+    use crc::{Crc, Table};
+    use rand::{rng, Rng};
+
+    const RUST_CRC32_ISO_HDLC: Crc<u32, Table<16>> =
+        Crc::<u32, Table<16>>::new(&crc::CRC_32_ISO_HDLC);
+
+    const RUST_CRC32_ISCSI: Crc<u32, Table<16>> = Crc::<u32, Table<16>>::new(&crc::CRC_32_ISCSI);
+
+    #[test]
+    fn test_crc32_iso_hdlc_check() {
+        assert_eq!(
+            crc32_iso_hdlc(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff,
+            0xcbf43926
+        );
+    }
+
+    #[test]
+    fn test_crc32_iso_hdlc_small_all_lengths() {
+        for len in 1..=255 {
+            crc32_iso_hdlc_random(len)
+        }
+    }
+
+    #[test]
+    fn test_crc32_iso_hdlc_medium_lengths() {
+        // Test each length from 256 to 1024, which should exercise folding and remainder handling
+        for len in 256..=1024 {
+            crc32_iso_hdlc_random(len)
+        }
+    }
+
+    #[test]
+    fn test_crc32_iso_hdlc_large_lengths() {
+        // Test 1 MiB just before, at, and just after the folding boundaries
+        for len in 1048575..=1048577 {
+            crc32_iso_hdlc_random(len)
+        }
+    }
+
+    #[test]
+    fn test_crc32_iscsi_check() {
+        assert_eq!(
+            crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff,
+            0xe3069283
+        );
+    }
+
+    #[test]
+    fn test_crc32_iscsi_small_all_lengths() {
+        for len in 1..=255 {
+            crc32_iscsi_random(len);
+        }
+    }
+
+    #[test]
+    fn test_crc32_iscsi_medium_lengths() {
+        // Test each length from 256 to 1024, which should exercise folding and remainder handling
+        for len in 256..=1024 {
+            crc32_iscsi_random(len);
+        }
+    }
+
+    #[test]
+    fn test_crc32_iscsi_large_lengths() {
+        // Test 1 MiB just before, at, and just after the folding boundaries
+        for len in 1048575..=1048577 {
+            crc32_iscsi_random(len);
+        }
+    }
+
+    #[cfg(target_feature = "sha3")]
+    fn crc32_iso_hdlc_random(len: usize) {
+        let mut data = vec![0u8; len];
+        rng().fill(&mut data[..]);
+
+        let checksum = RUST_CRC32_ISO_HDLC.checksum(&data);
+
+        assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum);
+
+        unsafe {
+            assert_eq!(
+                crc32_iso_hdlc_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff,
+                checksum
+            );
+
+            assert_eq!(
+                crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff,
+                checksum
+            );
+        }
+    }
+
+    #[cfg(not(target_feature =
"sha3"))] + fn crc32_iso_hdlc_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISO_HDLC.checksum(&data); + + assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + #[cfg(target_feature = "sha3")] + fn crc32_iscsi_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISCSI.checksum(&data); + + assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iscsi_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + + assert_eq!( + crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + #[cfg(not(target_feature = "sha3"))] + fn crc32_iscsi_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISCSI.checksum(&data); + + assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } +} diff --git a/src/crc32/fusion/mod.rs b/src/crc32/fusion/mod.rs index d75a64b..774ce3b 100644 --- a/src/crc32/fusion/mod.rs +++ b/src/crc32/fusion/mod.rs @@ -11,24 +11,23 @@ mod aarch64; mod x86; +/// Only AArch64 has native CRC-32/ISO-HDLC instructions #[inline(always)] -#[allow(unused)] +#[cfg(target_arch = "aarch64")] pub(crate) fn crc32_iso_hdlc(state: u32, data: &[u8]) -> u32 { - #[cfg(target_arch = "aarch64")] - return aarch64::crc32_iso_hdlc(state, data); - - #[cfg(not(target_arch = "aarch64"))] - panic!("CRC-32/ISO-HDLC with fusion is only supported on AArch64 architecture"); + aarch64::crc32_iso_hdlc(state, data) } +/// Both AArch64 and x86 have native CRC-32/ISCSI instructions #[inline(always)] pub(crate) fn crc32_iscsi(state: u32, data: &[u8]) -> u32 { #[cfg(target_arch = "aarch64")] - return aarch64::crc32_iscsi(state, data); - - #[cfg(target_arch = "x86_64")] - return x86::crc32_iscsi(state, data); + { + aarch64::crc32_iscsi(state, data) + } - #[cfg(all(not(target_arch = "aarch64"), not(target_arch = "x86_64")))] - panic!("CRC-32/ISCSI with fusion is only supported on AArch64 and X86_64 architectures"); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + x86::crc32_iscsi(state, data) + } } diff --git a/src/crc32/fusion/x86.rs b/src/crc32/fusion/x86.rs deleted file mode 100644 index c288072..0000000 --- a/src/crc32/fusion/x86.rs +++ /dev/null @@ -1,740 +0,0 @@ -//! Provides CRC-32/ISCSI and CRC-32/ISO-HDLC calculations using a fusion of native CLMUL -//! instructions and native CRC calculation instructions on x86_64. -//! -//! https://www.corsix.org/content/fast-crc32c-4k -//! https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq -//! -//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -//! with the help of Claude.ai using: -//! -//! ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 -//! ./generate -i avx512 -p crc32c -a v4s3x3 -//! ./generate -i sse -p crc32c -a v4s3x3 -//! -//! Modified as necessary for this Rust implementation. -//! -//! MIT licensed. 
- -#![cfg(target_arch = "x86_64")] - -use std::arch::x86_64::*; - -/// Safe wrapper for CRC32 iSCSI calculation using AVX-512 -#[rustversion::before(1.89)] -#[inline(always)] -pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { - unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } -} - -#[rustversion::since(1.89)] -#[inline(always)] -pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { - if is_x86_feature_detected!("vpclmulqdq") - && is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("avx512vl") - { - unsafe { - return crc32_iscsi_avx512_vpclmulqdq_v3x2(crc, data.as_ptr(), data.len()); - } - } - - if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { - unsafe { - return crc32_iscsi_avx512_v4s3x3(crc, data.as_ptr(), data.len()); - } - } - - unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } -} - -#[rustversion::since(1.89)] -#[inline] -#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq")] -unsafe fn clmul_lo_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { - _mm512_clmulepi64_epi128(a, b, 0) -} - -#[rustversion::since(1.89)] -#[inline] -#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq")] -unsafe fn clmul_hi_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { - _mm512_clmulepi64_epi128(a, b, 17) -} - -#[inline] -#[target_feature(enable = "pclmulqdq")] -unsafe fn clmul_lo_sse(a: __m128i, b: __m128i) -> __m128i { - _mm_clmulepi64_si128(a, b, 0) -} - -#[inline] -#[target_feature(enable = "pclmulqdq")] -unsafe fn clmul_hi_sse(a: __m128i, b: __m128i) -> __m128i { - _mm_clmulepi64_si128(a, b, 17) -} - -#[inline] -#[target_feature(enable = "pclmulqdq")] -unsafe fn clmul_scalar_sse(a: u32, b: u32) -> __m128i { - _mm_clmulepi64_si128(_mm_cvtsi32_si128(a as i32), _mm_cvtsi32_si128(b as i32), 0) -} - -// x^n mod P, in log(n) time -#[target_feature(enable = "sse4.2,pclmulqdq")] -unsafe fn xnmodp_iscsi_sse(mut n: u64) -> u32 { - let mut stack = !1u64; - let mut acc: u32; - let mut low: u32; - - while n > 191 { - stack = (stack << 1) + (n & 1); - n = (n >> 1) - 16; - } - stack = !stack; - acc = 0x80000000u32 >> (n & 31); - n >>= 5; - - while n > 0 { - // Use hardware CRC32C instruction - acc = _mm_crc32_u32(acc, 0); - n -= 1; - } - - while { - low = (stack & 1) as u32; - stack >>= 1; - stack != 0 - } { - let x = _mm_cvtsi32_si128(acc as i32); - let y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0)) as u64; - acc = _mm_crc32_u64(0, y << low) as u32; - } - acc -} - -#[inline] -#[target_feature(enable = "pclmulqdq")] -unsafe fn crc_shift_iscsi_sse(crc: u32, nbytes: usize) -> __m128i { - clmul_scalar_sse(crc, xnmodp_iscsi_sse((nbytes * 8 - 33) as u64)) -} - -#[inline] -#[target_feature(enable = "sse4.1")] -unsafe fn mm_extract_epi64(val: __m128i, idx: i32) -> u64 { - if idx == 0 { - _mm_cvtsi128_si64(val) as u64 - } else { - _mm_cvtsi128_si64(_mm_srli_si128(val, 8)) as u64 - } -} - -#[inline] -#[target_feature(enable = "sse4.2")] -unsafe fn mm_crc32_u64(crc: u32, val: u64) -> u32 { - _mm_crc32_u64(crc.into(), val) as u32 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 -#[rustversion::since(1.89)] -#[inline] -#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq,sse4.2")] -pub unsafe fn crc32_iscsi_avx512_vpclmulqdq_v3x2( - mut crc0: u32, - mut buf: *const u8, - mut len: usize, -) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = _mm_crc32_u8(crc0, 
*buf); - buf = buf.add(1); - len -= 1; - } - - // Align to 64-byte boundary (cache line) - while (buf as usize & 56) != 0 && len >= 8 { - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - buf = buf.add(8); - len -= 8; - } - - if len >= 384 { - // First vector chunk - load three 512-bit vectors (192 bytes total) - let mut x0 = _mm512_loadu_si512(buf as *const __m512i); - let mut x1 = _mm512_loadu_si512(buf.add(64) as *const __m512i); - let mut x2 = _mm512_loadu_si512(buf.add(128) as *const __m512i); - - // Create the multiplication constant vector - // Pattern: [0xa87ab8a8, 0, 0xab7aff2a, 0] repeated across all 128-bit lanes - let k_128 = _mm_setr_epi32(0xa87ab8a8u32 as i32, 0, 0xab7aff2au32 as i32, 0); - let mut k = _mm512_broadcast_i32x4(k_128); - - // XOR the CRC into the first vector's low 32 bits - let crc_vec = _mm512_castsi128_si512(_mm_cvtsi32_si128(crc0 as i32)); - x0 = _mm512_xor_si512(crc_vec, x0); - - // First round of polynomial multiplication - let mut y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - let mut y1 = clmul_lo_avx512_vpclmulqdq(x1, k); - x1 = clmul_hi_avx512_vpclmulqdq(x1, k); - let mut y2 = clmul_lo_avx512_vpclmulqdq(x2, k); - x2 = clmul_hi_avx512_vpclmulqdq(x2, k); - - // XOR with next chunk of data using ternary logic (A XOR B XOR C) - // 0x96 = A XOR B XOR C in ternary logic notation - x0 = _mm512_ternarylogic_epi64( - x0, - y0, - _mm512_loadu_si512(buf.add(192) as *const __m512i), - 0x96, - ); - x1 = _mm512_ternarylogic_epi64( - x1, - y1, - _mm512_loadu_si512(buf.add(256) as *const __m512i), - 0x96, - ); - x2 = _mm512_ternarylogic_epi64( - x2, - y2, - _mm512_loadu_si512(buf.add(320) as *const __m512i), - 0x96, - ); - - buf = buf.add(384); - len -= 384; - - // Main loop - process 384 bytes at a time - while len >= 384 { - // First folding step - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - y1 = clmul_lo_avx512_vpclmulqdq(x1, k); - x1 = clmul_hi_avx512_vpclmulqdq(x1, k); - y2 = clmul_lo_avx512_vpclmulqdq(x2, k); - x2 = clmul_hi_avx512_vpclmulqdq(x2, k); - - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512(buf as *const __m512i), 0x96); - x1 = _mm512_ternarylogic_epi64( - x1, - y1, - _mm512_loadu_si512(buf.add(64) as *const __m512i), - 0x96, - ); - x2 = _mm512_ternarylogic_epi64( - x2, - y2, - _mm512_loadu_si512(buf.add(128) as *const __m512i), - 0x96, - ); - - // Second folding step - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - y1 = clmul_lo_avx512_vpclmulqdq(x1, k); - x1 = clmul_hi_avx512_vpclmulqdq(x1, k); - y2 = clmul_lo_avx512_vpclmulqdq(x2, k); - x2 = clmul_hi_avx512_vpclmulqdq(x2, k); - - x0 = _mm512_ternarylogic_epi64( - x0, - y0, - _mm512_loadu_si512(buf.add(192) as *const __m512i), - 0x96, - ); - x1 = _mm512_ternarylogic_epi64( - x1, - y1, - _mm512_loadu_si512(buf.add(256) as *const __m512i), - 0x96, - ); - x2 = _mm512_ternarylogic_epi64( - x2, - y2, - _mm512_loadu_si512(buf.add(320) as *const __m512i), - 0x96, - ); - - buf = buf.add(384); - len -= 384; - } - - // Reduce x0, x1, x2 to just x0 - let k_128 = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); - k = _mm512_broadcast_i32x4(k_128); - - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); - x1 = x2; - - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); - - // Reduce 512 bits to 
128 bits - // Multiple reduction constants for different parts of the 512-bit vector - k = _mm512_setr_epi32( - 0x1c291d04u32 as i32, - 0, - 0xddc0152bu32 as i32, - 0, // Lane 0 - 0x3da6d0cbu32 as i32, - 0, - 0xba4fc28eu32 as i32, - 0, // Lane 1 - 0xf20c0dfeu32 as i32, - 0, - 0x493c7d27u32 as i32, - 0, // Lane 2 - 0, - 0, - 0, - 0, // Lane 3 (unused) - ); - - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - k = clmul_hi_avx512_vpclmulqdq(x0, k); - y0 = _mm512_xor_si512(y0, k); - - // Extract 128-bit lanes and combine them - let lane0 = _mm512_castsi512_si128(y0); - let lane1 = _mm512_extracti32x4_epi32(y0, 1); - let lane2 = _mm512_extracti32x4_epi32(y0, 2); - let lane3 = _mm512_extracti32x4_epi32(x0, 3); - - // Combine all lanes using ternary logic - let mut z0 = _mm_ternarylogic_epi64(lane0, lane1, lane2, 0x96); - z0 = _mm_xor_si128(z0, lane3); - - // Reduce 128 bits to 32 bits using CRC32 instructions - crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0) as u64) as u32; - crc0 = _mm_crc32_u64(crc0.into(), _mm_extract_epi64(z0, 1) as u64) as u32; - } - - // Process remaining 8-byte chunks - while len >= 8 { - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - buf = buf.add(8); - len -= 8; - } - - // Process remaining bytes - while len > 0 { - crc0 = _mm_crc32_u8(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i avx512 -p crc32c -a v4s3x3 -#[rustversion::since(1.89)] -#[inline] -#[target_feature(enable = "avx2,avx512f,avx512vl,pclmulqdq")] -pub unsafe fn crc32_iscsi_avx512_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary using hardware CRC32C instructions - while len > 0 && (buf as usize & 7) != 0 { - crc0 = _mm_crc32_u8(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - buf = buf.add(8); - len -= 8; - } - - if len >= 144 { - let blk = (len - 8) / 136; - let klen = blk * 24; - let buf2 = buf; - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // First vector chunk - load four 128-bit vectors (64 bytes total) - let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); - let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); - let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); - let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); - - // iSCSI-specific folding constant (different from ISO-HDLC) - let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); - - // XOR the CRC into the first vector's low 32 bits - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); - crc0 = 0; - - let mut buf2 = buf2.add(64); - len -= 136; - buf = buf.add(blk * 64); - - // Main loop - process 144 bytes at a time - while len >= 144 { - let y0 = clmul_lo_sse(x0, k); - x0 = clmul_hi_sse(x0, k); - let y1 = clmul_lo_sse(x1, k); - x1 = clmul_hi_sse(x1, k); - let y2 = clmul_lo_sse(x2, k); - x2 = clmul_hi_sse(x2, k); - let y3 = clmul_lo_sse(x3, k); - x3 = clmul_hi_sse(x3, k); - - // XOR with next chunk of data using ternary logic (A XOR B XOR C) - x0 = _mm_ternarylogic_epi64(x0, y0, _mm_loadu_si128(buf2 as *const __m128i), 0x96); - x1 = _mm_ternarylogic_epi64( - x1, - y1, - _mm_loadu_si128(buf2.add(16) as *const __m128i), - 0x96, - ); - x2 = _mm_ternarylogic_epi64( - x2, - y2, - _mm_loadu_si128(buf2.add(32) as *const __m128i), - 0x96, - ); - 
x3 = _mm_ternarylogic_epi64( - x3, - y3, - _mm_loadu_si128(buf2.add(48) as *const __m128i), - 0x96, - ); - - // Process scalar data in parallel using hardware CRC32C - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; - crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; - crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; - - buf = buf.add(24); - buf2 = buf2.add(64); - len -= 136; - } - - // Reduce x0 ... x3 to just x0 using iSCSI-specific constants - k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0); - - let y0 = clmul_lo_sse(x0, k); - x0 = clmul_hi_sse(x0, k); - let y2 = clmul_lo_sse(x2, k); - x2 = clmul_hi_sse(x2, k); - - x0 = _mm_ternarylogic_epi64(x0, y0, x1, 0x96); - x2 = _mm_ternarylogic_epi64(x2, y2, x3, 0x96); - - k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0); - - let y0 = clmul_lo_sse(x0, k); - x0 = clmul_hi_sse(x0, k); - x0 = _mm_ternarylogic_epi64(x0, y0, x2, 0x96); - - // Final scalar chunk - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; - crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; - crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; - buf = buf.add(24); - - let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8); - let vc1 = crc_shift_iscsi_sse(crc1, klen + 8); - let mut vc = _mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0) as u64; - - // Reduce 128 bits to 32 bits, and multiply by x^32 - let x0_low = _mm_extract_epi64(x0, 0) as u64; - let x0_high = _mm_extract_epi64(x0, 1) as u64; - let vec_crc = _mm_crc32_u64(_mm_crc32_u64(0, x0_low), x0_high); - vc ^= _mm_extract_epi64(crc_shift_iscsi_sse(vec_crc as u32, klen * 3 + 8), 0) as u64; - - // Final 8 bytes - buf = buf.add(klen * 2); - crc0 = crc2; - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64) ^ vc) as u32; - buf = buf.add(8); - len -= 8; - } - - // Process remaining 8-byte chunks using hardware CRC32C - while len >= 8 { - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - buf = buf.add(8); - len -= 8; - } - - // Process remaining bytes using hardware CRC32C - while len > 0 { - crc0 = _mm_crc32_u8(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i sse -p crc32c -a v4s3x3 -#[inline] -#[target_feature(enable = "sse4.2,pclmulqdq")] -pub unsafe fn crc32_iscsi_sse_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary 
using hardware CRC32C instructions - while len > 0 && (buf as usize & 7) != 0 { - crc0 = _mm_crc32_u8(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 144 { - let blk = (len - 8) / 136; - let klen = blk * 24; - let buf2 = buf; - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // First vector chunk - load four 128-bit vectors (64 bytes total) - let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); - let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); - let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); - let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); - - // iSCSI-specific folding constant (same as AVX-512 version) - let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); - - // XOR the CRC into the first vector's low 32 bits - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); - crc0 = 0; - - let mut buf2 = buf2.add(64); - len -= 136; - buf = buf.add(blk * 64); - - // Main loop - process 144 bytes at a time - while len >= 144 { - let mut y0 = clmul_lo_sse(x0, k); - x0 = clmul_hi_sse(x0, k); - let mut y1 = clmul_lo_sse(x1, k); - x1 = clmul_hi_sse(x1, k); - let mut y2 = clmul_lo_sse(x2, k); - x2 = clmul_hi_sse(x2, k); - let mut y3 = clmul_lo_sse(x3, k); - x3 = clmul_hi_sse(x3, k); - - // XOR operations using separate XOR instructions (no ternary logic in SSE) - y0 = _mm_xor_si128(y0, _mm_loadu_si128(buf2 as *const __m128i)); - x0 = _mm_xor_si128(x0, y0); - y1 = _mm_xor_si128(y1, _mm_loadu_si128(buf2.add(16) as *const __m128i)); - x1 = _mm_xor_si128(x1, y1); - y2 = _mm_xor_si128(y2, _mm_loadu_si128(buf2.add(32) as *const __m128i)); - x2 = _mm_xor_si128(x2, y2); - y3 = _mm_xor_si128(y3, _mm_loadu_si128(buf2.add(48) as *const __m128i)); - x3 = _mm_xor_si128(x3, y3); - - // Process scalar data in parallel using hardware CRC32C - crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); - crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64)); - crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64)); - crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64)); - crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64)); - crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64)); - - buf = buf.add(24); - buf2 = buf2.add(64); - len -= 136; - } - - // Reduce x0 ... 
x3 to just x0 using iSCSI-specific constants
-        k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0);
-
-        let mut y0 = clmul_lo_sse(x0, k);
-        x0 = clmul_hi_sse(x0, k);
-        let mut y2 = clmul_lo_sse(x2, k);
-        x2 = clmul_hi_sse(x2, k);
-
-        y0 = _mm_xor_si128(y0, x1);
-        x0 = _mm_xor_si128(x0, y0);
-        y2 = _mm_xor_si128(y2, x3);
-        x2 = _mm_xor_si128(x2, y2);
-
-        k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0);
-
-        y0 = clmul_lo_sse(x0, k);
-        x0 = clmul_hi_sse(x0, k);
-        y0 = _mm_xor_si128(y0, x2);
-        x0 = _mm_xor_si128(x0, y0);
-
-        // Final scalar chunk
-        crc0 = mm_crc32_u64(crc0, *(buf as *const u64));
-        crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64));
-        crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64));
-        crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64));
-        crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64));
-        crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64));
-        crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64));
-        crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64));
-        crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64));
-        buf = buf.add(24);
-
-        let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8);
-        let vc1 = crc_shift_iscsi_sse(crc1, klen + 8);
-        let mut vc = mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0);
-
-        // Reduce 128 bits to 32 bits, and multiply by x^32
-        // Extract the two 64-bit parts of x0 and combine them
-        let x0_low = mm_extract_epi64(x0, 0);
-        let x0_high = mm_extract_epi64(x0, 1);
-        let x0_combined = mm_extract_epi64(
-            crc_shift_iscsi_sse(mm_crc32_u64(mm_crc32_u64(0, x0_low), x0_high), klen * 3 + 8),
-            0,
-        );
-        vc ^= x0_combined;
-
-        // Final 8 bytes
-        buf = buf.add(klen * 2);
-        crc0 = crc2;
-        crc0 = mm_crc32_u64(crc0, *(buf as *const u64) ^ vc);
-        buf = buf.add(8);
-        len -= 8;
-    }
-
-    // Process remaining 8-byte chunks using hardware CRC32C
-    while len >= 8 {
-        crc0 = mm_crc32_u64(crc0, *(buf as *const u64));
-        buf = buf.add(8);
-        len -= 8;
-    }
-
-    // Process remaining bytes using hardware CRC32C
-    while len > 0 {
-        crc0 = _mm_crc32_u8(crc0, *buf);
-        buf = buf.add(1);
-        len -= 1;
-    }
-
-    crc0
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::test::consts::TEST_CHECK_STRING;
-    use crc::{Crc, Table};
-    use rand::{rng, Rng};
-
-    const RUST_CRC32_ISCSI: Crc<u32, Table<16>> = Crc::<u32, Table<16>>::new(&crc::CRC_32_ISCSI);
-
-    #[test]
-    fn test_crc32_iscsi_check() {
-        assert_eq!(
-            crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff,
-            0xe3069283
-        );
-    }
-
-    #[test]
-    fn test_crc32_iscsi_small_all_lengths() {
-        for len in 1..=255 {
-            test_crc32_iscsi_random(len);
-        }
-    }
-
-    #[test]
-    fn test_crc32_iscsi_medium_lengths() {
-        // Test each length from 256 to 1024, which should fold and include handling remainders
-        for len in 256..=1024 {
-            test_crc32_iscsi_random(len);
-        }
-    }
-
-    #[test]
-    fn test_crc32_iscsi_large_lengths() {
-        // Test 1 MiB just before, at, and just after the folding boundaries
-        for len in 1048575..1048577 {
-            test_crc32_iscsi_random(len);
-        }
-    }
-
-    #[rustversion::since(1.89)]
-    fn test_crc32_iscsi_random(len: usize) {
-        let mut data = vec![0u8; len];
-        rng().fill(&mut data[..]);
-
-        let checksum = RUST_CRC32_ISCSI.checksum(&data);
-
-        assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum);
-
-        unsafe {
-            if is_x86_feature_detected!("vpclmulqdq")
-                && is_x86_feature_detected!("avx512vl")
-                && is_x86_feature_detected!("avx512f")
-            {
-                assert_eq!(
-                    crc32_iscsi_avx512_vpclmulqdq_v3x2(0xffffffff, data.as_ptr(), data.len())
-                        ^ 0xffffffff,
-                    checksum
-
); - } - - if is_x86_feature_detected!("avx512vl") - && is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("pclmulqdq") - { - assert_eq!( - crc32_iscsi_avx512_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - - assert_eq!( - crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } - - #[rustversion::before(1.89)] - fn test_crc32_iscsi_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISCSI.checksum(&data); - - assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } -} diff --git a/src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs b/src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs new file mode 100644 index 0000000..b3bc0ba --- /dev/null +++ b/src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs @@ -0,0 +1,170 @@ +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! MIT licensed. + +#![cfg(target_arch = "x86_64")] + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i avx512 -p crc32c -a v4s3x3 +/// +/// Modified as necessary for this Rust implementation. +/// +/// Uses AVX-512 instructions so only available after Rust 1.89 (when AVX-512 stabilized) +#[rustversion::since(1.89)] +#[inline] +#[target_feature(enable = "avx512vl,pclmulqdq")] +pub unsafe fn crc32_iscsi_avx512_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + use crate::fusion::x86::*; + + // Align to 8-byte boundary using hardware CRC32C instructions + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + if len >= 144 { + let blk = (len - 8) / 136; + let klen = blk * 24; + let buf2 = buf; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk - load four 128-bit vectors (64 bytes total) + let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); + let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); + let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); + let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); + + // iSCSI-specific folding constant (different from ISO-HDLC) + let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + + // XOR the CRC into the first vector's low 32 bits + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); + crc0 = 0; + + let mut buf2 = buf2.add(64); + len -= 136; + buf = buf.add(blk * 64); + + // Main loop - process 144 bytes at a time + while len >= 144 { + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let y1 = clmul_lo_sse(x1, k); + x1 = clmul_hi_sse(x1, k); + let y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + let y3 = clmul_lo_sse(x3, k); + x3 = clmul_hi_sse(x3, k); + + // XOR with next chunk of data using ternary logic (A XOR B XOR C) + x0 = _mm_ternarylogic_epi64(x0, y0, _mm_loadu_si128(buf2 as *const __m128i), 0x96); + x1 = _mm_ternarylogic_epi64( + x1, + y1, + _mm_loadu_si128(buf2.add(16) as *const __m128i), + 0x96, + ); + x2 = _mm_ternarylogic_epi64( + x2, + y2, + 
_mm_loadu_si128(buf2.add(32) as *const __m128i), + 0x96, + ); + x3 = _mm_ternarylogic_epi64( + x3, + y3, + _mm_loadu_si128(buf2.add(48) as *const __m128i), + 0x96, + ); + + // Process scalar data in parallel using hardware CRC32C + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; + + buf = buf.add(24); + buf2 = buf2.add(64); + len -= 136; + } + + // Reduce x0 ... x3 to just x0 using iSCSI-specific constants + k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0); + + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + + x0 = _mm_ternarylogic_epi64(x0, y0, x1, 0x96); + x2 = _mm_ternarylogic_epi64(x2, y2, x3, 0x96); + + k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0); + + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + x0 = _mm_ternarylogic_epi64(x0, y0, x2, 0x96); + + // Final scalar chunk + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; + buf = buf.add(24); + + let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi_sse(crc1, klen + 8); + let mut vc = _mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0) as u64; + + // Reduce 128 bits to 32 bits, and multiply by x^32 + let x0_low = _mm_extract_epi64(x0, 0) as u64; + let x0_high = _mm_extract_epi64(x0, 1) as u64; + let vec_crc = _mm_crc32_u64(_mm_crc32_u64(0, x0_low), x0_high); + vc ^= _mm_extract_epi64(crc_shift_iscsi_sse(vec_crc as u32, klen * 3 + 8), 0) as u64; + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64) ^ vc) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining 8-byte chunks using hardware CRC32C + while len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes using hardware CRC32C + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs b/src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs new file mode 100644 index 0000000..56a7cb0 --- /dev/null +++ b/src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs @@ -0,0 +1,211 @@ +//! 
Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! MIT licensed. + +#![cfg(target_arch = "x86_64")] + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 +/// +/// Modified as necessary for this Rust implementation. +/// +/// Uses AVX-512 VPCLMULQDQ instructions, so only available after Rust 1.89 (when AVX-512 +/// stabilized) +#[rustversion::since(1.89)] +#[inline] +#[target_feature(enable = "avx512vl,vpclmulqdq")] +pub unsafe fn crc32_iscsi_avx512_vpclmulqdq_v3x2( + mut crc0: u32, + mut buf: *const u8, + mut len: usize, +) -> u32 { + use crate::fusion::x86::*; + + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Align to 64-byte boundary (cache line) + while (buf as usize & 56) != 0 && len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + if len >= 384 { + // First vector chunk - load three 512-bit vectors (192 bytes total) + let mut x0 = _mm512_loadu_si512(buf as *const __m512i); + let mut x1 = _mm512_loadu_si512(buf.add(64) as *const __m512i); + let mut x2 = _mm512_loadu_si512(buf.add(128) as *const __m512i); + + // Create the multiplication constant vector + // Pattern: [0xa87ab8a8, 0, 0xab7aff2a, 0] repeated across all 128-bit lanes + let k_128 = _mm_setr_epi32(0xa87ab8a8u32 as i32, 0, 0xab7aff2au32 as i32, 0); + let mut k = _mm512_broadcast_i32x4(k_128); + + // XOR the CRC into the first vector's low 32 bits + let crc_vec = _mm512_castsi128_si512(_mm_cvtsi32_si128(crc0 as i32)); + x0 = _mm512_xor_si512(crc_vec, x0); + + // First round of polynomial multiplication + let mut y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + let mut y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + let mut y2 = clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + // XOR with next chunk of data using ternary logic (A XOR B XOR C) + // 0x96 = A XOR B XOR C in ternary logic notation + x0 = _mm512_ternarylogic_epi64( + x0, + y0, + _mm512_loadu_si512(buf.add(192) as *const __m512i), + 0x96, + ); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(256) as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(320) as *const __m512i), + 0x96, + ); + + buf = buf.add(384); + len -= 384; + + // Main loop - process 384 bytes at a time + while len >= 384 { + // First folding step + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + y2 = clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512(buf as *const __m512i), 0x96); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(64) as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(128) as *const __m512i), + 0x96, + ); + + // Second folding step + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + y2 = 
clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + x0 = _mm512_ternarylogic_epi64( + x0, + y0, + _mm512_loadu_si512(buf.add(192) as *const __m512i), + 0x96, + ); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(256) as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(320) as *const __m512i), + 0x96, + ); + + buf = buf.add(384); + len -= 384; + } + + // Reduce x0, x1, x2 to just x0 + let k_128 = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + k = _mm512_broadcast_i32x4(k_128); + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); + x1 = x2; + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); + + // Reduce 512 bits to 128 bits + // Multiple reduction constants for different parts of the 512-bit vector + k = _mm512_setr_epi32( + 0x1c291d04u32 as i32, + 0, + 0xddc0152bu32 as i32, + 0, // Lane 0 + 0x3da6d0cbu32 as i32, + 0, + 0xba4fc28eu32 as i32, + 0, // Lane 1 + 0xf20c0dfeu32 as i32, + 0, + 0x493c7d27u32 as i32, + 0, // Lane 2 + 0, + 0, + 0, + 0, // Lane 3 (unused) + ); + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + k = clmul_hi_avx512_vpclmulqdq(x0, k); + y0 = _mm512_xor_si512(y0, k); + + // Extract 128-bit lanes and combine them + let lane0 = _mm512_castsi512_si128(y0); + let lane1 = _mm512_extracti32x4_epi32(y0, 1); + let lane2 = _mm512_extracti32x4_epi32(y0, 2); + let lane3 = _mm512_extracti32x4_epi32(x0, 3); + + // Combine all lanes using ternary logic + let mut z0 = _mm_ternarylogic_epi64(lane0, lane1, lane2, 0x96); + z0 = _mm_xor_si128(z0, lane3); + + // Reduce 128 bits to 32 bits using CRC32 instructions + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0) as u64) as u32; + crc0 = _mm_crc32_u64(crc0.into(), _mm_extract_epi64(z0, 1) as u64) as u32; + } + + // Process remaining 8-byte chunks + while len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/x86/iscsi/mod.rs b/src/crc32/fusion/x86/iscsi/mod.rs new file mode 100644 index 0000000..eaffcde --- /dev/null +++ b/src/crc32/fusion/x86/iscsi/mod.rs @@ -0,0 +1,10 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides x86-specific implementations of CRC-32/ISCSI calculations using +//! fusion techniques. + +#![cfg(any(target_arch = "x86", target_arch = "x86_64"))] + +pub(crate) mod avx512_pclmulqdq; +pub(crate) mod avx512_vpclmulqdq; +pub(crate) mod sse_pclmulqdq; diff --git a/src/crc32/fusion/x86/iscsi/sse_pclmulqdq.rs b/src/crc32/fusion/x86/iscsi/sse_pclmulqdq.rs new file mode 100644 index 0000000..bda9ae8 --- /dev/null +++ b/src/crc32/fusion/x86/iscsi/sse_pclmulqdq.rs @@ -0,0 +1,169 @@ +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! MIT licensed. 
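+//!
+//! Naming note (inferred from the generated code below, not an official description):
+//! "v4s3x3" keeps 4 x 128-bit vector accumulators folded with PCLMULQDQ, plus 3
+//! independent scalar CRC32C streams that each consume 3 x 8-byte words per iteration;
+//! the scalar streams are merged at the end using carry-less CRC shifts.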
+ +#![cfg(any(target_arch = "x86_64", target_arch = "x86"))] + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; + +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +use crate::fusion::x86::*; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i sse -p crc32c -a v4s3x3 +/// +/// Modified as necessary for this Rust implementation. +#[inline] +#[target_feature(enable = "sse4.2,pclmulqdq")] +pub unsafe fn crc32_iscsi_sse_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary using hardware CRC32C instructions + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 144 { + let blk = (len - 8) / 136; + let klen = blk * 24; + let buf2 = buf; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk - load four 128-bit vectors (64 bytes total) + let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); + let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); + let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); + let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); + + // iSCSI-specific folding constant (same as AVX-512 version) + let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + + // XOR the CRC into the first vector's low 32 bits + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); + crc0 = 0; + + let mut buf2 = buf2.add(64); + len -= 136; + buf = buf.add(blk * 64); + + // Main loop - process 144 bytes at a time + while len >= 144 { + let mut y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let mut y1 = clmul_lo_sse(x1, k); + x1 = clmul_hi_sse(x1, k); + let mut y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + let mut y3 = clmul_lo_sse(x3, k); + x3 = clmul_hi_sse(x3, k); + + // XOR operations using separate XOR instructions (no ternary logic in SSE) + y0 = _mm_xor_si128(y0, _mm_loadu_si128(buf2 as *const __m128i)); + x0 = _mm_xor_si128(x0, y0); + y1 = _mm_xor_si128(y1, _mm_loadu_si128(buf2.add(16) as *const __m128i)); + x1 = _mm_xor_si128(x1, y1); + y2 = _mm_xor_si128(y2, _mm_loadu_si128(buf2.add(32) as *const __m128i)); + x2 = _mm_xor_si128(x2, y2); + y3 = _mm_xor_si128(y3, _mm_loadu_si128(buf2.add(48) as *const __m128i)); + x3 = _mm_xor_si128(x3, y3); + + // Process scalar data in parallel using hardware CRC32C + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64)); + + buf = buf.add(24); + buf2 = buf2.add(64); + len -= 136; + } + + // Reduce x0 ... 
x3 to just x0 using iSCSI-specific constants + k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0); + + let mut y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let mut y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + + y0 = _mm_xor_si128(y0, x1); + x0 = _mm_xor_si128(x0, y0); + y2 = _mm_xor_si128(y2, x3); + x2 = _mm_xor_si128(x2, y2); + + k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0); + + y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + y0 = _mm_xor_si128(y0, x2); + x0 = _mm_xor_si128(x0, y0); + + // Final scalar chunk + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64)); + buf = buf.add(24); + + let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi_sse(crc1, klen + 8); + let mut vc = mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + // Extract the two 64-bit parts of x0 and combine them + let x0_low = mm_extract_epi64(x0, 0); + let x0_high = mm_extract_epi64(x0, 1); + let x0_combined = mm_extract_epi64( + crc_shift_iscsi_sse(mm_crc32_u64(mm_crc32_u64(0, x0_low), x0_high), klen * 3 + 8), + 0, + ); + vc ^= x0_combined; + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = mm_crc32_u64(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + // Process remaining 8-byte chunks using hardware CRC32C + while len >= 8 { + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes using hardware CRC32C + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/x86/mod.rs b/src/crc32/fusion/x86/mod.rs new file mode 100644 index 0000000..a8043b3 --- /dev/null +++ b/src/crc32/fusion/x86/mod.rs @@ -0,0 +1,289 @@ +//! Provides CRC-32/ISCSI calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on x86 / x86_64. +//! +//! https://www.corsix.org/content/fast-crc32c-4k +//! https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(any(target_arch = "x86", target_arch = "x86_64"))] + +mod iscsi; + +use iscsi::sse_pclmulqdq::crc32_iscsi_sse_v4s3x3; + +#[cfg(target_arch = "x86_64")] +#[rustversion::since(1.89)] +use iscsi::avx512_pclmulqdq::crc32_iscsi_avx512_v4s3x3; +#[cfg(target_arch = "x86_64")] +#[rustversion::since(1.89)] +use iscsi::avx512_vpclmulqdq::crc32_iscsi_avx512_vpclmulqdq_v3x2; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +/// CRC32 iSCSI calculation for Rust versions before 1.89 (pre-AVX-512 support) +/// +/// +/// This function is called by the wrapper layer after feature detection has been performed. 
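+///
+/// A minimal usage sketch with the standard CRC check string (the same value the
+/// tests below exercise via TEST_CHECK_STRING):
+///
+/// ```ignore
+/// let crc = crc32_iscsi(0xffffffff, b"123456789") ^ 0xffffffff;
+/// assert_eq!(crc, 0xe3069283);
+/// ```
+///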
+/// For older Rust versions, only SSE implementation is available. +#[rustversion::before(1.89)] +#[inline(always)] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + // Only SSE implementation is available for Rust versions before 1.89 + // Runtime feature detection is handled by the wrapper layer + unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } +} + +/// CRC32 iSCSI calculation using the highest available instruction set after Rust 1.89 +/// (post-AVX-512 support) +/// +/// This function is called by the wrapper layer after feature detection has been performed. +/// The wrapper layer ensures that only the appropriate implementation is called based on +/// cached feature detection results, removing runtime checks from the hot path. +#[rustversion::since(1.89)] +#[inline(always)] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + #[cfg(target_arch = "x86_64")] + { + // AVX512 + VPCLMULQDQ + + if is_x86_feature_detected!("avx512vl") && is_x86_feature_detected!("vpclmulqdq") { + unsafe { + return crc32_iscsi_avx512_vpclmulqdq_v3x2(crc, data.as_ptr(), data.len()); + } + } + + // AVX512 + if is_x86_feature_detected!("avx512vl") { + unsafe { + return crc32_iscsi_avx512_v4s3x3(crc, data.as_ptr(), data.len()); + } + } + } + + // Fallback to SSE implementation + unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } +} + +#[rustversion::since(1.89)] +#[cfg(target_arch = "x86_64")] +#[inline] +#[target_feature(enable = "avx512vl,vpclmulqdq")] +unsafe fn clmul_lo_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { + _mm512_clmulepi64_epi128(a, b, 0) +} + +#[rustversion::since(1.89)] +#[cfg(target_arch = "x86_64")] +#[inline] +#[target_feature(enable = "avx512vl,vpclmulqdq")] +unsafe fn clmul_hi_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { + _mm512_clmulepi64_epi128(a, b, 17) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_lo_sse(a: __m128i, b: __m128i) -> __m128i { + _mm_clmulepi64_si128(a, b, 0) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_hi_sse(a: __m128i, b: __m128i) -> __m128i { + _mm_clmulepi64_si128(a, b, 17) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_scalar_sse(a: u32, b: u32) -> __m128i { + _mm_clmulepi64_si128(_mm_cvtsi32_si128(a as i32), _mm_cvtsi32_si128(b as i32), 0) +} + +// x^n mod P, in log(n) time +#[target_feature(enable = "sse4.2,pclmulqdq")] +unsafe fn xnmodp_iscsi_sse(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) - 16; + } + stack = !stack; + acc = 0x80000000u32 >> (n & 31); + n >>= 5; + + while n > 0 { + // Use hardware CRC32C instruction + acc = _mm_crc32_u32(acc, 0); + n -= 1; + } + + while { + low = (stack & 1) as u32; + stack >>= 1; + stack != 0 + } { + let x = _mm_cvtsi32_si128(acc as i32); + let clmul_result = _mm_clmulepi64_si128(x, x, 0); + let y = mm_extract_epi64(clmul_result, 0); + acc = mm_crc32_u64(0, y << low); + } + acc +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn crc_shift_iscsi_sse(crc: u32, nbytes: usize) -> __m128i { + clmul_scalar_sse(crc, xnmodp_iscsi_sse((nbytes * 8 - 33) as u64)) +} + +#[inline] +#[target_feature(enable = "sse4.1")] +unsafe fn mm_extract_epi64(val: __m128i, idx: i32) -> u64 { + #[cfg(target_arch = "x86_64")] + { + if idx == 0 { + _mm_cvtsi128_si64(val) as u64 + } else { + _mm_cvtsi128_si64(_mm_srli_si128(val, 8)) as u64 + } + } + #[cfg(target_arch = "x86")] + { + // On 32-bit 
+#[inline]
+#[target_feature(enable = "sse4.1")]
+unsafe fn mm_extract_epi64(val: __m128i, idx: i32) -> u64 {
+    #[cfg(target_arch = "x86_64")]
+    {
+        if idx == 0 {
+            _mm_cvtsi128_si64(val) as u64
+        } else {
+            _mm_cvtsi128_si64(_mm_srli_si128(val, 8)) as u64
+        }
+    }
+    #[cfg(target_arch = "x86")]
+    {
+        // On 32-bit x86, extract two 32-bit values and combine them
+        let shifted = if idx == 0 {
+            val
+        } else {
+            _mm_srli_si128(val, 8)
+        };
+        let low = _mm_cvtsi128_si32(shifted) as u32;
+        let high = _mm_cvtsi128_si32(_mm_srli_si128(shifted, 4)) as u32;
+        (low as u64) | ((high as u64) << 32)
+    }
+}
+
+#[inline]
+#[target_feature(enable = "sse4.2")]
+unsafe fn mm_crc32_u64(crc: u32, val: u64) -> u32 {
+    #[cfg(target_arch = "x86_64")]
+    {
+        _mm_crc32_u64(crc.into(), val) as u32
+    }
+    #[cfg(target_arch = "x86")]
+    {
+        // On 32-bit x86, process the 64-bit value as two 32-bit CRC operations
+        let low = val as u32;
+        let high = (val >> 32) as u32;
+        let crc = _mm_crc32_u32(crc, low);
+        _mm_crc32_u32(crc, high)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::test::consts::TEST_CHECK_STRING;
+    use crc::{Crc, Table};
+    use rand::{rng, Rng};
+
+    const RUST_CRC32_ISCSI: Crc<u32, Table<16>> =
+        Crc::<u32, Table<16>>::new(&crc::CRC_32_ISCSI);
+
+    #[test]
+    fn test_crc32_iscsi_check() {
+        assert_eq!(
+            crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff,
+            0xe3069283
+        );
+    }
+
+    #[test]
+    fn test_crc32_iscsi_small_all_lengths() {
+        for len in 1..=255 {
+            test_crc32_iscsi_random(len);
+        }
+    }
+
+    #[test]
+    fn test_crc32_iscsi_medium_lengths() {
+        // Test each length from 256 to 1024, which exercises folding plus remainder handling
+        for len in 256..=1024 {
+            test_crc32_iscsi_random(len);
+        }
+    }
+
+    #[test]
+    fn test_crc32_iscsi_large_lengths() {
+        // Test 1 MiB just before, at, and just after the folding boundaries
+        for len in 1048575..=1048577 {
+            test_crc32_iscsi_random(len);
+        }
+    }
+
+    #[rustversion::since(1.89)]
+    fn test_crc32_iscsi_random(len: usize) {
+        let mut data = vec![0u8; len];
+        rng().fill(&mut data[..]);
+
+        let checksum = RUST_CRC32_ISCSI.checksum(&data);
+
+        assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum);
+
+        unsafe {
+            #[cfg(target_arch = "x86_64")]
+            {
+                if is_x86_feature_detected!("vpclmulqdq")
+                    && is_x86_feature_detected!("avx512vl")
+                    && is_x86_feature_detected!("avx512f")
+                {
+                    assert_eq!(
+                        crc32_iscsi_avx512_vpclmulqdq_v3x2(0xffffffff, data.as_ptr(), data.len())
+                            ^ 0xffffffff,
+                        checksum
+                    );
+                }
+
+                if is_x86_feature_detected!("avx512vl")
+                    && is_x86_feature_detected!("avx512f")
+                    && is_x86_feature_detected!("pclmulqdq")
+                {
+                    assert_eq!(
+                        crc32_iscsi_avx512_v4s3x3(0xffffffff, data.as_ptr(), data.len())
+                            ^ 0xffffffff,
+                        checksum
+                    );
+                }
+            }
+
+            assert_eq!(
+                crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff,
+                checksum
+            );
+        }
+    }
+
+    #[rustversion::before(1.89)]
+    fn test_crc32_iscsi_random(len: usize) {
+        let mut data = vec![0u8; len];
+        rng().fill(&mut data[..]);
+
+        let checksum = RUST_CRC32_ISCSI.checksum(&data);
+
+        assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum);
+
+        unsafe {
+            assert_eq!(
+                crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff,
+                checksum
+            );
+        }
+    }
+}
diff --git a/src/crc32/mod.rs b/src/crc32/mod.rs
index 518f5f2..78b2c45 100644
--- a/src/crc32/mod.rs
+++ b/src/crc32/mod.rs
@@ -5,5 +5,5 @@
 pub mod algorithm;
 pub mod consts;
 
-#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
 pub(crate) mod fusion;
diff --git a/src/feature_detection.rs b/src/feature_detection.rs
new file mode 100644
index 0000000..b170a74
--- /dev/null
+++ b/src/feature_detection.rs
@@ -0,0 +1,1238 @@
+// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0.
+
+//! Feature detection system for safe and efficient hardware acceleration across different
+//! platforms.
+
+use std::sync::OnceLock;
+
+/// Global ArchOps instance cache - initialized once based on feature detection results
+static ARCH_OPS_INSTANCE: OnceLock<ArchOpsInstance> = OnceLock::new();
+
+/// Performance tiers representing different hardware capability levels
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[allow(dead_code)] // Some variants may not be constructed on all target architectures
+pub enum PerformanceTier {
+    // AArch64 tiers
+    AArch64AesSha3,
+    AArch64Aes,
+
+    // x86_64 tiers
+    X86_64Avx512Vpclmulqdq,
+    X86_64Avx512Pclmulqdq,
+    X86_64SsePclmulqdq,
+
+    // x86 tiers
+    X86SsePclmulqdq,
+
+    // Fallback
+    SoftwareTable,
+}
+
+/// Architecture-specific capabilities
+#[derive(Debug, Clone, Copy)]
+#[allow(dead_code)] // Some fields may not be read on all target architectures
+pub struct ArchCapabilities {
+    // AArch64 features
+    pub has_aes: bool,  // provides PMULL support for CRC calculations (NEON is implicit)
+    pub has_sha3: bool, // requires 'aes', provides EOR3 for XOR3 operations
+
+    // x86/x86_64 features
+    pub has_sse41: bool,
+    pub has_pclmulqdq: bool,
+    pub has_avx512vl: bool, // implicitly enables avx512f, has XOR3 operations
+    pub has_vpclmulqdq: bool,
+
+    // Rust version gates
+    pub rust_version_supports_avx512: bool,
+}
+
+/// Helper function to convert a performance tier to a human-readable target string
+/// Format: {architecture}-{intrinsics-family}-{intrinsics-features}
+#[inline(always)]
+fn tier_to_target_string(tier: PerformanceTier) -> String {
+    match tier {
+        PerformanceTier::AArch64AesSha3 => "aarch64-neon-pmull-sha3".to_string(),
+        PerformanceTier::AArch64Aes => "aarch64-neon-pmull".to_string(),
+        PerformanceTier::X86_64Avx512Vpclmulqdq => "x86_64-avx512-vpclmulqdq".to_string(),
+        PerformanceTier::X86_64Avx512Pclmulqdq => "x86_64-avx512-pclmulqdq".to_string(),
+        PerformanceTier::X86_64SsePclmulqdq => "x86_64-sse-pclmulqdq".to_string(),
+        PerformanceTier::X86SsePclmulqdq => "x86-sse-pclmulqdq".to_string(),
+        PerformanceTier::SoftwareTable => "software-fallback-tables".to_string(),
+    }
+}
+
+/// Detect architecture-specific capabilities combining compile-time and runtime checks
+///
+/// # Safety
+/// Uses runtime feature detection which may access CPU-specific registers
+unsafe fn detect_arch_capabilities() -> ArchCapabilities {
+    #[cfg(target_arch = "aarch64")]
+    {
+        detect_aarch64_features()
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    {
+        detect_x86_features()
+    }
+
+    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))]
+    {
+        // Other architectures use software fallback
+        ArchCapabilities {
+            has_aes: false,
+            has_sha3: false,
+            has_sse41: false,
+            has_pclmulqdq: false,
+            has_avx512vl: false,
+            has_vpclmulqdq: false,
+            rust_version_supports_avx512: false,
+        }
+    }
+}
+
+/// AArch64-specific feature detection
+///
+/// Note: NEON is always available on AArch64 and is implicitly enabled by AES support.
+/// AES support provides the PMULL instructions needed for CRC calculations.
+#[inline(always)]
+#[cfg(target_arch = "aarch64")]
+unsafe fn detect_aarch64_features() -> ArchCapabilities {
+    use std::arch::is_aarch64_feature_detected;
+
+    // AES is available on essentially all AArch64 CPUs and provides the PMULL instructions
+    let has_aes = is_aarch64_feature_detected!("aes");
+
+    // SHA3 is available on modern AArch64 CPUs, and provides the EOR3 instruction for efficient
+    // XOR3 operations.
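+    // (EOR3 computes a ^ b ^ c in a single instruction, which is what backs the
+    // "XOR3 operations" referenced above.)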
+ let has_sha3 = is_aarch64_feature_detected!("sha3"); + + ArchCapabilities { + has_aes, + has_sha3, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + } +} + +/// x86/x86_64-specific feature detection +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +unsafe fn detect_x86_features() -> ArchCapabilities { + use std::arch::is_x86_feature_detected; + + // Check Rust version support for VPCLMULQDQ (requires 1.89+) + let rust_version_supports_avx512 = check_rust_version_supports_avx512(); + + // SSE 4.1 and PCLMULQDQ support are the baseline for hardware acceleration + let has_sse41 = is_x86_feature_detected!("sse4.1"); + let has_pclmulqdq = has_sse41 && is_x86_feature_detected!("pclmulqdq"); + + // After Rust 1.89, AVX-512VL and VPCLMULQDQ can be used if available + let has_avx512vl = + has_pclmulqdq && rust_version_supports_avx512 && is_x86_feature_detected!("avx512vl"); + let has_vpclmulqdq = + has_avx512vl && rust_version_supports_avx512 && is_x86_feature_detected!("vpclmulqdq"); + + ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41, + has_pclmulqdq, + has_avx512vl, + has_vpclmulqdq, + rust_version_supports_avx512, + } +} + +/// Check if the current Rust version supports VPCLMULQDQ intrinsics +/// VPCLMULQDQ intrinsics were stabilized in Rust 1.89 +#[rustversion::since(1.89)] +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(crate) fn check_rust_version_supports_avx512() -> bool { + true +} + +/// Check if the current Rust version supports VPCLMULQDQ intrinsics +/// VPCLMULQDQ intrinsics were stabilized in Rust 1.89 +#[rustversion::before(1.89)] +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(crate) fn check_rust_version_supports_avx512() -> bool { + false +} + +/// Select the appropriate performance tier based on detected capabilities +#[inline(always)] +#[allow(unused)] +pub(crate) fn select_performance_tier(capabilities: &ArchCapabilities) -> PerformanceTier { + #[cfg(target_arch = "aarch64")] + { + if capabilities.has_sha3 && capabilities.has_aes { + return PerformanceTier::AArch64AesSha3; + } + + if capabilities.has_aes { + return PerformanceTier::AArch64Aes; + } + } + + #[cfg(target_arch = "x86_64")] + { + if capabilities.has_vpclmulqdq { + return PerformanceTier::X86_64Avx512Vpclmulqdq; + } + if capabilities.has_avx512vl { + return PerformanceTier::X86_64Avx512Pclmulqdq; + } + if capabilities.has_pclmulqdq { + return PerformanceTier::X86_64SsePclmulqdq; + } + } + + #[cfg(target_arch = "x86")] + { + if capabilities.has_pclmulqdq { + return PerformanceTier::X86SsePclmulqdq; + } + } + + // Fallback to software implementation + PerformanceTier::SoftwareTable +} + +/// Enum that holds the different ArchOps implementations for compile-time dispatch +/// This avoids the need for trait objects while still providing factory-based selection +#[rustversion::since(1.89)] +#[derive(Debug, Clone, Copy)] +pub enum ArchOpsInstance { + #[cfg(target_arch = "aarch64")] + Aarch64Aes(crate::arch::aarch64::aes::Aarch64AesOps), + #[cfg(target_arch = "aarch64")] + Aarch64AesSha3(crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops), + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + X86SsePclmulqdq(crate::arch::x86::sse::X86SsePclmulqdqOps), + #[cfg(target_arch = "x86_64")] + X86_64Avx512Pclmulqdq(crate::arch::x86_64::avx512::X86_64Avx512PclmulqdqOps), + #[cfg(target_arch = "x86_64")] + 
X86_64Avx512Vpclmulqdq(crate::arch::x86_64::avx512_vpclmulqdq::X86_64Avx512VpclmulqdqOps), + /// Software fallback - no ArchOps struct needed + SoftwareFallback, +} + +#[rustversion::before(1.89)] +#[derive(Debug, Clone, Copy)] +pub enum ArchOpsInstance { + #[cfg(target_arch = "aarch64")] + Aarch64Aes(crate::arch::aarch64::aes::Aarch64AesOps), + #[cfg(target_arch = "aarch64")] + Aarch64AesSha3(crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops), + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + X86SsePclmulqdq(crate::arch::x86::sse::X86SsePclmulqdqOps), + /// Software fallback - no ArchOps struct needed + SoftwareFallback, +} + +impl ArchOpsInstance { + #[inline(always)] + #[rustversion::since(1.89)] + pub fn get_tier(&self) -> PerformanceTier { + match self { + #[cfg(target_arch = "aarch64")] + ArchOpsInstance::Aarch64Aes(_) => PerformanceTier::AArch64Aes, + #[cfg(target_arch = "aarch64")] + ArchOpsInstance::Aarch64AesSha3(_) => PerformanceTier::AArch64AesSha3, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + ArchOpsInstance::X86SsePclmulqdq(_) => PerformanceTier::X86SsePclmulqdq, + #[cfg(target_arch = "x86_64")] + ArchOpsInstance::X86_64Avx512Pclmulqdq(_) => PerformanceTier::X86_64Avx512Pclmulqdq, + #[cfg(target_arch = "x86_64")] + ArchOpsInstance::X86_64Avx512Vpclmulqdq(_) => PerformanceTier::X86_64Avx512Vpclmulqdq, + ArchOpsInstance::SoftwareFallback => PerformanceTier::SoftwareTable, + } + } + + #[inline(always)] + #[rustversion::before(1.89)] + pub fn get_tier(&self) -> PerformanceTier { + match self { + #[cfg(target_arch = "aarch64")] + ArchOpsInstance::Aarch64Aes(_) => PerformanceTier::AArch64Aes, + #[cfg(target_arch = "aarch64")] + ArchOpsInstance::Aarch64AesSha3(_) => PerformanceTier::AArch64AesSha3, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + ArchOpsInstance::X86SsePclmulqdq(_) => PerformanceTier::X86SsePclmulqdq, + ArchOpsInstance::SoftwareFallback => PerformanceTier::SoftwareTable, + } + } + + /// Get a human-readable target string describing the active configuration + #[inline(always)] + pub fn get_target_string(&self) -> String { + tier_to_target_string(self.get_tier()) + } +} + +/// Get the global ArchOps instance (thread-safe, initialized once based on feature detection) +/// +/// This function provides access to the cached ArchOps instance that was selected based on +/// feature detection results at library initialization time, eliminating runtime feature +/// detection overhead from hot paths. +pub fn get_arch_ops() -> &'static ArchOpsInstance { + ARCH_OPS_INSTANCE.get_or_init(create_arch_ops) +} + +/// Factory function that creates the appropriate ArchOps struct based on cached feature detection +/// +/// This function uses the cached feature detection results to select the optimal +/// architecture-specific implementation at library initialization time, eliminating +/// runtime feature detection overhead from hot paths. 
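+// Call-site sketch (illustrative only, not new API): hot paths fetch the cached
+// instance once and dispatch on the enum, e.g.
+//
+//     match crate::feature_detection::get_arch_ops() {
+//         ArchOpsInstance::SoftwareFallback => { /* table-based fallback */ }
+//         _ => { /* SIMD path via the wrapped ops struct */ }
+//     }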
+fn create_arch_ops() -> ArchOpsInstance { + let capabilities = unsafe { detect_arch_capabilities() }; + let tier = select_performance_tier(&capabilities); + + create_arch_ops_from_tier(tier) +} + +/// Helper function to create ArchOpsInstance from a performance tier for Rust 1.89+ (when AVX512 +/// stabilized) +#[rustversion::since(1.89)] +fn create_arch_ops_from_tier(tier: PerformanceTier) -> ArchOpsInstance { + match tier { + #[cfg(target_arch = "aarch64")] + PerformanceTier::AArch64AesSha3 => { + use crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops; + ArchOpsInstance::Aarch64AesSha3(Aarch64AesSha3Ops::new()) + } + #[cfg(target_arch = "aarch64")] + PerformanceTier::AArch64Aes => { + use crate::arch::aarch64::aes::Aarch64AesOps; + ArchOpsInstance::Aarch64Aes(Aarch64AesOps) + } + #[cfg(target_arch = "x86_64")] + PerformanceTier::X86_64Avx512Vpclmulqdq => { + use crate::arch::x86_64::avx512_vpclmulqdq::X86_64Avx512VpclmulqdqOps; + ArchOpsInstance::X86_64Avx512Vpclmulqdq(X86_64Avx512VpclmulqdqOps::new()) + } + #[cfg(target_arch = "x86_64")] + PerformanceTier::X86_64Avx512Pclmulqdq => { + use crate::arch::x86_64::avx512::X86_64Avx512PclmulqdqOps; + ArchOpsInstance::X86_64Avx512Pclmulqdq(X86_64Avx512PclmulqdqOps::new()) + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + PerformanceTier::X86_64SsePclmulqdq | PerformanceTier::X86SsePclmulqdq => { + create_x86_sse_pclmulqdq_ops() + } + PerformanceTier::SoftwareTable => { + // Use software fallback + ArchOpsInstance::SoftwareFallback + } + // Handle cases where the performance tier doesn't match the current architecture + _ => { + // This can happen when a tier is selected for a different architecture + // Fall back to software implementation + ArchOpsInstance::SoftwareFallback + } + } +} + +/// Helper function to create ArchOpsInstance from a performance tier for Rust <1.89 (before AVX512 +/// stabilized) +#[rustversion::before(1.89)] +fn create_arch_ops_from_tier(tier: PerformanceTier) -> ArchOpsInstance { + match tier { + #[cfg(target_arch = "aarch64")] + PerformanceTier::AArch64AesSha3 => { + use crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops; + ArchOpsInstance::Aarch64AesSha3(Aarch64AesSha3Ops::new()) + } + #[cfg(target_arch = "aarch64")] + PerformanceTier::AArch64Aes => { + use crate::arch::aarch64::aes::Aarch64AesOps; + ArchOpsInstance::Aarch64Aes(Aarch64AesOps) + } + #[cfg(target_arch = "x86_64")] + PerformanceTier::X86_64Avx512Vpclmulqdq => { + // VPCLMULQDQ and AVX512 not available in older Rust versions, fall back to SSE + create_x86_sse_pclmulqdq_ops() + } + #[cfg(target_arch = "x86_64")] + PerformanceTier::X86_64Avx512Pclmulqdq => { + // AVX512 not available in older Rust versions, fall back to SSE + create_x86_sse_pclmulqdq_ops() + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + PerformanceTier::X86_64SsePclmulqdq | PerformanceTier::X86SsePclmulqdq => { + create_x86_sse_pclmulqdq_ops() + } + PerformanceTier::SoftwareTable => { + // Use software fallback + ArchOpsInstance::SoftwareFallback + } + // Handle cases where the performance tier doesn't match the current architecture + _ => { + // This can happen when a tier is selected for a different architecture + // Fall back to software implementation + ArchOpsInstance::SoftwareFallback + } + } +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn create_x86_sse_pclmulqdq_ops() -> ArchOpsInstance { + use crate::arch::x86::sse::X86SsePclmulqdqOps; + ArchOpsInstance::X86SsePclmulqdq(X86SsePclmulqdqOps) +} +/// Test-specific tier selection that 
works across all architectures for comprehensive testing +#[cfg(test)] +pub fn select_performance_tier_for_test(capabilities: &ArchCapabilities) -> PerformanceTier { + // AArch64 tier selection - SHA3 requires AES to be available + if capabilities.has_sha3 && capabilities.has_aes { + return PerformanceTier::AArch64AesSha3; + } + + if capabilities.has_aes { + return PerformanceTier::AArch64Aes; + } + + // x86_64 tier selection - VPCLMULQDQ requires AVX512VL + if capabilities.has_vpclmulqdq + && capabilities.has_avx512vl + && capabilities.rust_version_supports_avx512 + { + return PerformanceTier::X86_64Avx512Vpclmulqdq; + } + + // AVX512VL requires PCLMULQDQ and SSE4.1 + if capabilities.has_avx512vl + && capabilities.has_pclmulqdq + && capabilities.rust_version_supports_avx512 + { + return PerformanceTier::X86_64Avx512Pclmulqdq; + } + + // PCLMULQDQ requires SSE4.1 + if capabilities.has_pclmulqdq && capabilities.has_sse41 { + return PerformanceTier::X86_64SsePclmulqdq; + } + + // Fallback to software implementation + PerformanceTier::SoftwareTable +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn test_rust_version_check() { + let supports_vpclmulqdq = check_rust_version_supports_avx512(); + + // Should return a boolean without panicking + let _ = supports_vpclmulqdq; + } + + #[test] + fn test_aarch64_tier_selection() { + // Test that aarch64 tier selection follows the expected hierarchy + + // Test SHA3 + AES (highest tier) - NEON is implicit with AES + let capabilities_sha3 = ArchCapabilities { + has_aes: true, + has_sha3: true, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_sha3), + PerformanceTier::AArch64AesSha3 + ); + + // Test AES only (baseline tier) - NEON is implicit with AES + let capabilities_aes = ArchCapabilities { + has_aes: true, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_aes), + PerformanceTier::AArch64Aes + ); + + // Test missing AES (should fall back to software) + let capabilities_no_aes = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_no_aes), + PerformanceTier::SoftwareTable + ); + } + + #[test] + fn test_aarch64_feature_hierarchy() { + // Test that AArch64 feature hierarchy is properly maintained + // NEON is always available on AArch64 and implicit with AES + // AES provides PMULL instructions, SHA3 provides EOR3 + + // Create test capabilities with AES support (NEON is implicit) + let capabilities_with_aes = ArchCapabilities { + has_aes: true, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + + // AES support means we have PMULL instructions available for CRC calculations + assert!(capabilities_with_aes.has_aes); + + // SHA3 requires AES to be available first + let capabilities_with_sha3 = ArchCapabilities { + has_aes: true, + has_sha3: true, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: 
false, + }; + + assert!(capabilities_with_sha3.has_aes); + assert!(capabilities_with_sha3.has_sha3); + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_x86_64_tier_selection() { + // Test that x86_64 tier selection follows the expected hierarchy + + // Test VPCLMULQDQ + AVX512 (highest tier) on Rust 1.89+ + let capabilities_vpclmulqdq = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: true, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_vpclmulqdq), + PerformanceTier::X86_64Avx512Vpclmulqdq + ); + + // Test AVX512 + PCLMULQDQ (mid-tier) on Rust 1.89+ + let capabilities_avx512 = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: false, + rust_version_supports_avx512: true, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_avx512), + PerformanceTier::X86_64Avx512Pclmulqdq + ); + + // Test VPCLMULQDQ + AVX512 (highest tier) on Rust < 1.89 + let capabilities_vpclmulqdq = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_vpclmulqdq), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Test AVX512 + PCLMULQDQ (mid-tier) on Rust < 1.89 + let capabilities_avx512 = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_avx512), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Test SSE + PCLMULQDQ (baseline tier) + let capabilities_sse = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_sse), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Test missing PCLMULQDQ (should fall back to software) + let capabilities_no_pclmul = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_no_pclmul), + PerformanceTier::SoftwareTable + ); + } + + #[test] + #[cfg(target_arch = "x86")] + fn test_x86_tier_selection() { + // Test that x86 (32-bit) tier selection works correctly + + // Test SSE + PCLMULQDQ (only available tier for x86) + let capabilities_sse = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_sse), + PerformanceTier::X86_64SsePclmulqdq + ); + + // For x86 32-bit testing, we need a special case since AVX512VL indicates x86_64 + // Create capabilities without AVX512VL to simulate x86 32-bit + let capabilities_x86_sse = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, // No AVX512 on 32-bit x86 + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + // This should select x86_64 
tier since we're testing the general case + assert_eq!( + select_performance_tier_for_test(&capabilities_x86_sse), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Test missing PCLMULQDQ (should fall back to software) + let capabilities_no_pclmul = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_no_pclmul), + PerformanceTier::SoftwareTable + ); + } + + #[test] + fn test_x86_feature_hierarchy() { + // Test that x86 feature hierarchy is properly maintained + // SSE4.1 is required for PCLMULQDQ + // AVX512VL requires PCLMULQDQ + // VPCLMULQDQ requires AVX512VL and Rust 1.89+ + + // Test feature dependencies are enforced + let capabilities_full = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: true, + }; + + // All x86 features should be available when hierarchy is satisfied + assert!(capabilities_full.has_sse41); + assert!(capabilities_full.has_pclmulqdq); + assert!(capabilities_full.has_avx512vl); + assert!(capabilities_full.has_vpclmulqdq); + assert!(capabilities_full.rust_version_supports_avx512); + } + + #[test] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn test_rust_version_gating() { + // Test that VPCLMULQDQ is properly gated by Rust version + let rust_support = check_rust_version_supports_avx512(); + + // Should return a boolean based on Rust version + // This will be true for Rust 1.89+ and false for earlier versions + assert!(rust_support == true || rust_support == false); + } + + // Mock tests for compile-time and runtime feature agreement scenarios + mod mock_feature_agreement_tests { + use super::*; + + #[test] + fn test_rust_version_gating_scenarios() { + // Test VPCLMULQDQ with different Rust version scenarios + + // All features available but Rust version too old + let capabilities_old_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, // Hardware supports it + rust_version_supports_avx512: false, // But Rust version is too old + }; + + // Should not select VPCLMULQDQ or AVX512 tiers due to Rust version constraint + let tier = select_performance_tier_for_test(&capabilities_old_rust); + assert_ne!(tier, PerformanceTier::X86_64Avx512Vpclmulqdq); + assert_ne!(tier, PerformanceTier::X86_64Avx512Pclmulqdq); + assert_eq!(tier, PerformanceTier::X86_64SsePclmulqdq); + } + + #[test] + fn test_feature_dependency_validation() { + // Test that feature dependencies are properly validated + + // SHA3 without AES should not be possible + let invalid_sha3_caps = ArchCapabilities { + has_aes: false, // Missing required dependency + has_sha3: true, // This should be impossible in real detection + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + + // Should fall back to software since AES is required for SHA3 + assert_eq!( + select_performance_tier_for_test(&invalid_sha3_caps), + PerformanceTier::SoftwareTable + ); + + // VPCLMULQDQ without AVX512VL should not be possible + let invalid_vpclmul_caps = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, // Missing required dependency + 
has_vpclmulqdq: true, // This should be impossible in real detection + rust_version_supports_avx512: true, + }; + + // Should fall back to SSE tier since AVX512VL is required for VPCLMULQDQ + assert_eq!( + select_performance_tier_for_test(&invalid_vpclmul_caps), + PerformanceTier::X86_64SsePclmulqdq + ); + } + } + + // Comprehensive tier selection tests across different hardware configurations + mod tier_selection_comprehensive_tests { + use super::*; + + #[test] + fn test_all_aarch64_tier_combinations() { + // Test all possible AArch64 capability combinations + + // No features - software fallback + let no_features = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&no_features), + PerformanceTier::SoftwareTable + ); + + // AES only - baseline AArch64 tier + let aes_only = ArchCapabilities { + has_aes: true, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&aes_only), + PerformanceTier::AArch64Aes + ); + + // AES + SHA3 - highest AArch64 tier + let aes_sha3 = ArchCapabilities { + has_aes: true, + has_sha3: true, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&aes_sha3), + PerformanceTier::AArch64AesSha3 + ); + } + + #[test] + fn test_all_x86_64_tier_combinations() { + // Test all possible x86_64 capability combinations + + // No features - software fallback + let no_features = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&no_features), + PerformanceTier::SoftwareTable + ); + + // SSE4.1 only - software fallback (PCLMULQDQ required) + let sse_only = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&sse_only), + PerformanceTier::SoftwareTable + ); + + // SSE4.1 + PCLMULQDQ - baseline x86_64 tier + let sse_pclmul = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&sse_pclmul), + PerformanceTier::X86_64SsePclmulqdq + ); + + // SSE4.1 + PCLMULQDQ + AVX512VL but old Rust - should fall back to SSE tier + let avx512_pclmul_old_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, // Old Rust version + }; + assert_eq!( + select_performance_tier_for_test(&avx512_pclmul_old_rust), + PerformanceTier::X86_64SsePclmulqdq + ); + + // SSE4.1 + PCLMULQDQ + AVX512VL with new Rust - mid-tier + let avx512_pclmul_new_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: false, + rust_version_supports_avx512: true, // New Rust version + }; + 
assert_eq!( + select_performance_tier_for_test(&avx512_pclmul_new_rust), + PerformanceTier::X86_64Avx512Pclmulqdq + ); + + // All features + old Rust - should fall back to SSE tier + let all_features_old_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: false, // Old Rust version + }; + assert_eq!( + select_performance_tier_for_test(&all_features_old_rust), + PerformanceTier::X86_64SsePclmulqdq + ); + + // All features + new Rust - highest tier + let all_features_new_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: true, // New Rust version + }; + assert_eq!( + select_performance_tier_for_test(&all_features_new_rust), + PerformanceTier::X86_64Avx512Vpclmulqdq + ); + } + + #[test] + fn test_x86_32bit_tier_combinations() { + // Test x86 (32-bit) capability combinations + + // No features - software fallback + let no_features = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&no_features), + PerformanceTier::SoftwareTable + ); + + // SSE4.1 + PCLMULQDQ - for x86 32-bit testing, we expect x86_64 tier in our test function + // since the test function doesn't distinguish between x86 and x86_64 architectures + let sse_pclmul = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, // AVX512 not available on 32-bit x86 + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + // The test function will return x86_64 tier since it doesn't distinguish architectures + assert_eq!( + select_performance_tier_for_test(&sse_pclmul), + PerformanceTier::X86_64SsePclmulqdq + ); + } + + #[test] + fn test_target_string_consistency() { + // Test that target strings are consistent with selected tiers + + let test_cases = [ + (PerformanceTier::AArch64AesSha3, "aarch64-neon-pmull-sha3"), + (PerformanceTier::AArch64Aes, "aarch64-neon-pmull"), + ( + PerformanceTier::X86_64Avx512Vpclmulqdq, + "x86_64-avx512-vpclmulqdq", + ), + ( + PerformanceTier::X86_64Avx512Pclmulqdq, + "x86_64-avx512-pclmulqdq", + ), + (PerformanceTier::X86_64SsePclmulqdq, "x86_64-sse-pclmulqdq"), + (PerformanceTier::X86SsePclmulqdq, "x86-sse-pclmulqdq"), + (PerformanceTier::SoftwareTable, "software-fallback-tables"), + ]; + + for (tier, expected_string) in test_cases { + assert_eq!(tier_to_target_string(tier), expected_string); + } + } + } + + // Tests for graceful degradation between performance tiers + mod graceful_degradation_tests { + use super::*; + + #[test] + fn test_aarch64_degradation_path() { + // Test the degradation path for AArch64: SHA3+AES -> AES -> Software + + // Start with highest tier capabilities + let mut capabilities = ArchCapabilities { + has_aes: true, + has_sha3: true, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + + // Should select highest tier + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::AArch64AesSha3 + ); + + // Remove SHA3 - should degrade to AES tier + capabilities.has_sha3 = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + 
PerformanceTier::AArch64Aes + ); + + // Remove AES - should degrade to software + capabilities.has_aes = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::SoftwareTable + ); + } + + #[test] + fn test_x86_64_degradation_path() { + // Test the degradation path for x86_64: VPCLMULQDQ -> AVX512 -> SSE -> Software + + // Start with highest tier capabilities + let mut capabilities = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: true, + }; + + // Should select highest tier + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::X86_64Avx512Vpclmulqdq + ); + + // Remove VPCLMULQDQ - should degrade to AVX512 tier + capabilities.has_vpclmulqdq = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::X86_64Avx512Pclmulqdq + ); + + // Remove AVX512VL - should degrade to SSE tier + capabilities.has_avx512vl = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Remove PCLMULQDQ - should degrade to software + capabilities.has_pclmulqdq = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::SoftwareTable + ); + + // Remove SSE4.1 - should still be software (already at lowest tier) + capabilities.has_sse41 = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::SoftwareTable + ); + } + + #[test] + fn test_rust_version_degradation() { + // Test degradation when Rust version doesn't support VPCLMULQDQ + + let capabilities_with_vpclmulqdq = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: false, // Old Rust version + }; + + // Should degrade from VPCLMULQDQ tier to SSE tier due to Rust version + assert_eq!( + select_performance_tier_for_test(&capabilities_with_vpclmulqdq), + PerformanceTier::X86_64SsePclmulqdq + ); + } + + #[test] + fn test_partial_feature_availability() { + // Test scenarios where only some features in a tier are available + + // AArch64: SHA3 available but AES not (impossible in real hardware, but test safety) + let aarch64_partial = ArchCapabilities { + has_aes: false, + has_sha3: true, // This would be impossible in real detection + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + // Should fall back to software since AES is required for SHA3 + assert_eq!( + select_performance_tier_for_test(&aarch64_partial), + PerformanceTier::SoftwareTable + ); + + // x86_64: VPCLMULQDQ available but AVX512VL not (impossible in real hardware) + let x86_64_partial = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, + has_vpclmulqdq: true, // This would be impossible in real detection + rust_version_supports_avx512: true, + }; + // Should fall back to SSE tier since AVX512VL is required for VPCLMULQDQ + assert_eq!( + select_performance_tier_for_test(&x86_64_partial), + PerformanceTier::X86_64SsePclmulqdq + ); + } + } +} + +#[cfg(test)] +mod software_fallback_tests { + use super::*; + #[test] + fn test_aarch64_without_aes_falls_back_to_software() { + // Test that AArch64 without AES support falls back to software implementation + let capabilities_no_aes 
= ArchCapabilities {
+            has_aes: false,  // No AES support
+            has_sha3: false, // SHA3 requires AES, so also false
+            has_sse41: false,
+            has_pclmulqdq: false,
+            has_avx512vl: false,
+            has_vpclmulqdq: false,
+            rust_version_supports_avx512: false,
+        };
+
+        let tier = select_performance_tier_for_test(&capabilities_no_aes);
+        assert_eq!(
+            tier,
+            PerformanceTier::SoftwareTable,
+            "AArch64 without AES should fall back to software implementation"
+        );
+    }
+
+    #[test]
+    fn test_x86_without_pclmulqdq_falls_back_to_software() {
+        // Test that x86 without SSE4.1/PCLMULQDQ falls back to software implementation
+        let capabilities_no_pclmul = ArchCapabilities {
+            has_aes: false,
+            has_sha3: false,
+            has_sse41: true,      // SSE4.1 available
+            has_pclmulqdq: false, // But PCLMULQDQ not available
+            has_avx512vl: false,
+            has_vpclmulqdq: false,
+            rust_version_supports_avx512: false,
+        };
+
+        let tier = select_performance_tier_for_test(&capabilities_no_pclmul);
+        assert_eq!(
+            tier,
+            PerformanceTier::SoftwareTable,
+            "x86 without PCLMULQDQ should fall back to software implementation"
+        );
+
+        // Test x86 without SSE4.1
+        let capabilities_no_sse = ArchCapabilities {
+            has_aes: false,
+            has_sha3: false,
+            has_sse41: false,     // No SSE4.1 support
+            has_pclmulqdq: false, // PCLMULQDQ requires SSE4.1
+            has_avx512vl: false,
+            has_vpclmulqdq: false,
+            rust_version_supports_avx512: false,
+        };
+
+        let tier = select_performance_tier_for_test(&capabilities_no_sse);
+        assert_eq!(
+            tier,
+            PerformanceTier::SoftwareTable,
+            "x86 without SSE4.1 should fall back to software implementation"
+        );
+    }
+
+    #[test]
+    fn test_conditional_compilation_coverage() {
+        // Test that software fallback is properly conditionally compiled
+        // This test ensures the conditional compilation logic is working correctly
+
+        // Software fallback should be available when needed
+        #[cfg(any(
+            not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")),
+            target_arch = "x86",
+            all(target_arch = "aarch64", not(target_feature = "aes"))
+        ))]
+        {
+            // Software fallback should be compiled in these cases
+            use crate::arch::software;
+            let _test_fn = software::update;
+            // This test passes if it compiles successfully
+        }
+
+        // For x86_64, software fallback should rarely be needed since SSE4.1/PCLMULQDQ are
+        // available on essentially all x86_64 CPUs
+        // But it may still be compiled for testing purposes
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 21c4935..393edb0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -135,7 +135,7 @@ use crate::crc32::consts::{
     CRC32_ISCSI, CRC32_ISO_HDLC, CRC32_JAMCRC, CRC32_MEF, CRC32_MPEG_2, CRC32_XFER,
 };
 
-#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
 use crate::crc32::fusion;
 
 use crate::crc64::consts::{
@@ -155,6 +155,7 @@ mod consts;
 mod crc32;
 mod crc64;
 mod enums;
+mod feature_detection;
 mod ffi;
 mod generate;
 mod structs;
@@ -782,6 +783,19 @@ pub fn checksum_combine_with_params(
 /// Returns the target used to calculate the CRC checksum for the specified algorithm.
 ///
+/// This function provides visibility into the active performance tier being used for CRC calculations.
+/// The target string follows the format `{architecture}-{intrinsics-family}-{intrinsics-features}`,
+/// such as `aarch64-neon-pmull-sha3` or `x86_64-avx512-vpclmulqdq`.
+///
+/// The performance tier system provides graceful degradation across different hardware capabilities:
+/// - **AArch64**: `aarch64-neon-pmull-sha3` (highest) → `aarch64-neon-pmull` (baseline)
+/// - **x86_64**: `x86_64-avx512-vpclmulqdq` (highest) → `x86_64-avx512-pclmulqdq` (mid) → `x86_64-sse-pclmulqdq` (baseline)
+/// - **x86**: `x86-sse-pclmulqdq` (baseline) → `software-fallback-tables` (fallback)
+/// - **Other architectures**: `software-fallback-tables`
+///
+/// The tier selection is deterministic and consistent across runs on the same hardware,
+/// combining compile-time and runtime feature detection for safety and optimal performance.
+///
 /// These strings are informational only, not stable, and shouldn't be relied on to match across
 /// versions.
 ///
@@ -790,9 +804,17 @@ pub fn checksum_combine_with_params(
 /// use crc_fast::{get_calculator_target, CrcAlgorithm::Crc32IsoHdlc};
 ///
 /// let target = get_calculator_target(Crc32IsoHdlc);
+/// println!("Using performance tier: {}", target);
+/// // Example outputs:
+/// // "aarch64-neon-pmull-sha3" - AArch64 with SHA3 and AES support
+/// // "x86_64-avx512-vpclmulqdq" - x86_64 with VPCLMULQDQ support
+/// // "x86_64-sse-pclmulqdq" - x86_64 baseline with SSE4.1 and PCLMULQDQ
 /// ```
 pub fn get_calculator_target(_algorithm: CrcAlgorithm) -> String {
-    arch::get_target()
+    use crate::feature_detection::get_arch_ops;
+
+    let arch_ops = get_arch_ops();
+    arch_ops.get_target_string()
 }
 
 /// Returns the calculator function and parameters for the specified CRC algorithm.
@@ -834,11 +856,15 @@ fn get_calculator_params(algorithm: CrcAlgorithm) -> (CalculatorFn, CrcParams) {
 #[inline(always)]
 fn crc32_iscsi_calculator(state: u64, data: &[u8], _params: CrcParams) -> u64 {
     // both aarch64 and x86 have native CRC-32/ISCSI support, so we can use fusion
-    #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))]
+    #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))]
     return fusion::crc32_iscsi(state as u32, data) as u64;
 
-    #[cfg(all(not(target_arch = "aarch64"), not(target_arch = "x86_64")))]
-    // fallback to traditional calculation if not aarch64 or x86_64
+    #[cfg(all(
+        not(target_arch = "aarch64"),
+        not(target_arch = "x86_64"),
+        not(target_arch = "x86")
+    ))]
+    // Fallback to traditional calculation for other architectures
     Calculator::calculate(state, data, _params)
 }
 
@@ -945,6 +971,64 @@ mod lib {
         );
     }
 
+    #[test]
+    fn test_get_calculator_target_format() {
+        let target = get_calculator_target(CrcAlgorithm::Crc32IsoHdlc);
+
+        // Target string should not be empty
+        assert!(!target.is_empty());
+
+        // Should follow the expected format with valid architecture prefixes
+        let valid_prefixes = ["aarch64-", "x86_64-", "x86-", "software-"];
+        assert!(
+            valid_prefixes
+                .iter()
+                .any(|prefix| target.starts_with(prefix)),
+            "Target '{}' should start with a valid architecture prefix",
+            target
+        );
+
+        // Should contain intrinsics family and features information
+        let parts: Vec<&str> = target.split('-').collect();
+        assert!(
+            parts.len() >= 3,
+            "Target '{}' should have at least 3 parts: architecture-family-features",
+            target
+        );
+    }
+
+    #[test]
+    fn test_get_calculator_target_consistency() {
+        // Multiple calls should return the same result (deterministic)
+        let target1 = get_calculator_target(CrcAlgorithm::Crc32IsoHdlc);
+        let target2 = get_calculator_target(CrcAlgorithm::Crc32Iscsi);
+        let target3 = get_calculator_target(CrcAlgorithm::Crc64Nvme);
+
+        assert_eq!(
+            target1, target2,
+            "Target should be consistent across
different CRC-32 algorithms" + ); + assert_eq!( + target1, target3, + "Target should be consistent across CRC-32 and CRC-64 algorithms" + ); + } + + #[test] + fn test_get_calculator_target_uses_cached_detection() { + // This test verifies that the function uses cached feature detection + // by checking that multiple calls are consistent and don't perform + // redundant feature detection + + let target1 = get_calculator_target(CrcAlgorithm::Crc32IsoHdlc); + let target2 = get_calculator_target(CrcAlgorithm::Crc32IsoHdlc); + + assert_eq!( + target1, target2, + "Cached detection should return identical results" + ); + } + #[test] fn test_digest_updates_check() { for config in TEST_ALL_CONFIGS { diff --git a/src/test/future_proof.rs b/src/test/future_proof_tests.rs similarity index 100% rename from src/test/future_proof.rs rename to src/test/future_proof_tests.rs diff --git a/src/test/mod.rs b/src/test/mod.rs index 2b7b216..8b84fcc 100644 --- a/src/test/mod.rs +++ b/src/test/mod.rs @@ -7,7 +7,7 @@ pub(crate) mod consts; pub(crate) mod enums; -mod future_proof; +mod future_proof_tests; mod structs; /// Creates a new aligned data vector from the input slice for testing. diff --git a/tests/checksum_integration_tests.rs b/tests/checksum_integration_tests.rs new file mode 100644 index 0000000..2eab8dd --- /dev/null +++ b/tests/checksum_integration_tests.rs @@ -0,0 +1,237 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +use std::fs; +use std::process::Command; + +#[test] +fn test_benchmark_flag_parsing() { + let output = Command::new("cargo") + .args(&["run", "--bin", "checksum", "--", "-a", "CRC-32/ISCSI", "-b"]) + .output() + .expect("Failed to execute command"); + + assert!( + output.status.success(), + "Command should succeed with -b flag" + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Algorithm: CRC-32/ISCSI")); + assert!(stdout.contains("Throughput:")); + assert!(stdout.contains("GiB/s")); +} + +#[test] +fn test_benchmark_with_size_parameter() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "--size", + "1024", + ]) + .output() + .expect("Failed to execute command"); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Data Size: 1,024 bytes")); +} + +#[test] +fn test_benchmark_with_duration_parameter() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "--duration", + "1.0", + ]) + .output() + .expect("Failed to execute command"); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Duration: 1.")); +} + +#[test] +fn test_benchmark_invalid_size() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "--size", + "0", + ]) + .output() + .expect("Failed to execute command"); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("Size must be greater than 0")); +} + +#[test] +fn test_benchmark_invalid_duration() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "--duration", + "0", + ]) + .output() + .expect("Failed to execute command"); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); 
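+    // (The message below must stay in sync with the validation error the checksum
+    // binary emits for non-positive durations.)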
+ assert!(stderr.contains("Duration must be greater than 0")); +} + +#[test] +fn test_benchmark_with_file_input() { + // Create a temporary test file + let test_file = "test_benchmark_file.txt"; + fs::write(test_file, "Hello, benchmark world!").expect("Failed to create test file"); + + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "-f", + test_file, + "--duration", + "0.5", + ]) + .output() + .expect("Failed to execute command"); + + // Clean up + let _ = fs::remove_file(test_file); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Data Size: 23 bytes")); +} + +#[test] +fn test_benchmark_with_string_input() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "-s", + "test string", + "--duration", + "0.5", + ]) + .output() + .expect("Failed to execute command"); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Data Size: 11 bytes")); +} + +#[test] +fn test_benchmark_different_algorithms() { + let algorithms = ["CRC-32/ISCSI", "CRC-64/NVME"]; + + for algorithm in &algorithms { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + algorithm, + "-b", + "--duration", + "0.5", + ]) + .output() + .expect("Failed to execute command"); + + assert!( + output.status.success(), + "Algorithm {} should work", + algorithm + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains(&format!("Algorithm: {}", algorithm))); + } +} + +#[test] +fn test_benchmark_size_without_benchmark_flag() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "--size", + "1024", + ]) + .output() + .expect("Failed to execute command"); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("--size and --duration can only be used with -b flag")); +} + +#[test] +fn test_benchmark_nonexistent_file() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "-f", + "nonexistent_file.txt", + ]) + .output() + .expect("Failed to execute command"); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("File not found")); +}
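+
+// A possible follow-up (sketch only, not part of this change): the tests above all
+// repeat the same `cargo run --bin checksum` plumbing, so a helper like the
+// hypothetical `run_checksum` below could centralize it.
+#[allow(dead_code)]
+fn run_checksum(extra_args: &[&str]) -> std::process::Output {
+    // Base invocation shared by every integration test in this file
+    let mut args = vec!["run", "--bin", "checksum", "--"];
+    args.extend_from_slice(extra_args);
+    Command::new("cargo")
+        .args(&args)
+        .output()
+        .expect("Failed to execute command")
+}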