From dd3c796480b5604dfbcdb35ffea3d8bc8d51ba30 Mon Sep 17 00:00:00 2001
From: Don MacAskill
Date: Thu, 30 Oct 2025 12:17:16 -0700
Subject: [PATCH 1/2] Improve runtime feature detection

Uses more runtime feature detection, rather than compile-time feature detection, for improved reliability, maintainability, graceful degradation across CPU families, and performance.

Should help minimize bugs such as https://github.com/awesomized/crc-fast-rust/issues/14 in the future.
---
.github/workflows/tests.yml | 45 +-
.gitignore | 3 +-
.../specs/checksum-benchmark-option/design.md | 202 +++
.../checksum-benchmark-option/requirements.md | 52 +
.../specs/checksum-benchmark-option/tasks.md | 49 +
Cargo.lock | 302 ++--
Cargo.toml | 5 +-
README.md | 92 +-
benches/benchmark.rs | 7 +-
src/algorithm.rs | 7 +-
src/arch/{aarch64.rs => aarch64/aes.rs} | 48 +-
src/arch/aarch64/aes_sha3.rs | 201 +++
src/arch/aarch64/mod.rs | 8 +
src/arch/mod.rs | 240 ++--
src/arch/software.rs | 52 +-
src/arch/x86/mod.rs | 7 +
src/arch/{x86.rs => x86/sse.rs} | 56 +-
src/arch/x86_64/avx512.rs | 220 +++
src/arch/x86_64/avx512_vpclmulqdq.rs | 630 +++++++++
src/arch/x86_64/mod.rs | 8 +
src/bin/arch-check.rs | 2 +
src/bin/checksum.rs | 565 +++++++-
src/bin/get-custom-params.rs | 2 +
src/crc32/fusion/aarch64.rs | 1073 --------------
src/crc32/fusion/aarch64/iscsi/crc_pmull.rs | 179 +++
.../fusion/aarch64/iscsi/crc_pmull_sha3.rs | 266 ++++
src/crc32/fusion/aarch64/iscsi/mod.rs | 9 +
.../fusion/aarch64/iso_hdlc/crc_pmull.rs | 184 +++
.../fusion/aarch64/iso_hdlc/crc_pmull_sha3.rs | 267 ++++
src/crc32/fusion/aarch64/iso_hdlc/mod.rs | 9 +
src/crc32/fusion/aarch64/mod.rs | 275 ++++
src/crc32/fusion/mod.rs | 23 +-
src/crc32/fusion/x86.rs | 740 ----------
.../fusion/x86/iscsi/avx512_pclmulqdq.rs | 168 +++
.../fusion/x86/iscsi/avx512_vpclmulqdq.rs | 208 +++
src/crc32/fusion/x86/iscsi/mod.rs | 10 +
src/crc32/fusion/x86/iscsi/sse_pclmulqdq.rs | 169 +++
src/crc32/fusion/x86/mod.rs | 289 ++++
src/crc32/mod.rs | 2 +-
src/feature_detection.rs | 1238 +++++++++++++++++
src/lib.rs | 94 +-
...{future_proof.rs => future_proof_tests.rs} | 0
src/test/mod.rs | 2 +-
tests/checksum_integration_tests.rs | 237 ++++
44 files changed, 5990 insertions(+), 2255 deletions(-)
create mode 100644 .kiro/specs/checksum-benchmark-option/design.md
create mode 100644 .kiro/specs/checksum-benchmark-option/requirements.md
create mode 100644 .kiro/specs/checksum-benchmark-option/tasks.md
rename src/arch/{aarch64.rs => aarch64/aes.rs} (86%)
create mode 100644 src/arch/aarch64/aes_sha3.rs
create mode 100644 src/arch/aarch64/mod.rs
create mode 100644 src/arch/x86/mod.rs
rename src/arch/{x86.rs => x86/sse.rs} (85%)
create mode 100644 src/arch/x86_64/avx512.rs
create mode 100644 src/arch/x86_64/avx512_vpclmulqdq.rs
create mode 100644 src/arch/x86_64/mod.rs
delete mode 100644 src/crc32/fusion/aarch64.rs
create mode 100644 src/crc32/fusion/aarch64/iscsi/crc_pmull.rs
create mode 100644 src/crc32/fusion/aarch64/iscsi/crc_pmull_sha3.rs
create mode 100644 src/crc32/fusion/aarch64/iscsi/mod.rs
create mode 100644 src/crc32/fusion/aarch64/iso_hdlc/crc_pmull.rs
create mode 100644 src/crc32/fusion/aarch64/iso_hdlc/crc_pmull_sha3.rs
create mode 100644 src/crc32/fusion/aarch64/iso_hdlc/mod.rs
create mode 100644 src/crc32/fusion/aarch64/mod.rs
delete mode 100644 src/crc32/fusion/x86.rs
create mode 100644 src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs
create mode 100644 src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs
create mode 100644 src/crc32/fusion/x86/iscsi/mod.rs
create mode 100644
src/crc32/fusion/x86/iscsi/sse_pclmulqdq.rs create mode 100644 src/crc32/fusion/x86/mod.rs create mode 100644 src/feature_detection.rs rename src/test/{future_proof.rs => future_proof_tests.rs} (100%) create mode 100644 tests/checksum_integration_tests.rs diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 180d113..3b93e76 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -6,14 +6,45 @@ on: workflow_dispatch: jobs: - test-accelerated: - name: Test accelerated (aarch64, x86_64) + test-aarch64: + name: Test aarch64 strategy: matrix: - os: [ubuntu-latest, ubuntu-22.04-arm, ubuntu-24.04-arm, macos-latest] + os: [ubuntu-22.04-arm, ubuntu-24.04-arm, macos-14, macos-15, macos-26, macos-latest, windows-11-arm] rust-toolchain: - "1.81" # minimum for this crate - - "1.89" # when VPCLMULQDQ was stabilized + - "1.89" # when AVX-512 VPCLMULQDQ was stabilized + - "stable" + - "nightly" + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 # not pinning to commit hash since this is a GitHub action, which we trust + - uses: actions-rust-lang/setup-rust-toolchain@9d7e65c320fdb52dcd45ffaa68deb6c02c8754d9 # v1.12.0 + with: + toolchain: ${{ matrix.rust-toolchain }} + components: rustfmt, clippy + cache-key: ${{ matrix.os }}-${{ matrix.rust-toolchain }} + - name: Check + run: cargo check + - name: Architecture check + run: cargo run --bin arch-check + - if: ${{ matrix.rust-toolchain != 'nightly' }} + name: Format + run: cargo fmt -- --check + - if: ${{ matrix.rust-toolchain != 'nightly' }} + name: Clippy + run: cargo clippy + - name: Test + run: cargo test + + test-x86_64: + name: Test x86_64 + strategy: + matrix: + os: [ ubuntu-latest, ubuntu-22.04, ubuntu-24.04, macos-13, macos-15-intel, windows-2022, windows-2025, windows-latest ] + rust-toolchain: + - "1.81" # minimum for this crate + - "1.89" # when AVX-512 VPCLMULQDQ was stabilized - "stable" - "nightly" runs-on: ${{ matrix.os }} @@ -38,14 +69,14 @@ jobs: run: cargo test test-x86: - name: Test accelerated (x86) + name: Test x86 runs-on: ubuntu-latest strategy: matrix: target: [i586-unknown-linux-gnu, i686-unknown-linux-gnu] rust-toolchain: - "1.81" # minimum for this crate - - "1.89" # when VPCLMULQDQ was stabilized + - "1.89" # when AVX-512 VPCLMULQDQ was stabilized - "stable" - "nightly" steps: @@ -71,7 +102,7 @@ jobs: target: [powerpc-unknown-linux-gnu, powerpc64-unknown-linux-gnu] rust-toolchain: - "1.81" # minimum for this crate - - "1.89" # when VPCLMULQDQ was stabilized + - "1.89" # when AVX-512 VPCLMULQDQ was stabilized - "stable" - "nightly" steps: diff --git a/.gitignore b/.gitignore index 24335e6..d97ced5 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ /test/test_*.bin .idea .DS_Store -.git \ No newline at end of file +.git +.vscode \ No newline at end of file diff --git a/.kiro/specs/checksum-benchmark-option/design.md b/.kiro/specs/checksum-benchmark-option/design.md new file mode 100644 index 0000000..26cf154 --- /dev/null +++ b/.kiro/specs/checksum-benchmark-option/design.md @@ -0,0 +1,202 @@ +# Design Document + +## Overview + +This design extends the existing `bin/checksum.rs` tool with benchmark functionality through a new `-b` flag. The benchmark mode will measure CRC performance using either user-provided data (files/strings) or randomly generated data, reporting throughput in GiB/s along with the acceleration target used. 
+
+The design maintains backward compatibility while adding a clean benchmark interface that leverages existing patterns from the `benches/benchmark.rs` implementation.
+
+## Architecture
+
+### Command Line Interface
+
+The tool will extend the existing argument parsing to support:
+- `-b`: Enable benchmark mode
+- `--size <bytes>`: Specify data size for random generation (when no file/string provided)
+- `--duration <seconds>`: Benchmark duration as floating-point seconds (default: 10.0)
+- Existing `-a <algorithm>`: CRC algorithm (required in benchmark mode)
+- Existing `-f <file>` or `-s <string>`: Optional data source for benchmarking
+
+### Data Flow
+
+```
+User Input → Argument Parsing → Mode Detection → Benchmark Execution → Results Display
+                                      ↓
+                           [Normal Checksum Mode]
+                                      ↓
+                           [Existing Functionality]
+```
+
+In benchmark mode:
+1. Parse and validate benchmark parameters
+2. Determine data source (file, string, or generated)
+3. For string/generated data: load/generate the test data once; for file data: use the file path directly
+4. Run the benchmark loop for the specified duration using the appropriate checksum function
+5. Calculate and display results
+
+## Components and Interfaces
+
+### Enhanced Config Structure
+
+```rust
+#[derive(Debug)]
+struct Config {
+    algorithm: String,
+    file: Option<String>,
+    string: Option<String>,
+    format: OutputFormat,
+    benchmark: Option<BenchmarkConfig>,
+}
+
+#[derive(Debug)]
+struct BenchmarkConfig {
+    size: Option<u64>,
+    duration: f64,
+}
+```
+
+### Benchmark Execution Module
+
+```rust
+enum BenchmarkData {
+    InMemory(Vec<u8>),
+    File(String),
+}
+
+struct BenchmarkRunner {
+    algorithm: CrcAlgorithm,
+    data: BenchmarkData,
+    duration: f64,
+}
+
+impl BenchmarkRunner {
+    fn new(algorithm: CrcAlgorithm, data: BenchmarkData, duration: f64) -> Self
+    fn run(&self) -> BenchmarkResult
+}
+
+struct BenchmarkResult {
+    iterations: u64,
+    elapsed_seconds: f64,
+    throughput_gibs: f64,
+    time_per_iteration_nanos: f64,
+    acceleration_target: String,
+    data_size: u64,
+}
+```
+
+### Data Generation
+
+The benchmark will reuse the random data generation pattern from `benches/benchmark.rs`:
+
+```rust
+fn generate_random_data(size: usize) -> Vec<u8> {
+    let mut rng = rand::rng();
+    let mut buf = vec![0u8; size];
+    rng.fill_bytes(&mut buf);
+    buf
+}
+```
+
+## Data Models
+
+### Input Data Sources
+
+1. **File Input**: Use the `checksum_file()` function to benchmark the entire file I/O and checksum stack
+2. **String Input**: Use the string bytes directly with the in-memory `checksum()` function
+3. **Generated Data**: Create random data of the specified size using `rand::RngCore::fill_bytes()` and use the in-memory `checksum()` function
+
+### Benchmark Metrics
+
+- **Iterations**: Number of checksum calculations performed
+- **Elapsed Time**: Actual benchmark duration in seconds
+- **Throughput**: Calculated as `(data_size * iterations) / elapsed_time / (1024^3)` GiB/s (a worked sketch follows the Error Handling section below)
+- **Acceleration Target**: Result from `crc_fast::get_calculator_target(algorithm)`
+
+## Error Handling
+
+### Validation Errors
+
+- Invalid algorithm names (reuse existing validation)
+- Invalid size parameters (non-positive values)
+- Invalid duration parameters (non-positive values)
+- File read errors (reuse existing error handling)
+
+### Runtime Errors
+
+- Memory allocation failures for large data sizes
+- Timer precision issues (fallback to alternative timing methods)
+
+### Error Messages
+
+All errors will follow the existing pattern of displaying the error message followed by usage information.
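+
+As a concrete reference for the throughput formula above, here is a minimal sketch of the calculation (illustrative only; the `throughput_gibs` helper is hypothetical and not part of the tool):
+
+```rust
+/// Bytes processed per second, expressed in GiB/s.
+/// `data_size` is the number of bytes hashed per iteration.
+fn throughput_gibs(data_size: u64, iterations: u64, elapsed_seconds: f64) -> f64 {
+    (data_size as f64 * iterations as f64) / elapsed_seconds / (1u64 << 30) as f64
+}
+
+fn main() {
+    // Matches the sample output shown later: 1 MiB x 467,660 iterations in 10.00 s.
+    let gibs = throughput_gibs(1_048_576, 467_660, 10.0);
+    assert!((gibs - 45.67).abs() < 0.01);
+    println!("{gibs:.2} GiB/s");
+}
+```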
+
+## Testing Strategy
+
+### Unit Tests
+
+- Argument parsing validation for benchmark flags
+- BenchmarkConfig creation and validation
+- Data generation with various sizes
+- Throughput calculation accuracy
+
+### Integration Tests
+
+- End-to-end benchmark execution with different algorithms
+- File and string input handling in benchmark mode
+- Error handling for invalid parameters
+- Backward compatibility verification
+
+### Performance Validation
+
+- Verify benchmark results are reasonable (within expected ranges)
+- Compare with existing `benches/benchmark.rs` results for consistency
+- Test with various data sizes to ensure linear scaling
+
+## Implementation Notes
+
+### Timing Mechanism
+
+Use `std::time::Instant` for high-precision timing, with different approaches for different data sources:
+
+```rust
+let start = std::time::Instant::now();
+let mut iterations = 0u64;
+
+while start.elapsed().as_secs_f64() < duration {
+    match &self.data {
+        BenchmarkData::InMemory(data) => {
+            std::hint::black_box(checksum(algorithm, data));
+        }
+        BenchmarkData::File(filename) => {
+            std::hint::black_box(checksum_file(algorithm, filename, None).unwrap());
+        }
+    }
+    iterations += 1;
+}
+
+let elapsed = start.elapsed().as_secs_f64();
+```
+
+### Memory Considerations
+
+- Pre-allocate test data once before benchmark loop
+- Use `std::hint::black_box()` to prevent compiler optimizations
+- Consider memory alignment for optimal performance (optional enhancement)
+
+### Output Format
+
+```
+Algorithm: CRC-32/ISCSI
+Acceleration Target: aarch64-neon-sha3
+Data Size: 1,048,576 bytes (1.0 MiB)
+Duration: 10.00 seconds
+Iterations: 467,660
+Throughput: 45.67 GiB/s
+Time per iteration: 21.4 μs
+```
+
+### Default Values
+
+- **Size**: 1,048,576 bytes (1 MiB)
+- **Duration**: 10.0 seconds
+- **Algorithm**: Must be specified via `-a` flag (no default)
\ No newline at end of file
diff --git a/.kiro/specs/checksum-benchmark-option/requirements.md b/.kiro/specs/checksum-benchmark-option/requirements.md
new file mode 100644
index 0000000..4da3195
--- /dev/null
+++ b/.kiro/specs/checksum-benchmark-option/requirements.md
@@ -0,0 +1,52 @@
+# Requirements Document
+
+## Introduction
+
+This feature adds a simple benchmark option to the existing `bin/checksum.rs` tool via a command-line flag. The benchmark will allow users to test performance across different platforms using a single binary, reporting throughput in GiB/s and the acceleration target used. This enables cross-platform performance comparison without requiring a full development environment checkout.
+
+## Glossary
+
+- **Checksum_Tool**: The existing `bin/checksum.rs` binary application
+- **Benchmark_Mode**: A new operational mode that measures and reports performance metrics
+- **Acceleration_Target**: The hardware-specific optimization path returned by `get_calculator_target()`
+- **Throughput_Metric**: Performance measurement expressed in GiB/s (gibibytes per second)
+- **Test_Data**: Randomly generated byte array used for benchmark measurements
+- **Black_Box**: Rust's `std::hint::black_box()` function that prevents compiler optimizations during benchmarking
+
+## Requirements
+
+### Requirement 1
+
+**User Story:** As a developer, I want to run performance benchmarks from the checksum tool, so that I can compare CRC performance across different hardware platforms without setting up a full development environment.
+
+#### Acceptance Criteria
+
+1. WHEN the user provides a `-b` flag, THE Checksum_Tool SHALL enter Benchmark_Mode
+2.
WHILE in Benchmark_Mode, THE Checksum_Tool SHALL generate Test_Data of the specified size once and reuse it for all iterations +3. THE Checksum_Tool SHALL report Throughput_Metric in GiB/s format +4. THE Checksum_Tool SHALL display the Acceleration_Target used for the benchmark +5. THE Checksum_Tool SHALL use Black_Box to prevent compiler optimizations during measurement + +### Requirement 2 + +**User Story:** As a developer, I want to specify benchmark parameters, so that I can control the test conditions for consistent cross-platform comparisons. + +#### Acceptance Criteria + +1. WHEN the user provides `-a` parameter with `-b` flag, THE Checksum_Tool SHALL use the specified CRC algorithm for benchmarking +2. WHEN the user provides `--size` parameter, THE Checksum_Tool SHALL generate Test_Data of the specified byte size +3. WHEN the user provides `--duration` parameter, THE Checksum_Tool SHALL run the benchmark for the specified number of seconds +4. WHERE no benchmark parameters are provided, THE Checksum_Tool SHALL use default values of 1 MiB for size and 10 seconds for duration +5. THE Checksum_Tool SHALL validate all benchmark parameter values before starting the benchmark + +### Requirement 3 + +**User Story:** As a developer, I want the benchmark to support both file/string input and generated data, so that I can benchmark with specific data or use random data for consistent testing. + +#### Acceptance Criteria + +1. WHEN the user provides `-b` with `-f` or `-s` flags, THE Checksum_Tool SHALL use the file or string content as Test_Data for benchmarking +2. WHEN the user provides `-b` with `--size` parameter but no `-f` or `-s` flags, THE Checksum_Tool SHALL generate random Test_Data of the specified size +3. IF the user provides `-b` without any data source or size specification, THEN THE Checksum_Tool SHALL generate random Test_Data using the default size +4. THE Checksum_Tool SHALL display appropriate usage information when benchmark parameters are invalid +5. THE Checksum_Tool SHALL maintain backward compatibility with existing checksum functionality \ No newline at end of file diff --git a/.kiro/specs/checksum-benchmark-option/tasks.md b/.kiro/specs/checksum-benchmark-option/tasks.md new file mode 100644 index 0000000..492dd08 --- /dev/null +++ b/.kiro/specs/checksum-benchmark-option/tasks.md @@ -0,0 +1,49 @@ +# Implementation Plan + +- [x] 1. Extend command line argument parsing for benchmark options + - Add `-b` flag to enable benchmark mode in the argument parser + - Add `--size` parameter for specifying random data size + - Add `--duration` parameter for benchmark duration (floating-point seconds) + - Update the `Config` struct to include optional `BenchmarkConfig` + - Update usage/help text to include new benchmark options + - _Requirements: 1.1, 2.1, 2.2, 2.3, 2.4_ + +- [x] 2. Implement benchmark data structures and validation + - Create `BenchmarkConfig` struct with size and duration fields + - Create `BenchmarkData` enum to handle in-memory vs file data sources + - Create `BenchmarkRunner` struct with algorithm, data, and duration + - Create `BenchmarkResult` struct with all metrics including time per iteration + - Add validation logic for benchmark parameters (positive values) + - _Requirements: 2.5, 3.4_ + +- [x] 3. 
Implement benchmark execution logic + - Create benchmark runner with timing loop using `std::time::Instant` + - Implement separate execution paths for in-memory data vs file data + - Use `std::hint::black_box()` to prevent compiler optimizations + - Calculate throughput in GiB/s and time per iteration with appropriate units + - Integrate `get_calculator_target()` for acceleration target reporting + - _Requirements: 1.2, 1.3, 1.4, 1.5_ + +- [x] 4. Implement data source handling + - Add random data generation function using `rand::RngCore::fill_bytes()` + - Implement logic to determine data source (file, string, or generated) + - Handle file size detection for throughput calculations + - Create `BenchmarkData` instances based on user input + - _Requirements: 3.1, 3.2, 3.3_ + +- [x] 5. Integrate benchmark mode into main application flow + - Modify main function to detect benchmark mode and route accordingly + - Ensure mutual exclusivity validation between benchmark and normal modes + - Add benchmark result formatting and display + - Update error handling to include benchmark-specific errors + - Maintain backward compatibility with existing functionality + - _Requirements: 3.4, 3.5_ + +- [x] 6. Add comprehensive testing for benchmark functionality + - Write unit tests for argument parsing with benchmark flags + - Test benchmark parameter validation (invalid sizes, durations) + - Test data source selection logic (file vs string vs generated) + - Test benchmark execution with different algorithms + - Verify throughput calculation accuracy + - Test error handling for invalid benchmark configurations + - _Requirements: All requirements_ \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 36a5d89..33e3bee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19,9 +19,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.6.20" +version = "0.6.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", "anstyle-parse", @@ -34,9 +34,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.11" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" [[package]] name = "anstyle-parse" @@ -53,7 +53,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" dependencies = [ - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -64,7 +64,7 @@ checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -75,9 +75,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "bitflags" -version = "2.9.3" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34efbcccd345379ca2868b2b2c9d3782e9cc58ba87bc7d79d5b53d9c9ae6f25d" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" [[package]] name = "block-buffer" @@ -102,9 +102,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" 
[[package]] name = "cbindgen" -version = "0.29.0" +version = "0.29.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "975982cdb7ad6a142be15bdf84aea7ec6a9e5d4d797c004d43185b24cfe4e684" +checksum = "befbfd072a8e81c02f8c507aefce431fe5e7d051f83d48a23ffc9b9fe5a11799" dependencies = [ "clap", "heck", @@ -121,9 +121,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "ciborium" @@ -154,18 +154,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.46" +version = "4.5.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c5e4fcf9c21d2e544ca1ee9d8552de13019a42aa7dbf32747fa7aaf1df76e57" +checksum = "0c2cfd7bf8a6017ddaa4e32ffe7403d547790db06bd171c1c53926faab501623" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.46" +version = "4.5.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fecb53a0e6fcfb055f686001bc2e2592fa527efaf38dbe81a6a9563562e57d41" +checksum = "0a4c05b9e80c5ccd3a7ef080ad7b6ba7d6fc00a985b8b157197075677c82c7a0" dependencies = [ "anstream", "anstyle", @@ -175,9 +175,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" +checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "colorchoice" @@ -208,6 +208,7 @@ dependencies = [ "crc", "criterion", "digest", + "indexmap", "libc", "rand", "regex", @@ -312,12 +313,12 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -328,9 +329,9 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "generic-array" -version = "0.14.7" +version = "0.14.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2" dependencies = [ "typenum", "version_check", @@ -338,31 +339,32 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", "r-efi", - "wasi", + "wasip2", ] [[package]] name = "half" -version = "2.6.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", + "zerocopy", ] [[package]] name = "hashbrown" -version = "0.15.5" 
+version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" [[package]] name = "heck" @@ -372,9 +374,9 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "indexmap" -version = "2.11.0" +version = "2.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2481980430f9f78649238835720ddccc57e52df14ffce1c6f37391d61b563e9" +checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" dependencies = [ "equivalent", "hashbrown", @@ -382,9 +384,9 @@ dependencies = [ [[package]] name = "is_terminal_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" @@ -403,9 +405,9 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "js-sys" -version = "0.3.77" +version = "0.3.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" dependencies = [ "once_cell", "wasm-bindgen", @@ -413,27 +415,27 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.175" +version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" [[package]] name = "linux-raw-sys" -version = "0.9.4" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "log" -version = "0.4.27" +version = "0.4.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" [[package]] name = "memchr" -version = "2.7.5" +version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" [[package]] name = "num-traits" @@ -452,9 +454,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "once_cell_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "oorandom" @@ -501,18 +503,18 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.101" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" dependencies = [ "unicode-ident", ] 
[[package]] name = "quote" -version = "1.0.40" +version = "1.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" dependencies = [ "proc-macro2", ] @@ -574,9 +576,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.2" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -586,9 +588,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.10" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -597,21 +599,21 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.6" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "rustix" -version = "1.0.8" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ "bitflags", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -637,18 +639,28 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.219" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -657,23 +669,24 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.143" +version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a" +checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" dependencies = [ "itoa", "memchr", "ryu", "serde", + "serde_core", ] [[package]] name = "serde_spanned" -version = "0.6.9" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +checksum = "e24345aa0fe688594e73770a5f6d1b216508b4f93484c0026d521acd30134392" dependencies = [ - "serde", + "serde_core", ] [[package]] @@ -684,9 +697,9 
@@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.106" +version = "2.0.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" dependencies = [ "proc-macro2", "quote", @@ -695,15 +708,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.21.0" +version = "3.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ "fastrand", "getrandom", "once_cell", "rustix", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -718,56 +731,54 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.23" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +checksum = "f0dc8b1fb61449e27716ec0e1bdf0f6b8f3e8f6b05391e8497b8b6d7804ea6d8" dependencies = [ - "serde", + "indexmap", + "serde_core", "serde_spanned", "toml_datetime", - "toml_edit", + "toml_parser", + "toml_writer", + "winnow", ] [[package]] name = "toml_datetime" -version = "0.6.11" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +checksum = "f2cdb639ebbc97961c51720f858597f7f24c4fc295327923af55b74c3c724533" dependencies = [ - "serde", + "serde_core", ] [[package]] -name = "toml_edit" -version = "0.22.27" +name = "toml_parser" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +checksum = "c0cbe268d35bdb4bb5a56a2de88d0ad0eb70af5384a99d648cd4b3d04039800e" dependencies = [ - "indexmap", - "serde", - "serde_spanned", - "toml_datetime", - "toml_write", "winnow", ] [[package]] -name = "toml_write" -version = "0.1.2" +name = "toml_writer" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" +checksum = "df8b2b54733674ad286d16267dcfc7a71ed5c776e4ac7aa3c3e2561f7c637bf2" [[package]] name = "typenum" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" [[package]] name = "unicode-ident" -version = "1.0.18" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06" [[package]] name = "utf8parse" @@ -792,31 +803,32 @@ dependencies = [ ] [[package]] -name = "wasi" -version = "0.14.3+wasi-0.2.4" +name = "wasip2" +version = "1.0.1+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a51ae83037bdd272a9e28ce236db8c07016dd0d50c27038b3f407533c030c95" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" dependencies = [ "wit-bindgen", ] [[package]] name = "wasm-bindgen" -version = "0.2.100" +version = "0.2.104" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" dependencies = [ "cfg-if", "once_cell", "rustversion", "wasm-bindgen-macro", + "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.100" +version = "0.2.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" dependencies = [ "bumpalo", "log", @@ -828,9 +840,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.100" +version = "0.2.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -838,9 +850,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.100" +version = "0.2.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" dependencies = [ "proc-macro2", "quote", @@ -851,18 +863,18 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.100" +version = "0.2.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" dependencies = [ "unicode-ident", ] [[package]] name = "web-sys" -version = "0.3.77" +version = "0.3.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" dependencies = [ "js-sys", "wasm-bindgen", @@ -870,18 +882,18 @@ dependencies = [ [[package]] name = "winapi-util" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0978bf7171b3d90bac376700cb56d606feb40f251a475a5d6634613564460b22" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys", + "windows-sys 0.61.2", ] [[package]] name = "windows-link" -version = "0.1.3" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" [[package]] name = "windows-sys" @@ -892,11 +904,20 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-targets" -version = "0.53.3" +version = "0.53.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ "windows-link", "windows_aarch64_gnullvm", @@ 
-911,81 +932,78 @@ dependencies = [ [[package]] name = "windows_aarch64_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" [[package]] name = "windows_aarch64_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" [[package]] name = "windows_i686_gnu" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" [[package]] name = "windows_i686_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" [[package]] name = "windows_i686_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" [[package]] name = "windows_x86_64_gnu" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" [[package]] name = "windows_x86_64_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" [[package]] name = "windows_x86_64_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" -dependencies = [ - "memchr", -] [[package]] name = "wit-bindgen" -version = "0.45.0" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "052283831dbae3d879dc7f51f3d92703a316ca49f91540417d38591826127814" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" [[package]] name = "zerocopy" -version = "0.8.26" +version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" +checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.26" +version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" +checksum = 
"88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 699b5a4..1e43229 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,9 +23,12 @@ crc = "3" digest = { version = "0.10", features = ["alloc"] } rand = "0.9" libc = "0.2" -regex = "1.11" +regex = "1.12" rustversion = "1.0" +# constrain indexmap (transitive) to a version compatible with Rust 1.81.0 +indexmap = { version = ">=2.11.0, <2.12.0", optional = true } + [dev-dependencies] criterion = "0.7" cbindgen = "0.29" diff --git a/README.md b/README.md index 368c157..75e9499 100644 --- a/README.md +++ b/README.md @@ -339,73 +339,62 @@ This is a summary of the performance for the most important and popular CRC chec AKA `crc32c` in many, but not all, implementations. -| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | -|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~49 | ~111 | -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~18 | ~52 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~23 | ~54 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~20 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~19 | ~39 | -| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~17 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~49 | ~99 | -| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~56 | ~94 | +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~52 | ~111 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~24 | ~54 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~21 | ~53 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~11 | ~17 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~49 | ~99 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~56 | ~94 | ### CRC-32/ISO-HDLC (reflected) AKA `crc32` in many, but not all, implementations. 
-| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
-|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:|
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-248xl | avx512-vpclmulqdq | ~24 | ~110 |
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-248xl | sse-pclmulqdq | ~21 | ~28 |
-| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~24 | ~55 |
-| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~12 | ~14 |
-| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~19 | ~39 |
-| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~17 |
-| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~48 | ~98 |
-| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~56 | ~94 |
+| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
+|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:|
+| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~20 | ~88 |
+| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~15 | ~55 |
+| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~21 | ~53 |
+| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~11 | ~17 |
+| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~48 | ~98 |
+| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~56 | ~94 |

### CRC-64/NVME (reflected)

[AWS S3's recommended checksum option](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html)

-| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
-|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:|
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~25 | ~110 |
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~21 | ~28 |
-| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~25 | ~55 |
-| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~14 |
-| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~20 | ~37 |
-| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~16 |
-| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~50 | ~72 |
-| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~52 | ~72 |
+| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
+|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:|
+| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~21 | ~110 |
+| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~16 | ~55 |
+| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~21 | ~41 |
+| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~11 | ~16 |
+| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~50 | ~72 |
+| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~52 | ~72 |

### CRC-32/BZIP2 (forward)

-| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
-|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:|
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl |
avx512-vpclmulqdq | ~23 | ~56 | -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~19 | ~28 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~21 | ~43 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~13 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~16 | ~32 | -| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~9 | ~14 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~41 | ~59 | -| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~47 | ~64 | +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~20 | ~56 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~14 | ~43 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~18 | ~40 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~9 | ~14 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~41 | ~59 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~47 | ~64 | ### CRC-64/ECMA-182 (forward) -| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | -|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~24 | ~56 | -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~19 | ~28 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~21 | ~43 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~13 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~18 | ~31 | -| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~9 | ~14 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~40 | ~59 | -| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~46 | ~61 | - +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~21 | ~56 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~14 | ~43 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~19 | ~40 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~9 | ~14 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~40 | ~59 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~46 | ~61 | ## Other CRC widths @@ -422,6 +411,7 @@ PRs welcome! ## References +* [Catalogue of parametrised CRC algorithms](https://reveng.sourceforge.io/crc-catalogue/all.htm) * [crc32-fast](https://crates.io/crates/crc32fast) Original `CRC-32/ISO-HDLC` (`crc32`) implementation in `Rust`. * [crc64-fast](https://github.com/tikv/crc64fast) Original `CRC-64/XZ` implementation in `Rust`. * [crc64fast-nvme](https://github.com/awesomized/crc64fast-nvme) Original `CRC-64/NVME` implementation in `Rust`. 
diff --git a/benches/benchmark.rs b/benches/benchmark.rs index ecec9d8..82ee13a 100644 --- a/benches/benchmark.rs +++ b/benches/benchmark.rs @@ -31,11 +31,12 @@ pub const SIZES: &[(&str, i32); 2] = &[ ]; // these are the most important algorithms in popular use, with forward/reflected coverage -pub const CRC32_ALGORITHMS: &[CrcAlgorithm; 3] = &[ +pub const CRC32_ALGORITHMS: &[CrcAlgorithm; 4] = &[ // benchmark both CRC-32/ISCSI and CRC-32/ISO-HDLC since they're special flowers with lots of // different acceleration targets. - CrcAlgorithm::Crc32Iscsi, // reflected - CrcAlgorithm::Crc32IsoHdlc, // reflected + CrcAlgorithm::Crc32Autosar, // reflected + CrcAlgorithm::Crc32Iscsi, // reflected, fusion + CrcAlgorithm::Crc32IsoHdlc, // reflected, fusion CrcAlgorithm::Crc32Bzip2, // forward ]; diff --git a/src/algorithm.rs b/src/algorithm.rs index fe1459b..e3a83b9 100644 --- a/src/algorithm.rs +++ b/src/algorithm.rs @@ -53,12 +53,7 @@ fn extract_keys_array(params: CrcParams) -> [u64; 23] { } /// Main entry point that works for both CRC-32 and CRC-64 -#[inline] -#[cfg_attr( - any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse3,sse4.1,pclmulqdq") -)] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] +#[inline(always)] pub unsafe fn update( state: W::Value, bytes: &[u8], diff --git a/src/arch/aarch64.rs b/src/arch/aarch64/aes.rs similarity index 86% rename from src/arch/aarch64.rs rename to src/arch/aarch64/aes.rs index 884ed82..8b9abb7 100644 --- a/src/arch/aarch64.rs +++ b/src/arch/aarch64/aes.rs @@ -1,16 +1,22 @@ // Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. -//! This module provides AArch64-specific implementations of the ArchOps trait. +//! This module provides AArch64-specific implementations of the ArchOps trait for architectures +//! with AES support. 
#![cfg(target_arch = "aarch64")] use crate::traits::ArchOps; use std::arch::aarch64::*; +// Tier-specific ArchOps implementations for AArch64 + +/// Base AArch64 implementation with AES+NEON optimizations +/// NEON is implicit with AES support on AArch64 #[derive(Debug, Copy, Clone)] -pub struct AArch64Ops; +pub struct Aarch64AesOps; -impl ArchOps for AArch64Ops { +// Base implementation for AArch64 AES tier +impl ArchOps for Aarch64AesOps { type Vector = uint8x16_t; #[inline] @@ -56,7 +62,6 @@ impl ArchOps for AArch64Ops { #[target_feature(enable = "neon")] unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { let x = vreinterpretq_u64_u8(vector); - [vgetq_lane_u64(x, 0), vgetq_lane_u64(x, 1)] } @@ -64,7 +69,6 @@ impl ArchOps for AArch64Ops { #[target_feature(enable = "neon")] unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { let x = vreinterpretq_p64_u8(vector); - [vgetq_lane_p64(x, 0), vgetq_lane_p64(x, 1)] } @@ -89,7 +93,6 @@ impl ArchOps for AArch64Ops { #[inline] #[target_feature(enable = "neon")] unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector { - // AArch64 uses vqtbl1q_u8 for byte shuffle vqtbl1q_u8(data, mask) } @@ -101,16 +104,13 @@ impl ArchOps for AArch64Ops { b: Self::Vector, mask: Self::Vector, ) -> Self::Vector { - // AArch64 needs explicit MSB mask creation and uses vbslq_u8 let msb_mask = vcltq_s8(vreinterpretq_s8_u8(mask), vdupq_n_s8(0)); - vbslq_u8(msb_mask, b, a) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 with 0 for shifting vextq_u8(vdupq_n_u8(0), vector, 8) } @@ -123,7 +123,6 @@ impl ArchOps for AArch64Ops { #[inline] #[target_feature(enable = "neon")] unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector { - // Create a mask based on MSB for AArch64 vcltq_s8(vreinterpretq_s8_u8(vector), vdupq_n_s8(0)) } @@ -136,14 +135,12 @@ impl ArchOps for AArch64Ops { #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 4) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 with 0 for shifting vextq_u8(vdupq_n_u8(0), vector, 12) // 16-4=12 } @@ -158,56 +155,48 @@ impl ArchOps for AArch64Ops { // For high=false, place in the low 32 bits of the low 64 bits result = vreinterpretq_u64_u32(vsetq_lane_u32(value, vreinterpretq_u32_u64(result), 0)); } - vreinterpretq_u8_u64(result) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 with 0 for shifting left vextq_u8(vdupq_n_u8(0), vector, 12) // 16-4=12 } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting right vextq_u8(vector, vdupq_n_u8(0), 4) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 8) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_5(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 5) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_6(&self, vector: Self::Vector) -> 
Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 6) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 7) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 12) } @@ -216,7 +205,6 @@ impl ArchOps for AArch64Ops { unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { let low_32 = vgetq_lane_u32(vreinterpretq_u32_u8(vector), 0); let result = vsetq_lane_u32(low_32, vdupq_n_u32(0), 3); - vreinterpretq_u8_u32(result) } @@ -232,7 +220,6 @@ impl ArchOps for AArch64Ops { #[inline] #[target_feature(enable = "aes")] unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { - // Low 64 bits of a, high 64 bits of b let a_low = vgetq_lane_p64(vreinterpretq_p64_u8(a), 1); let b_high = vgetq_lane_p64(vreinterpretq_p64_u8(b), 0); vreinterpretq_u8_p128(vmull_p64(a_low, b_high)) @@ -257,19 +244,6 @@ impl ArchOps for AArch64Ops { } #[inline] - #[cfg(target_feature = "sha3")] - #[target_feature(enable = "sha3")] - unsafe fn xor3_vectors( - &self, - a: Self::Vector, - b: Self::Vector, - c: Self::Vector, - ) -> Self::Vector { - veor3q_u8(a, b, c) - } - - #[inline] - #[cfg(not(target_feature = "sha3"))] #[target_feature(enable = "neon")] unsafe fn xor3_vectors( &self, @@ -277,13 +251,11 @@ impl ArchOps for AArch64Ops { b: Self::Vector, c: Self::Vector, ) -> Self::Vector { - // Fallback for when SHA3 is not available veorq_u8(veorq_u8(a, b), c) } } -impl AArch64Ops { - // Helper methods specific to AArch64 +impl Aarch64AesOps { #[inline] #[target_feature(enable = "neon")] unsafe fn load_key_pair(&self, idx1: u64, idx2: u64) -> uint8x16_t { diff --git a/src/arch/aarch64/aes_sha3.rs b/src/arch/aarch64/aes_sha3.rs new file mode 100644 index 0000000..bc3d365 --- /dev/null +++ b/src/arch/aarch64/aes_sha3.rs @@ -0,0 +1,201 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides AArch64-specific implementations of the ArchOps trait for architectures +//! with AES+SHA3 support. 
+ +#![cfg(target_arch = "aarch64")] + +use crate::arch::aarch64::aes::Aarch64AesOps; +use crate::traits::ArchOps; +use std::arch::aarch64::*; + +/// AArch64 AES+SHA3 tier - delegates to AES tier and overrides XOR3 operations +/// Provides EOR3 instruction for optimal XOR3 performance +#[derive(Debug, Copy, Clone)] +pub struct Aarch64AesSha3Ops(Aarch64AesOps); + +impl Aarch64AesSha3Ops { + #[inline(always)] + pub fn new() -> Self { + Self(Aarch64AesOps) + } +} + +// SHA3 tier implementation - delegates to AES tier and overrides XOR3 +impl ArchOps for Aarch64AesSha3Ops { + type Vector = uint8x16_t; + + // Delegate methods to the base AES implementation + #[inline(always)] + unsafe fn create_vector_from_u64_pair( + &self, + high: u64, + low: u64, + reflected: bool, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair(high, low, reflected) + } + + #[inline(always)] + unsafe fn create_vector_from_u64_pair_non_reflected( + &self, + high: u64, + low: u64, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair_non_reflected(high, low) + } + + #[inline(always)] + unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { + self.0.create_vector_from_u64(value, high) + } + + #[inline(always)] + unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_u64s(vector) + } + + #[inline(always)] + unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_poly64s(vector) + } + + #[inline(always)] + unsafe fn xor_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.xor_vectors(a, b) + } + + #[inline(always)] + unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { + self.0.load_bytes(ptr) + } + + #[inline(always)] + unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { + self.0.load_aligned(ptr) + } + + #[inline(always)] + unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector { + self.0.shuffle_bytes(data, mask) + } + + #[inline(always)] + unsafe fn blend_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + mask: Self::Vector, + ) -> Self::Vector { + self.0.blend_vectors(a, b, mask) + } + + #[inline(always)] + unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_8(vector) + } + + #[inline(always)] + unsafe fn set_all_bytes(&self, value: u8) -> Self::Vector { + self.0.set_all_bytes(value) + } + + #[inline(always)] + unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector { + self.0.create_compare_mask(vector) + } + + #[inline(always)] + unsafe fn and_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.and_vectors(a, b) + } + + #[inline(always)] + unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_32(vector) + } + + #[inline(always)] + unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_32(vector) + } + + #[inline(always)] + unsafe fn create_vector_from_u32(&self, value: u32, high: bool) -> Self::Vector { + self.0.create_vector_from_u32(value, high) + } + + #[inline(always)] + unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_4(vector) + } + + #[inline(always)] + unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_4(vector) + } + + #[inline(always)] + unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_8(vector) + } + + #[inline(always)] + unsafe fn 
shift_right_5(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_5(vector) + } + + #[inline(always)] + unsafe fn shift_right_6(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_6(vector) + } + + #[inline(always)] + unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_7(vector) + } + + #[inline(always)] + unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_12(vector) + } + + #[inline(always)] + unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_12(vector) + } + + #[inline(always)] + unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_00(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_01(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_10(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_11(a, b) + } + + // Override XOR3 to use SHA3 EOR3 instruction when available + #[inline] + #[target_feature(enable = "sha3")] + unsafe fn xor3_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + c: Self::Vector, + ) -> Self::Vector { + // SHA3 tier always uses EOR3 instruction + // Feature detection is handled at the dispatch level + veor3q_u8(a, b, c) + } +} diff --git a/src/arch/aarch64/mod.rs b/src/arch/aarch64/mod.rs new file mode 100644 index 0000000..4f04674 --- /dev/null +++ b/src/arch/aarch64/mod.rs @@ -0,0 +1,8 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides AArch64-specific implementations of the ArchOps trait. + +#![cfg(target_arch = "aarch64")] + +pub mod aes; +pub mod aes_sha3; diff --git a/src/arch/mod.rs b/src/arch/mod.rs index 669ba2c..aef965d 100644 --- a/src/arch/mod.rs +++ b/src/arch/mod.rs @@ -3,167 +3,173 @@ //! This module provides the main entry point for the SIMD CRC calculation. //! //! It dispatches to the appropriate architecture-specific implementation -//! based on the target architecture. 
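The dispatch rewrite below replaces compile-time `#[cfg]`/`rustversion` branching with a single runtime lookup via `get_arch_ops()`. The new `feature_detection` module itself is not shown in this hunk, so the following is only a sketch of the detect-once-then-match shape, assuming a `OnceLock` cache; the enum variants and detection body are simplified stand-ins:

```rust
// Illustrative sketch of cached runtime tier selection, in the spirit of
// the `get_arch_ops()` calls below. Not the crate's real implementation.
use std::sync::OnceLock;

#[derive(Debug, Clone, Copy)]
enum Tier {
    Aarch64AesSha3,
    Aarch64Aes,
    SoftwareFallback,
}

fn detect() -> Tier {
    #[cfg(target_arch = "aarch64")]
    {
        // NEON is implied by AES on AArch64; SHA3 unlocks the EOR3 tier.
        if std::arch::is_aarch64_feature_detected!("aes") {
            if std::arch::is_aarch64_feature_detected!("sha3") {
                return Tier::Aarch64AesSha3;
            }
            return Tier::Aarch64Aes;
        }
    }
    Tier::SoftwareFallback
}

// Detection runs once per process; every subsequent `update()` call pays
// only for an enum match, not for repeated CPUID/HWCAP queries.
fn get_tier() -> Tier {
    static TIER: OnceLock<Tier> = OnceLock::new();
    *TIER.get_or_init(detect)
}

fn main() {
    println!("selected tier: {:?}", get_tier());
}
```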
#[cfg(target_arch = "aarch64")] use std::arch::is_aarch64_feature_detected; -#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] -use crate::algorithm; - use crate::CrcParams; -#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] -use crate::structs::{Width32, Width64}; - #[cfg(target_arch = "aarch64")] -use aarch64::AArch64Ops; +use crate::arch::aarch64::aes::Aarch64AesOps; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use x86::X86Ops; +#[cfg(target_arch = "aarch64")] +use crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops; -#[rustversion::since(1.89)] -#[cfg(target_arch = "x86_64")] -use vpclmulqdq::Vpclmulqdq512Ops; +#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] +use crate::{ + algorithm, + structs::{Width32, Width64}, +}; -mod aarch64; -mod software; -mod vpclmulqdq; -mod x86; +pub mod aarch64; +pub mod software; +pub mod x86; +pub mod x86_64; /// Main entry point that dispatches to the appropriate architecture /// -/// /// # Safety /// May use native CPU features -#[inline] +#[inline(always)] #[cfg(target_arch = "aarch64")] -#[target_feature(enable = "aes")] pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - let ops = AArch64Ops; + use crate::feature_detection::{get_arch_ops, ArchOpsInstance}; + + match get_arch_ops() { + ArchOpsInstance::Aarch64AesSha3(ops) => update_aarch64_aes_sha3(state, bytes, params, *ops), + ArchOpsInstance::Aarch64Aes(ops) => update_aarch64_aes(state, bytes, params, *ops), + ArchOpsInstance::SoftwareFallback => { + if !is_aarch64_feature_detected!("aes") || !is_aarch64_feature_detected!("neon") { + #[cfg(any(not(target_feature = "aes"), not(target_feature = "neon")))] + { + // Use software implementation when no SIMD support is available + return crate::arch::software::update(state, bytes, params); + } + } - match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => algorithm::update::(state as u32, bytes, params, &ops) as u64, - _ => panic!("Unsupported CRC width: {}", params.width), + // This should likely never happen, but just in case + panic!("aarch64 features missing (NEON and/or AES)"); + } } } -#[rustversion::before(1.89)] -#[inline] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] -pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - update_x86_sse(state, bytes, params) -} - -#[rustversion::since(1.89)] #[inline] -#[cfg(target_arch = "x86")] -#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] -pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - update_x86_sse(state, bytes, params) +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "aes")] +unsafe fn update_aarch64_aes( + state: u64, + bytes: &[u8], + params: CrcParams, + ops: Aarch64AesOps, +) -> u64 { + match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, &ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, &ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + } } -#[rustversion::since(1.89)] #[inline] -#[cfg(target_arch = "x86_64")] -#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] -pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - use std::arch::is_x86_feature_detected; - - if bytes.len() >= 256 - && is_x86_feature_detected!("vpclmulqdq") - && is_x86_feature_detected!("avx512f") - && 
is_x86_feature_detected!("avx512vl") - { - let ops = Vpclmulqdq512Ops::new(); - - return match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => algorithm::update::(state as u32, bytes, params, &ops) - as u64, - _ => panic!("Unsupported CRC width: {}", params.width), - }; +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "aes,sha3")] +unsafe fn update_aarch64_aes_sha3( + state: u64, + bytes: &[u8], + params: CrcParams, + ops: Aarch64AesSha3Ops, +) -> u64 { + match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, &ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, &ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), } - - // fallback to the standard x86 SSE implementation - update_x86_sse(state, bytes, params) } -#[inline] -#[cfg(all( - not(target_arch = "x86"), - not(target_arch = "x86_64"), - not(target_arch = "aarch64") -))] +/// Main entry point for x86/x86_64 (Rust 1.89+ which supports AVX-512) +/// +/// # Safety +/// May use native CPU features +#[rustversion::since(1.89)] +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - software::update(state, bytes, params) -} + use crate::feature_detection::{get_arch_ops, ArchOpsInstance}; -#[inline] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] -unsafe fn update_x86_sse(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - let ops = X86Ops; + match get_arch_ops() { + #[cfg(target_arch = "x86_64")] + ArchOpsInstance::X86_64Avx512Vpclmulqdq(ops) => match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + }, + #[cfg(target_arch = "x86_64")] + ArchOpsInstance::X86_64Avx512Pclmulqdq(ops) => match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + }, + ArchOpsInstance::X86SsePclmulqdq(ops) => match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + }, + ArchOpsInstance::SoftwareFallback => { + #[cfg(target_arch = "x86")] + crate::arch::x86_software_update(state, bytes, params); - match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => algorithm::update::(state as u32, bytes, params, &ops) as u64, - _ => panic!("Unsupported CRC width: {}", params.width), + // This should never happen, but just in case + panic!("x86 features missing (SSE4.1 && PCLMULQDQ)"); + } } } +/// Main entry point for x86/x86_64 (Rust < 1.89 with no AVX-512 support) +/// +/// # Safety +/// May use native CPU features #[rustversion::before(1.89)] -pub fn get_target() -> String { - #[cfg(target_arch = "aarch64")] - { - if is_aarch64_feature_detected!("sha3") { - return "aarch64-neon-eor3-pclmulqdq".to_string(); - } +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + use crate::feature_detection::{get_arch_ops, ArchOpsInstance}; - 
"aarch64-neon-pclmulqdq".to_string() + match get_arch_ops() { + ArchOpsInstance::X86SsePclmulqdq(ops) => match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + }, + ArchOpsInstance::SoftwareFallback => x86_software_update(state, bytes, params), } - - #[allow(unreachable_code)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - return "x86-sse-pclmulqdq".to_string(); - - #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))] - return "software-fallback-tables".to_string(); } -#[rustversion::since(1.89)] -pub fn get_target() -> String { - #[cfg(target_arch = "aarch64")] - { - if is_aarch64_feature_detected!("sha3") { - return "aarch64-neon-eor3-pclmulqdq".to_string(); - } - - "aarch64-neon-pclmulqdq".to_string() - } - - #[cfg(target_arch = "x86_64")] - { - if is_x86_feature_detected!("vpclmulqdq") - && is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("avx512vl") +#[inline(always)] +#[allow(unused)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn x86_software_update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + if !is_x86_feature_detected!("sse4.1") || !is_x86_feature_detected!("pclmulqdq") { + #[cfg(all( + target_arch = "x86", + any(not(target_feature = "sse4.1"), not(target_feature = "pclmulqdq")) + ))] { - return "x86_64-avx512-vpclmulqdq".to_string(); - } - - if is_x86_feature_detected!("avx2") { - return "x86_64-avx2-pclmulqdq".to_string(); + // Use software implementation when no SIMD support is available + crate::arch::software::update(state, bytes, params); } } - #[allow(unreachable_code)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - return "x86-sse-pclmulqdq".to_string(); + // This should never happen, but just in case + panic!("x86 features missing (SSE4.1 && PCLMULQDQ)"); +} - #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))] - return "software-fallback-tables".to_string(); +#[inline] +#[cfg(all( + not(target_arch = "x86"), + not(target_arch = "x86_64"), + not(target_arch = "aarch64") +))] +pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + crate::arch::software::update(state, bytes, params) } #[cfg(test)] diff --git a/src/arch/software.rs b/src/arch/software.rs index 4739a1b..9cac286 100644 --- a/src/arch/software.rs +++ b/src/arch/software.rs @@ -1,66 +1,102 @@ // Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. //! This module contains a software fallback for unsupported architectures. - -#![cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] +//! +//! Software fallback is conditionally compiled based on target architecture: +//! - Always included for non-SIMD architectures (not x86/x86_64/aarch64) +//! - Included for x86 when SSE4.1/PCLMULQDQ may not be available +//! - Included for aarch64 for runtime fallback when AES is not detected +//! 
- Excluded for x86_64 since SSE4.1/PCLMULQDQ are always available (but included for testing) + +#![cfg(any( + // Non-aarch64/x86/x86_64 architectures always need software fallback + not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")), + // x86 may not have SSE4.1/PCLMULQDQ support + all(target_arch = "x86", any(not(target_feature = "sse4.1"), not(target_feature = "pclmulqdq"))), + // aarch64 needs software fallback for runtime detection when AES is not available... + // NEON doesn't guarantee AES, so for rare outlier CPUs this might not work 100%... + all(target_arch = "aarch64", not(target_feature = "aes")), + // Include for testing on all architectures + test +))] use crate::consts::CRC_64_NVME; use crate::CrcAlgorithm; use crate::CrcParams; use crc::{Algorithm, Table}; +#[allow(unused)] const RUST_CRC32_AIXM: crc::Crc> = crc::Crc::>::new(&crc::CRC_32_AIXM); +#[allow(unused)] const RUST_CRC32_AUTOSAR: crc::Crc> = crc::Crc::>::new(&crc::CRC_32_AUTOSAR); +#[allow(unused)] const RUST_CRC32_BASE91_D: crc::Crc> = crc::Crc::>::new(&crc::CRC_32_BASE91_D); +#[allow(unused)] const RUST_CRC32_BZIP2: crc::Crc> = crc::Crc::>::new(&crc::CRC_32_BZIP2); +#[allow(unused)] const RUST_CRC32_CD_ROM_EDC: crc::Crc> = crc::Crc::>::new(&crc::CRC_32_CD_ROM_EDC); +#[allow(unused)] const RUST_CRC32_CKSUM: crc::Crc> = crc::Crc::>::new(&crc::CRC_32_CKSUM); +#[allow(unused)] const RUST_CRC32_ISCSI: crc::Crc> = crc::Crc::>::new(&crc::CRC_32_ISCSI); +#[allow(unused)] const RUST_CRC32_ISO_HDLC: crc::Crc> = crc::Crc::>::new(&crc::CRC_32_ISO_HDLC); +#[allow(unused)] const RUST_CRC32_JAMCRC: crc::Crc> = crc::Crc::>::new(&crc::CRC_32_JAMCRC); +#[allow(unused)] const RUST_CRC32_MEF: crc::Crc> = crc::Crc::>::new(&crc::CRC_32_MEF); +#[allow(unused)] const RUST_CRC32_MPEG_2: crc::Crc> = crc::Crc::>::new(&crc::CRC_32_MPEG_2); +#[allow(unused)] const RUST_CRC32_XFER: crc::Crc> = crc::Crc::>::new(&crc::CRC_32_XFER); +#[allow(unused)] const RUST_CRC64_ECMA_182: crc::Crc> = crc::Crc::>::new(&crc::CRC_64_ECMA_182); +#[allow(unused)] const RUST_CRC64_GO_ISO: crc::Crc> = crc::Crc::>::new(&crc::CRC_64_GO_ISO); +#[allow(unused)] const RUST_CRC64_MS: crc::Crc> = crc::Crc::>::new(&crc::CRC_64_MS); +#[allow(unused)] const RUST_CRC64_NVME: crc::Crc> = crc::Crc::>::new(&CRC_64_NVME); +#[allow(unused)] const RUST_CRC64_REDIS: crc::Crc> = crc::Crc::>::new(&crc::CRC_64_REDIS); +#[allow(unused)] const RUST_CRC64_WE: crc::Crc> = crc::Crc::>::new(&crc::CRC_64_WE); +#[allow(unused)] const RUST_CRC64_XZ: crc::Crc> = crc::Crc::>::new(&crc::CRC_64_XZ); +#[allow(unused)] // Dispatch function that handles the generic case pub(crate) fn update(state: u64, data: &[u8], params: CrcParams) -> u64 { match params.width { @@ -80,7 +116,7 @@ pub(crate) fn update(state: u64, data: &[u8], params: CrcParams) -> u64 { CrcAlgorithm::Crc32Xfer => RUST_CRC32_XFER, CrcAlgorithm::Crc32Custom => { let algorithm: Algorithm = Algorithm { - width: params.width as u8, + width: params.width, poly: params.poly as u32, init: params.init as u32, refin: params.refin, @@ -110,13 +146,13 @@ pub(crate) fn update(state: u64, data: &[u8], params: CrcParams) -> u64 { CrcAlgorithm::Crc64Xz => RUST_CRC64_XZ, CrcAlgorithm::Crc64Custom => { let algorithm: Algorithm = Algorithm { - width: params.width as u8, - poly: params.poly as u64, - init: params.init as u64, + width: params.width, + poly: params.poly, + init: params.init, refin: params.refin, refout: params.refout, - xorout: params.xorout as u64, - check: params.check as u64, + xorout: params.xorout, + check: 
params.check, residue: 0x0000000000000000, // unused in this context }; diff --git a/src/arch/x86/mod.rs b/src/arch/x86/mod.rs new file mode 100644 index 0000000..bead067 --- /dev/null +++ b/src/arch/x86/mod.rs @@ -0,0 +1,7 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides x86-specific implementations of the ArchOps trait. + +#![cfg(any(target_arch = "x86", target_arch = "x86_64"))] + +pub mod sse; diff --git a/src/arch/x86.rs b/src/arch/x86/sse.rs similarity index 85% rename from src/arch/x86.rs rename to src/arch/x86/sse.rs index 08a8b4a..adaf8b8 100644 --- a/src/arch/x86.rs +++ b/src/arch/x86/sse.rs @@ -4,7 +4,7 @@ //! //! This module is designed to work with both x86 and x86_64 architectures. //! -//! It uses the SSE2 and SSE4.1 instruction sets for SIMD operations. +//! It uses the SSE4.1 instruction sets for SIMD operations. #![cfg(any(target_arch = "x86", target_arch = "x86_64"))] @@ -16,10 +16,15 @@ use std::arch::x86_64::*; use crate::traits::ArchOps; +// Tier-specific ArchOps implementations for x86/x86_64 + +/// Base x86/x86_64 SSE+PCLMULQDQ implementation - baseline performance for both architectures +/// Uses SSE4.1 and PCLMULQDQ instructions #[derive(Debug, Copy, Clone)] -pub struct X86Ops; +pub struct X86SsePclmulqdqOps; -impl ArchOps for X86Ops { +// Base implementation for x86/x86_64 SSE+PCLMULQDQ tier +impl ArchOps for X86SsePclmulqdqOps { type Vector = __m128i; #[inline] @@ -52,7 +57,6 @@ impl ArchOps for X86Ops { #[inline] #[target_feature(enable = "sse4.1")] unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { - // x86 uses custom helper self.create_u64_vector(value, high) } @@ -78,21 +82,18 @@ impl ArchOps for X86Ops { #[inline] #[target_feature(enable = "sse2")] unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { - // x86 requires cast to __m128i* _mm_loadu_si128(ptr as *const __m128i) } #[inline] #[target_feature(enable = "sse2")] unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { - // x86 requires cast to __m128i* _mm_loadu_si128(ptr as *const __m128i) } #[inline] #[target_feature(enable = "ssse3")] unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector { - // x86 uses specific SSSE3 instruction _mm_shuffle_epi8(data, mask) } @@ -104,14 +105,12 @@ impl ArchOps for X86Ops { b: Self::Vector, mask: Self::Vector, ) -> Self::Vector { - // x86 has native blend that uses MSB automatically _mm_blendv_epi8(a, b, mask) } #[inline] #[target_feature(enable = "sse2")] unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { - // x86 has a dedicated shift instruction _mm_slli_si128(vector, 8) } @@ -227,23 +226,6 @@ impl ArchOps for X86Ops { _mm_clmulepi64_si128(a, b, 0x11) } - #[rustversion::since(1.89)] - #[inline] - #[target_feature(enable = "sse4.1")] - unsafe fn xor3_vectors( - &self, - a: Self::Vector, - b: Self::Vector, - c: Self::Vector, - ) -> Self::Vector { - if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { - return self.xor3_vectors_avx512(a, b, c); - } - - self.xor3_vectors_sse(a, b, c) - } - - #[rustversion::before(1.89)] #[inline] #[target_feature(enable = "sse4.1")] unsafe fn xor3_vectors( @@ -252,12 +234,12 @@ impl ArchOps for X86Ops { b: Self::Vector, c: Self::Vector, ) -> Self::Vector { - self.xor3_vectors_sse(a, b, c) + // SSE tier always uses two XOR operations + _mm_xor_si128(_mm_xor_si128(a, b), c) } } -impl X86Ops { - // Helper methods specific to x86/x86_64 +impl 
X86SsePclmulqdqOps { #[inline] #[target_feature(enable = "sse2")] unsafe fn set_epi64x(&self, e1: u64, e0: u64) -> __m128i { @@ -318,20 +300,4 @@ impl X86Ops { lo | (hi << 32) } } - - #[rustversion::since(1.89)] - #[inline] - #[target_feature(enable = "avx512f,avx512vl")] - unsafe fn xor3_vectors_avx512(&self, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - _mm_ternarylogic_epi64( - a, b, c, 0x96, // XOR3 - ) - } - - #[inline] - #[target_feature(enable = "sse4.1")] - unsafe fn xor3_vectors_sse(&self, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - // x86 doesn't have native XOR3 in SSE, use two XORs - _mm_xor_si128(_mm_xor_si128(a, b), c) - } } diff --git a/src/arch/x86_64/avx512.rs b/src/arch/x86_64/avx512.rs new file mode 100644 index 0000000..d26ec2f --- /dev/null +++ b/src/arch/x86_64/avx512.rs @@ -0,0 +1,220 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides x86_64-specific implementations of the ArchOps trait. +//! +//! It uses the AVX-512 instruction sets for SIMD operations. + +#![cfg(target_arch = "x86_64")] + +#[rustversion::since(1.89)] +use std::arch::x86_64::*; + +#[rustversion::since(1.89)] +use crate::arch::x86::sse::X86SsePclmulqdqOps; +#[rustversion::since(1.89)] +use crate::traits::ArchOps; + +/// x86_64-only AVX512+PCLMULQDQ tier - delegates to SSE tier and overrides XOR3 operations +/// Uses AVX512 ternary logic for XOR3 operations with PCLMULQDQ +#[rustversion::since(1.89)] +#[derive(Debug, Copy, Clone)] +pub struct X86_64Avx512PclmulqdqOps(X86SsePclmulqdqOps); + +#[rustversion::since(1.89)] +impl X86_64Avx512PclmulqdqOps { + #[inline(always)] + pub fn new() -> Self { + Self(X86SsePclmulqdqOps) + } +} + +#[rustversion::since(1.89)] +impl ArchOps for X86_64Avx512PclmulqdqOps { + type Vector = __m128i; + + // Delegate all methods to the base SSE implementation + #[inline(always)] + unsafe fn create_vector_from_u64_pair( + &self, + high: u64, + low: u64, + reflected: bool, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair(high, low, reflected) + } + + #[inline(always)] + unsafe fn create_vector_from_u64_pair_non_reflected( + &self, + high: u64, + low: u64, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair_non_reflected(high, low) + } + + #[inline(always)] + unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { + self.0.create_vector_from_u64(value, high) + } + + #[inline(always)] + unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_u64s(vector) + } + + #[inline(always)] + unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_poly64s(vector) + } + + #[inline(always)] + unsafe fn xor_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.xor_vectors(a, b) + } + + #[inline(always)] + unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { + self.0.load_bytes(ptr) + } + + #[inline(always)] + unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { + self.0.load_aligned(ptr) + } + + #[inline(always)] + unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector { + self.0.shuffle_bytes(data, mask) + } + + #[inline(always)] + unsafe fn blend_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + mask: Self::Vector, + ) -> Self::Vector { + self.0.blend_vectors(a, b, mask) + } + + #[inline(always)] + unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_8(vector) + } + + #[inline(always)] + unsafe fn 
set_all_bytes(&self, value: u8) -> Self::Vector { + self.0.set_all_bytes(value) + } + + #[inline(always)] + unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector { + self.0.create_compare_mask(vector) + } + + #[inline(always)] + unsafe fn and_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.and_vectors(a, b) + } + + #[inline(always)] + unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_32(vector) + } + + #[inline(always)] + unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_32(vector) + } + + #[inline(always)] + unsafe fn create_vector_from_u32(&self, value: u32, high: bool) -> Self::Vector { + self.0.create_vector_from_u32(value, high) + } + + #[inline(always)] + unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_4(vector) + } + + #[inline(always)] + unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_4(vector) + } + + #[inline(always)] + unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_8(vector) + } + + #[inline(always)] + unsafe fn shift_right_5(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_5(vector) + } + + #[inline(always)] + unsafe fn shift_right_6(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_6(vector) + } + + #[inline(always)] + unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_7(vector) + } + + #[inline(always)] + unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_12(vector) + } + + #[inline(always)] + unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_12(vector) + } + + #[inline(always)] + unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_00(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_01(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_10(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_11(a, b) + } + + #[rustversion::since(1.89)] + #[inline] + #[target_feature(enable = "avx512vl")] + unsafe fn xor3_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + c: Self::Vector, + ) -> Self::Vector { + // AVX512 tier always uses ternary logic + _mm_ternarylogic_epi64(a, b, c, 0x96) // XOR3 operation + } + + // Fallback for older Rust versions + #[rustversion::before(1.89)] + #[inline(always)] + unsafe fn xor3_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + c: Self::Vector, + ) -> Self::Vector { + // Rust < 1.89 doesn't have _mm_ternarylogic_epi64, fall back to SSE + self.0.xor3_vectors(a, b, c) + } +} diff --git a/src/arch/x86_64/avx512_vpclmulqdq.rs b/src/arch/x86_64/avx512_vpclmulqdq.rs new file mode 100644 index 0000000..881ca7e --- /dev/null +++ b/src/arch/x86_64/avx512_vpclmulqdq.rs @@ -0,0 +1,630 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides AVX-512 and VPCLMULQDQ-specific implementations of the ArchOps trait. +//! +//! It performs folding using 4 x ZMM registers of 512-bits each. 
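A note on the recurring `0x96` immediate before the implementation: `_mm512_ternarylogic_epi64` (and the 128-bit variant used elsewhere in this patch) evaluates an arbitrary three-input boolean function per bit, with the 8-bit immediate serving as the truth table indexed by the bit triple `(a, b, c)`. This plain-Rust check, which assumes nothing beyond that documented truth-table encoding, confirms that `0x96` is exactly three-way XOR:

```rust
// Verify that the ternary-logic immediate 0x96 encodes a ^ b ^ c for every
// possible input bit combination.
fn main() {
    const IMM: u8 = 0x96; // 0b1001_0110
    for bits in 0u8..8 {
        let (a, b, c) = ((bits >> 2) & 1, (bits >> 1) & 1, bits & 1);
        // Bit (a*4 + b*2 + c) of the immediate is the output for (a, b, c).
        let table_out = (IMM >> ((a << 2) | (b << 1) | c)) & 1;
        assert_eq!(table_out, a ^ b ^ c);
    }
    println!("0x96 is XOR3 for all 8 input combinations");
}
```

Fusing both XORs of a fold step into one ternary-logic operation is what lets `fold_64` below combine the two carryless-multiply halves with the incoming data in a single instruction.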
+ +#![cfg(target_arch = "x86_64")] + +#[rustversion::since(1.89)] +use crate::arch::x86::sse::X86SsePclmulqdqOps; + +#[rustversion::since(1.89)] +use crate::enums::Reflector; + +#[rustversion::since(1.89)] +use crate::structs::CrcState; + +#[rustversion::since(1.89)] +use crate::traits::{ArchOps, EnhancedCrcWidth}; + +#[rustversion::since(1.89)] +use std::arch::x86_64::*; + +#[rustversion::since(1.89)] +use std::ops::BitXor; + +/// Implements the ArchOps trait using 512-bit AVX-512 and VPCLMULQDQ instructions at 512 bits. +/// Delegates to X86SsePclmulqdqOps for standard 128-bit operations +#[rustversion::since(1.89)] +#[derive(Debug, Copy, Clone)] +pub struct X86_64Avx512VpclmulqdqOps(X86SsePclmulqdqOps); + +#[rustversion::since(1.89)] +impl X86_64Avx512VpclmulqdqOps { + #[inline(always)] + pub fn new() -> Self { + Self(X86SsePclmulqdqOps) + } +} + +// Wrapper for __m512i to make it easier to work with +#[rustversion::since(1.89)] +#[derive(Debug, Copy, Clone)] +struct Simd512(__m512i); + +#[rustversion::since(1.89)] +impl Simd512 { + #[inline] + #[target_feature(enable = "avx512f")] + #[allow(clippy::too_many_arguments)] + unsafe fn new(x7: u64, x6: u64, x5: u64, x4: u64, x3: u64, x2: u64, x1: u64, x0: u64) -> Self { + Self(_mm512_set_epi64( + x7 as i64, x6 as i64, x5 as i64, x4 as i64, x3 as i64, x2 as i64, x1 as i64, x0 as i64, + )) + } + + #[inline] + #[target_feature(enable = "avx512vl,vpclmulqdq")] + unsafe fn fold_64(&self, coeff: &Self, new_data: &Self) -> Self { + // Use 512-bit ternary logic XOR3 with carryless multiplication + Self(_mm512_ternarylogic_epi64( + _mm512_clmulepi64_epi128(self.0, coeff.0, 0), // Low parts + _mm512_clmulepi64_epi128(self.0, coeff.0, 17), // High parts + new_data.0, + 0x96, // XOR3 operation + )) + } + + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn extract_u64s(&self) -> [u64; 8] { + let mut result = [0u64; 8]; + _mm512_storeu_si512(result.as_mut_ptr().cast(), self.0); + + result + } + + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn load_from_ptr(ptr: *const u8) -> Self { + Self(_mm512_loadu_si512(ptr as *const __m512i)) + } + + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn to_128i_extract(self) -> __m128i { + _mm512_extracti32x4_epi32(self.0, INDEX) + } + + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn xor(&self, other: &Self) -> Self { + Self(_mm512_xor_si512(self.0, other.0)) + } + + #[inline] + #[target_feature(enable = "avx512f")] + #[allow(unused)] + unsafe fn print_hex(&self, prefix: &str) { + let values = self.extract_u64s(); + println!( + "{}={:#016x}_{:016x}_{:016x}_{:016x}_{:016x}_{:016x}_{:016x}_{:016x}", + prefix, + values[7], + values[6], + values[5], + values[4], + values[3], + values[2], + values[1], + values[0] + ); + } +} + +#[rustversion::since(1.89)] +impl X86_64Avx512VpclmulqdqOps { + /// Process aligned blocks using VPCLMULQDQ with 4 x 512-bit registers + /// + /// Note that #[inline(always)] loses the inlining performance boost, despite no native + /// target_features being used directly. Odd since that's not how Rust's docs make it sound... 
+ #[inline] + #[target_feature(enable = "avx512vl,avx512bw,vpclmulqdq")] + unsafe fn process_blocks( + &self, + state: &mut CrcState<::Vector>, + first: &[__m128i; 8], + rest: &[[__m128i; 8]], + keys: [u64; 23], + reflected: bool, + ) -> W::Value + where + W::Value: Copy + BitXor, + { + let state_u64s = self.extract_u64s(state.value); + + let positioned_state = if reflected { + Simd512::new(0, 0, 0, 0, 0, 0, 0, state_u64s[0]) + } else { + Simd512::new(state_u64s[1], 0, 0, 0, 0, 0, 0, 0) + }; + + let reflector = create_reflector512(reflected); + + // Load first 256 bytes (2nd half is rest[0] since these are 128-byte blocks) + let first_ptr = first.as_ptr() as *const u8; + let first_rest_ptr = rest[0].as_ptr() as *const u8; + + let mut x = [ + reflect_bytes512(&reflector, Simd512::load_from_ptr(first_ptr)), + reflect_bytes512(&reflector, Simd512::load_from_ptr(first_ptr.add(64))), + reflect_bytes512(&reflector, Simd512::load_from_ptr(first_rest_ptr)), + reflect_bytes512(&reflector, Simd512::load_from_ptr(first_rest_ptr.add(64))), + ]; + + x[0] = positioned_state.xor(&x[0]); + + let coeff = self.create_avx512_256byte_coefficient(keys, reflected); + + let remaining_rest = &rest[1..]; + let pair_count = remaining_rest.len() / 2; + + for i in 0..pair_count { + let block1_ptr = remaining_rest[i * 2].as_ptr() as *const u8; + let block2_ptr = remaining_rest[i * 2 + 1].as_ptr() as *const u8; + + x[0] = x[0].fold_64( + &coeff, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(block1_ptr)), + ); + x[1] = x[1].fold_64( + &coeff, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(block1_ptr.add(64))), + ); + x[2] = x[2].fold_64( + &coeff, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(block2_ptr)), + ); + x[3] = x[3].fold_64( + &coeff, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(block2_ptr.add(64))), + ); + } + + let processed_pairs = pair_count * 2; + let remaining_single_count = remaining_rest.len() - processed_pairs; + + if remaining_single_count > 0 { + // We have 1 unprocessed block (128 bytes) + // Fold 4×512 down to 2×512 and process the remaining block with 2-register mode + let folded_2reg = self.fold_from_4x512_to_2x256(x, keys, reflected); + let coeff_2reg = self.create_avx512_128byte_coefficient(keys, reflected); + + let last_block_ptr = remaining_rest[processed_pairs].as_ptr() as *const u8; + + let final_x = [ + folded_2reg[0].fold_64( + &coeff_2reg, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(last_block_ptr)), + ), + folded_2reg[1].fold_64( + &coeff_2reg, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(last_block_ptr.add(64))), + ), + ]; + + let folded = self.fold_from_2x512_to_1x128(final_x, keys, reflected); + + return W::perform_final_reduction(folded, reflected, keys, self); + } + + // All blocks processed in pairs - fold from 4 x 512-bit to 1 x 128-bit + let folded = self.fold_from_4x512_to_1x128(x, keys, reflected); + + W::perform_final_reduction(folded, reflected, keys, self) + } + + /// Create a folding coefficient for AVX-512 for 128-byte folding distances + #[inline(always)] + unsafe fn create_avx512_128byte_coefficient( + &self, + keys: [u64; 23], + reflected: bool, + ) -> Simd512 { + let (k1, k2) = if reflected { + (keys[3], keys[4]) + } else { + (keys[4], keys[3]) + }; + + // Replicate the coefficient pair + Simd512::new(k1, k2, k1, k2, k1, k2, k1, k2) + } + + /// Create a folding coefficient for AVX-512 for 256-byte folding distances + #[inline(always)] + unsafe fn create_avx512_256byte_coefficient( + &self, + keys: [u64; 23], + 
reflected: bool, + ) -> Simd512 { + let (k1, k2) = if reflected { + (keys[21], keys[22]) + } else { + (keys[22], keys[21]) + }; + + // Replicate the coefficient pair + Simd512::new(k1, k2, k1, k2, k1, k2, k1, k2) + } + + /// Fold from 4 x 512-bit to 1 x 128-bit + #[inline(always)] + unsafe fn fold_from_4x512_to_1x128( + &self, + x: [Simd512; 4], + keys: [u64; 23], + reflected: bool, + ) -> __m128i { + // Step 1: Fold 4 x 512-bit to 2 x 512-bit + let x2 = self.fold_from_4x512_to_2x256(x, keys, reflected); + + // Step 2: Fold 2 x 512-bit to 1 x 128-bit + self.fold_from_2x512_to_1x128(x2, keys, reflected) + } + + /// Fold from 4 x 512-bit to 2 x 512-bit + #[inline(always)] + unsafe fn fold_from_4x512_to_2x256( + &self, + x: [Simd512; 4], + keys: [u64; 23], + reflected: bool, + ) -> [Simd512; 2] { + // This folds registers that are 128 bytes apart (x[0] with x[2], x[1] with x[3]) + let coeff = self.create_avx512_128byte_coefficient(keys, reflected); + + // Fold pairs: + // x[0] (bytes 0-63) + x[2] (bytes 128-191) → result[0] + // x[1] (bytes 64-127) + x[3] (bytes 192-255) → result[1] + [x[0].fold_64(&coeff, &x[2]), x[1].fold_64(&coeff, &x[3])] + } + + /// Fold from 2 x 512-bit to 1 x 128-bit + #[inline(always)] + unsafe fn fold_from_2x512_to_1x128( + &self, + x: [Simd512; 2], + keys: [u64; 23], + reflected: bool, + ) -> __m128i { + // Create the fold coefficients for different distances + let fold_coefficients = [ + self.create_vector_from_u64_pair(keys[10], keys[9], reflected), // 112 bytes + self.create_vector_from_u64_pair(keys[12], keys[11], reflected), // 96 bytes + self.create_vector_from_u64_pair(keys[14], keys[13], reflected), // 80 bytes + self.create_vector_from_u64_pair(keys[16], keys[15], reflected), // 64 bytes + self.create_vector_from_u64_pair(keys[18], keys[17], reflected), // 48 bytes + self.create_vector_from_u64_pair(keys[20], keys[19], reflected), // 32 bytes + self.create_vector_from_u64_pair(keys[2], keys[1], reflected), // 16 bytes + ]; + + // Extract the 8 x 128-bit vectors from the 2 x 512-bit vectors (this is faster than + // using 256-bit intrinsics for 1KiB payloads) + let v128 = if reflected { + [ + x[0].to_128i_extract::<0>(), // 256-x0.low + x[0].to_128i_extract::<1>(), // 256-x0.high + x[0].to_128i_extract::<2>(), // 256-x1.low + x[0].to_128i_extract::<3>(), // 256-x1.high + x[1].to_128i_extract::<0>(), // 256-x2.low + x[1].to_128i_extract::<1>(), // 256-x2.high + x[1].to_128i_extract::<2>(), // 256-x3.low + x[1].to_128i_extract::<3>(), // 256-x3.high + ] + } else { + [ + x[0].to_128i_extract::<3>(), // 256-x1.high + x[0].to_128i_extract::<2>(), // 256-x1.low + x[0].to_128i_extract::<1>(), // 256-x0.high + x[0].to_128i_extract::<0>(), // 256-x0.low + x[1].to_128i_extract::<3>(), // 256-x3.high + x[1].to_128i_extract::<2>(), // 256-x3.low + x[1].to_128i_extract::<1>(), // 256-x2.high + x[1].to_128i_extract::<0>(), // 256-x2.low + ] + }; + + // Fold the 8 xmm registers to 1 xmm register + let mut res = v128[7]; + + for (i, &coeff) in fold_coefficients.iter().enumerate() { + let folded_h = self.carryless_mul_00(v128[i], coeff); + let folded_l = self.carryless_mul_11(v128[i], coeff); + res = self.xor3_vectors(folded_h, folded_l, res); + } + + res + } +} + +// 512-bit version of the Reflector +#[rustversion::since(1.89)] +#[derive(Clone, Copy)] +enum Reflector512 { + NoReflector, + ForwardReflector { smask: Simd512 }, +} + +// Function to create the appropriate reflector based on CRC parameters +#[rustversion::since(1.89)] +#[inline(always)] +unsafe fn 
create_reflector512(reflected: bool) -> Reflector512 { + if reflected { + Reflector512::NoReflector + } else { + // Load shuffle mask + let smask = Simd512::new( + 0x08090a0b0c0d0e0f, + 0x0001020304050607, + 0x08090a0b0c0d0e0f, + 0x0001020304050607, + 0x08090a0b0c0d0e0f, + 0x0001020304050607, + 0x08090a0b0c0d0e0f, + 0x0001020304050607, + ); + Reflector512::ForwardReflector { smask } + } +} + +// Function to apply reflection to a 512-bit vector +#[rustversion::since(1.89)] +#[inline(always)] +unsafe fn reflect_bytes512(reflector: &Reflector512, data: Simd512) -> Simd512 { + match reflector { + Reflector512::NoReflector => data, + Reflector512::ForwardReflector { smask } => shuffle_bytes512(data, *smask), + } +} + +// pre-compute the reverse indices for 512-bit shuffling +#[rustversion::since(1.89)] +static REVERSE_INDICES_512: __m512i = + unsafe { std::mem::transmute([7u64, 6u64, 5u64, 4u64, 3u64, 2u64, 1u64, 0u64]) }; + +// Implement a 512-bit byte shuffle function +#[rustversion::since(1.89)] +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +unsafe fn shuffle_bytes512(data: Simd512, mask: Simd512) -> Simd512 { + Simd512(_mm512_permutexvar_epi64( + // Reverse the order using 512-bit permutation + REVERSE_INDICES_512, // reverse indices + _mm512_shuffle_epi8(data.0, mask.0), // shuffled data + )) +} + +// Delegate all ArchOps methods to the inner X86SsePclmulqdqOps instance +#[rustversion::since(1.89)] +impl ArchOps for X86_64Avx512VpclmulqdqOps { + type Vector = __m128i; + + #[inline(always)] + unsafe fn process_enhanced_simd_blocks( + &self, + state: &mut CrcState, + first: &[Self::Vector; 8], + rest: &[[Self::Vector; 8]], + _reflector: &Reflector, + keys: [u64; 23], + ) -> bool + where + Self::Vector: Copy, + { + // Update the state with the result + *state = W::create_state( + self.process_blocks::(state, first, rest, keys, state.reflected), + state.reflected, + self, + ); + + // Return true to indicate we handled it + true + } + + // Delegate all other methods to X86SsePclmulqdqOps + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn create_vector_from_u64_pair( + &self, + high: u64, + low: u64, + reflected: bool, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair(high, low, reflected) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn create_vector_from_u64_pair_non_reflected( + &self, + high: u64, + low: u64, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair_non_reflected(high, low) + } + + #[inline] + #[target_feature(enable = "sse4.1")] + unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { + self.0.create_vector_from_u64(value, high) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_u64s(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_poly64s(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn xor_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.xor_vectors(a, b) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { + self.0.load_bytes(ptr) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { + self.0.load_aligned(ptr) + } + + #[inline] + #[target_feature(enable = "ssse3")] + unsafe fn shuffle_bytes(&self, data: Self::Vector, 
mask: Self::Vector) -> Self::Vector { + self.0.shuffle_bytes(data, mask) + } + + #[inline] + #[target_feature(enable = "sse4.1")] + unsafe fn blend_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + mask: Self::Vector, + ) -> Self::Vector { + self.0.blend_vectors(a, b, mask) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_8(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn set_all_bytes(&self, value: u8) -> Self::Vector { + self.0.set_all_bytes(value) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector { + self.0.create_compare_mask(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn and_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.and_vectors(a, b) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_32(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_32(vector) + } + + #[inline] + #[target_feature(enable = "sse4.1")] + unsafe fn create_vector_from_u32(&self, value: u32, high: bool) -> Self::Vector { + self.0.create_vector_from_u32(value, high) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_4(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_4(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_8(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_5(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_5(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_6(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_6(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_7(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_12(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_12(vector) + } + + #[inline] + #[target_feature(enable = "pclmulqdq")] + unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_00(a, b) + } + + #[inline] + #[target_feature(enable = "pclmulqdq")] + unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_01(a, b) + } + + #[inline] + #[target_feature(enable = "pclmulqdq")] + unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_10(a, b) + } + + #[inline] + #[target_feature(enable = "pclmulqdq")] + unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_11(a, b) + } + + #[inline] + #[target_feature(enable = "avx512vl")] + unsafe fn xor3_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + 
c: Self::Vector, + ) -> Self::Vector { + // VPCLMULQDQ implementation always uses AVX512 ternary logic + // Feature detection is handled at the dispatch level + _mm_ternarylogic_epi64( + a, b, c, 0x96, // XOR3 operation + ) + } +} diff --git a/src/arch/x86_64/mod.rs b/src/arch/x86_64/mod.rs new file mode 100644 index 0000000..4d1422d --- /dev/null +++ b/src/arch/x86_64/mod.rs @@ -0,0 +1,8 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides x86_64-specific implementations of the ArchOps trait. + +#![cfg(target_arch = "x86_64")] + +pub mod avx512; +pub mod avx512_vpclmulqdq; diff --git a/src/bin/arch-check.rs b/src/bin/arch-check.rs index b1dc3ef..023b2a0 100644 --- a/src/bin/arch-check.rs +++ b/src/bin/arch-check.rs @@ -1,3 +1,5 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + //! This is a simple program that checks if the target architecture supports certain features. #[cfg(target_arch = "aarch64")] diff --git a/src/bin/checksum.rs b/src/bin/checksum.rs index dd02b08..6c017bb 100644 --- a/src/bin/checksum.rs +++ b/src/bin/checksum.rs @@ -1,6 +1,9 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + //! This is a simple program to calculate a checksum from the command line use crc_fast::{checksum, checksum_file, CrcAlgorithm}; +use rand::RngCore; use std::env; use std::process::ExitCode; use std::str::FromStr; @@ -11,6 +14,36 @@ struct Config { file: Option, string: Option, format: OutputFormat, + benchmark: Option, +} + +#[derive(Debug)] +struct BenchmarkConfig { + size: Option, + duration: f64, +} + +#[derive(Debug)] +enum BenchmarkData { + InMemory(Vec), + File(String), +} + +#[derive(Debug)] +struct BenchmarkRunner { + algorithm: CrcAlgorithm, + data: BenchmarkData, + duration: f64, +} + +#[derive(Debug)] +struct BenchmarkResult { + iterations: u64, + elapsed_seconds: f64, + throughput_gibs: f64, + time_per_iteration_nanos: f64, + acceleration_target: String, + data_size: u64, } #[derive(Debug, Clone)] @@ -19,20 +52,144 @@ enum OutputFormat { Decimal, } +impl BenchmarkConfig { + fn validate(&self) -> Result<(), String> { + if self.duration <= 0.0 { + return Err("Duration must be greater than 0".to_string()); + } + + if let Some(size) = self.size { + if size == 0 { + return Err("Size must be greater than 0".to_string()); + } + } + + Ok(()) + } +} + +impl BenchmarkRunner { + fn new(algorithm: CrcAlgorithm, data: BenchmarkData, duration: f64) -> Self { + Self { + algorithm, + data, + duration, + } + } + + fn run(&self) -> Result { + use std::time::Instant; + + let start = Instant::now(); + let mut iterations = 0u64; + + while start.elapsed().as_secs_f64() < self.duration { + match &self.data { + BenchmarkData::InMemory(data) => { + std::hint::black_box(checksum(self.algorithm, data)); + } + BenchmarkData::File(filename) => { + match checksum_file(self.algorithm, filename, None) { + Ok(result) => { + std::hint::black_box(result); + } + Err(e) => { + return Err(format!("Failed to read file during benchmark: {}", e)); + } + } + } + } + iterations += 1; + } + + let elapsed = start.elapsed().as_secs_f64(); + let data_size = match &self.data { + BenchmarkData::InMemory(data) => data.len() as u64, + BenchmarkData::File(filename) => std::fs::metadata(filename) + .map(|m| m.len()) + .map_err(|e| format!("Failed to get file size: {}", e))?, + }; + + let acceleration_target = crc_fast::get_calculator_target(self.algorithm); + + Ok(BenchmarkResult::new( + iterations, + elapsed, + acceleration_target, + 
data_size, + )) + } +} + +impl BenchmarkResult { + fn new( + iterations: u64, + elapsed_seconds: f64, + acceleration_target: String, + data_size: u64, + ) -> Self { + let throughput_gibs = if elapsed_seconds > 0.0 { + (data_size as f64 * iterations as f64) / elapsed_seconds / (1024.0 * 1024.0 * 1024.0) + } else { + 0.0 + }; + + let time_per_iteration_nanos = if iterations > 0 { + elapsed_seconds * 1_000_000_000.0 / iterations as f64 + } else { + 0.0 + }; + + Self { + iterations, + elapsed_seconds, + throughput_gibs, + time_per_iteration_nanos, + acceleration_target, + data_size, + } + } +} + +fn generate_random_data(size: usize) -> Result, String> { + // Check for reasonable size limits to prevent memory issues + if size > 1_073_741_824 { + // 1 GiB limit + return Err("Data size too large (maximum 1 GiB)".to_string()); + } + + // Use vec! macro to avoid clippy warning about slow initialization + let mut buf = vec![0u8; size]; + let mut rng = rand::rng(); + rng.fill_bytes(&mut buf); + Ok(buf) +} + fn print_usage() { println!("Usage: checksum -a algorithm [-f file] [-s string] [--format hex|decimal]"); + println!( + " checksum -a algorithm -b [--size bytes] [--duration seconds] [-f file] [-s string]" + ); println!(); println!("Example: checksum -a CRC-32/ISCSI -f myfile.txt"); println!("Example: checksum -a CRC-64/NVME -s 'Hello, world!' --format decimal"); + println!("Example: checksum -a CRC-32/ISCSI -b --size 1048576 --duration 5.0"); println!(); println!("Options:"); println!(" -a algorithm Specify the checksum algorithm (required)"); println!(" -f file Calculate checksum for the specified file"); + println!(" -h, --help Show this help message"); println!(" -s string Calculate checksum for the specified string"); println!(" --format hex|decimal Output format (default: hex)"); - println!(" -h, --help Show this help message"); println!(); - println!("Note: Either -f or -s must be provided, but not both."); + println!("Benchmarking:"); + println!(" -b Enable benchmark mode"); + println!(" --duration seconds Benchmark duration in seconds (default: 10.0)"); + println!(" --size bytes Data size for random generation in benchmark mode (default: 1048576 [1MiB])"); + println!(); + println!(); + println!("Note: In normal mode, either -f or -s must be provided, but not both."); + println!(" In benchmark mode (-b), -f or -s are optional for using specific data."); } fn parse_args() -> Result { @@ -51,6 +208,9 @@ fn parse_args() -> Result { let mut file: Option = None; let mut string: Option = None; let mut format = OutputFormat::Hex; // Default to hex + let mut benchmark_mode = false; + let mut benchmark_size: Option = None; + let mut benchmark_duration = 10.0; // Default duration let mut i = 1; // Skip program name while i < args.len() { @@ -98,6 +258,30 @@ fn parse_args() -> Result { } i += 2; } + "-b" => { + benchmark_mode = true; + i += 1; + } + "--size" => { + if i + 1 >= args.len() { + return Err("Missing size value after --size flag".to_string()); + } + benchmark_size = Some( + args[i + 1] + .parse::() + .map_err(|_| format!("Invalid size value: {}", args[i + 1]))?, + ); + i += 2; + } + "--duration" => { + if i + 1 >= args.len() { + return Err("Missing duration value after --duration flag".to_string()); + } + benchmark_duration = args[i + 1] + .parse::() + .map_err(|_| format!("Invalid duration value: {}", args[i + 1]))?; + i += 2; + } arg => { return Err(format!("Unknown argument: {}", arg)); } @@ -107,15 +291,40 @@ fn parse_args() -> Result { // Validate required arguments let algorithm = 
algorithm.ok_or("Algorithm (-a) is required")?; - if file.is_none() && string.is_none() { - return Err("Either -f (file) or -s (string) must be provided".to_string()); + // Validate mutual exclusivity between benchmark and normal modes + if !benchmark_mode && (benchmark_size.is_some() || benchmark_duration != 10.0) { + return Err("--size and --duration can only be used with -b flag".to_string()); + } + + // Create benchmark config if in benchmark mode + let benchmark = if benchmark_mode { + let config = BenchmarkConfig { + size: benchmark_size, + duration: benchmark_duration, + }; + config.validate()?; + Some(config) + } else { + None + }; + + // Validate input requirements based on mode + if benchmark.is_none() { + // Normal mode: require either file or string input + if file.is_none() && string.is_none() { + return Err( + "Either -f (file) or -s (string) must be provided in normal mode".to_string(), + ); + } } + // Benchmark mode: file and string are optional (will use generated data if neither provided) Ok(Config { algorithm, file, string, format, + benchmark, }) } @@ -123,6 +332,11 @@ fn calculate_checksum(config: &Config) -> Result<(), String> { let algorithm = CrcAlgorithm::from_str(&config.algorithm) .map_err(|_| format!("Invalid algorithm: {}", config.algorithm))?; + // Check if benchmark mode is enabled + if let Some(benchmark_config) = &config.benchmark { + return run_benchmark(config, benchmark_config, algorithm); + } + let checksum = if let Some(ref filename) = config.file { checksum_file(algorithm, filename, None).unwrap() } else if let Some(ref text) = config.string { @@ -139,6 +353,91 @@ fn calculate_checksum(config: &Config) -> Result<(), String> { Ok(()) } +fn run_benchmark( + config: &Config, + benchmark_config: &BenchmarkConfig, + algorithm: CrcAlgorithm, +) -> Result<(), String> { + // Determine data source and create BenchmarkData + let data = if let Some(ref filename) = config.file { + // Validate file exists before benchmarking + if !std::path::Path::new(filename).exists() { + return Err(format!("File not found: {}", filename)); + } + BenchmarkData::File(filename.clone()) + } else if let Some(ref text) = config.string { + BenchmarkData::InMemory(text.as_bytes().to_vec()) + } else { + // Generate random data with specified size or default (1 MiB) + let size = benchmark_config.size.unwrap_or(1_048_576); + let random_data = generate_random_data(size)?; + BenchmarkData::InMemory(random_data) + }; + + // Create and run benchmark + let runner = BenchmarkRunner::new(algorithm, data, benchmark_config.duration); + let result = runner.run()?; + + // Display results with algorithm name + display_benchmark_results(&result, &config.algorithm); + + Ok(()) +} + +// Format numbers with comma separators for better readability +fn format_number_with_commas(n: u64) -> String { + let s = n.to_string(); + let mut result = String::new(); + let chars: Vec = s.chars().collect(); + + for (i, ch) in chars.iter().enumerate() { + if i > 0 && (chars.len() - i) % 3 == 0 { + result.push(','); + } + result.push(*ch); + } + + result +} + +fn display_benchmark_results(result: &BenchmarkResult, algorithm_name: &str) { + println!("Algorithm: {}", algorithm_name); + println!("Acceleration Target: {}", result.acceleration_target); + + // Format data size with appropriate units + let (size_value, size_unit) = if result.data_size >= 1_048_576 { + (result.data_size as f64 / 1_048_576.0, "MiB") + } else if result.data_size >= 1024 { + (result.data_size as f64 / 1024.0, "KiB") + } else { + (result.data_size 
as f64, "bytes") + }; + + println!( + "Data Size: {} bytes ({:.1} {})", + format_number_with_commas(result.data_size), + size_value, + size_unit + ); + println!("Duration: {:.2} seconds", result.elapsed_seconds); + println!( + "Iterations: {}", + format_number_with_commas(result.iterations) + ); + println!("Throughput: {:.2} GiB/s", result.throughput_gibs); + + // Format time per iteration with appropriate units + let (time_value, time_unit) = if result.time_per_iteration_nanos >= 1_000_000.0 { + (result.time_per_iteration_nanos / 1_000_000.0, "ms") + } else if result.time_per_iteration_nanos >= 1_000.0 { + (result.time_per_iteration_nanos / 1_000.0, "μs") + } else { + (result.time_per_iteration_nanos, "ns") + }; + + println!("Time per iteration: {:.1} {}", time_value, time_unit); +} + fn main() -> ExitCode { match parse_args() { Ok(config) => { @@ -162,3 +461,261 @@ fn main() -> ExitCode { ExitCode::SUCCESS } + +#[cfg(test)] +mod tests { + use super::*; + use std::str::FromStr; + + #[test] + fn test_benchmark_config_validation_valid() { + let config = BenchmarkConfig { + size: Some(1024), + duration: 5.0, + }; + assert!(config.validate().is_ok()); + } + + #[test] + fn test_benchmark_config_validation_zero_duration() { + let config = BenchmarkConfig { + size: Some(1024), + duration: 0.0, + }; + assert!(config.validate().is_err()); + assert_eq!( + config.validate().unwrap_err(), + "Duration must be greater than 0" + ); + } + + #[test] + fn test_benchmark_config_validation_negative_duration() { + let config = BenchmarkConfig { + size: Some(1024), + duration: -1.0, + }; + assert!(config.validate().is_err()); + assert_eq!( + config.validate().unwrap_err(), + "Duration must be greater than 0" + ); + } + + #[test] + fn test_benchmark_config_validation_zero_size() { + let config = BenchmarkConfig { + size: Some(0), + duration: 5.0, + }; + assert!(config.validate().is_err()); + assert_eq!( + config.validate().unwrap_err(), + "Size must be greater than 0" + ); + } + + #[test] + fn test_benchmark_config_validation_none_size() { + let config = BenchmarkConfig { + size: None, + duration: 5.0, + }; + assert!(config.validate().is_ok()); + } + + #[test] + fn test_generate_random_data_valid_size() { + let data = generate_random_data(1024).unwrap(); + assert_eq!(data.len(), 1024); + } + + #[test] + fn test_generate_random_data_large_size() { + let result = generate_random_data(1_073_741_825); // 1 GiB + 1 + assert!(result.is_err()); + assert_eq!(result.unwrap_err(), "Data size too large (maximum 1 GiB)"); + } + + #[test] + fn test_benchmark_result_calculation() { + let result = BenchmarkResult::new( + 1000, // iterations + 2.0, // elapsed_seconds + "test".to_string(), // acceleration_target + 1024, // data_size + ); + + assert_eq!(result.iterations, 1000); + assert_eq!(result.elapsed_seconds, 2.0); + assert_eq!(result.data_size, 1024); + + // Throughput should be (1024 * 1000) / 2.0 / (1024^3) GiB/s + let expected_throughput = (1024.0 * 1000.0) / 2.0 / (1024.0 * 1024.0 * 1024.0); + assert!((result.throughput_gibs - expected_throughput).abs() < 1e-10); + + // Time per iteration should be 2.0 * 1e9 / 1000 nanoseconds + let expected_time_per_iter = 2.0 * 1_000_000_000.0 / 1000.0; + assert!((result.time_per_iteration_nanos - expected_time_per_iter).abs() < 1e-6); + } + + #[test] + fn test_benchmark_runner_creation() { + let algorithm = CrcAlgorithm::from_str("CRC-32/ISCSI").unwrap(); + let data = BenchmarkData::InMemory(vec![1, 2, 3, 4]); + let runner = BenchmarkRunner::new(algorithm, data, 1.0); + + 
assert_eq!(runner.duration, 1.0); + assert_eq!(runner.algorithm, algorithm); + } + + #[test] + fn test_benchmark_runner_execution_in_memory() { + let algorithm = CrcAlgorithm::from_str("CRC-32/ISCSI").unwrap(); + let data = BenchmarkData::InMemory(vec![1, 2, 3, 4]); + let runner = BenchmarkRunner::new(algorithm, data, 0.1); // Short duration for test + + let result = runner.run().unwrap(); + assert!(result.iterations > 0); + assert!(result.elapsed_seconds > 0.0); + assert_eq!(result.data_size, 4); + assert!(result.throughput_gibs >= 0.0); + } + + #[test] + fn test_parse_args_benchmark_mode() { + // We can't easily mock std::env::args in unit tests, so we'll test the parsing logic + // by creating a Config directly and validating its structure + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: None, + string: None, + format: OutputFormat::Hex, + benchmark: Some(BenchmarkConfig { + size: Some(1024), + duration: 5.0, + }), + }; + + assert!(config.benchmark.is_some()); + let benchmark_config = config.benchmark.unwrap(); + assert_eq!(benchmark_config.size, Some(1024)); + assert_eq!(benchmark_config.duration, 5.0); + } + + #[test] + fn test_parse_args_normal_mode() { + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: Some("test.txt".to_string()), + string: None, + format: OutputFormat::Hex, + benchmark: None, + }; + + assert!(config.benchmark.is_none()); + assert_eq!(config.file, Some("test.txt".to_string())); + } + + #[test] + fn test_data_source_selection_file() { + // Test that file input creates File variant + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: Some("test.txt".to_string()), + string: None, + format: OutputFormat::Hex, + benchmark: Some(BenchmarkConfig { + size: None, + duration: 1.0, + }), + }; + + // This would be tested in the run_benchmark function + // We can verify the config structure is correct for file input + assert!(config.file.is_some()); + assert!(config.string.is_none()); + } + + #[test] + fn test_data_source_selection_string() { + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: None, + string: Some("test data".to_string()), + format: OutputFormat::Hex, + benchmark: Some(BenchmarkConfig { + size: None, + duration: 1.0, + }), + }; + + assert!(config.file.is_none()); + assert!(config.string.is_some()); + } + + #[test] + fn test_data_source_selection_generated() { + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: None, + string: None, + format: OutputFormat::Hex, + benchmark: Some(BenchmarkConfig { + size: Some(1024), + duration: 1.0, + }), + }; + + // When neither file nor string is provided, generated data should be used + assert!(config.file.is_none()); + assert!(config.string.is_none()); + assert_eq!(config.benchmark.as_ref().unwrap().size, Some(1024)); + } + + #[test] + fn test_throughput_calculation_accuracy() { + // Test with known values to verify calculation accuracy + let data_size = 1_048_576u64; // 1 MiB + let iterations = 1000u64; + let elapsed_seconds = 1.0; + + let result = + BenchmarkResult::new(iterations, elapsed_seconds, "test".to_string(), data_size); + + // Expected throughput: (1 MiB * 1000) / 1 second = 1000 MiB/s = ~0.9766 GiB/s + let expected_gibs = + (data_size as f64 * iterations as f64) / elapsed_seconds / (1024.0 * 1024.0 * 1024.0); + assert!((result.throughput_gibs - expected_gibs).abs() < 1e-10); + } + + #[test] + fn test_output_format_variants() { + let hex_format = OutputFormat::Hex; + let decimal_format =
OutputFormat::Decimal; + + // Test that both variants can be created and are different + match hex_format { + OutputFormat::Hex => assert!(true), + OutputFormat::Decimal => assert!(false), + } + + match decimal_format { + OutputFormat::Decimal => assert!(true), + OutputFormat::Hex => assert!(false), + } + } + + #[test] + fn test_format_number_with_commas() { + assert_eq!(format_number_with_commas(0), "0"); + assert_eq!(format_number_with_commas(123), "123"); + assert_eq!(format_number_with_commas(1234), "1,234"); + assert_eq!(format_number_with_commas(12345), "12,345"); + assert_eq!(format_number_with_commas(123456), "123,456"); + assert_eq!(format_number_with_commas(1234567), "1,234,567"); + assert_eq!(format_number_with_commas(12345678), "12,345,678"); + assert_eq!(format_number_with_commas(123456789), "123,456,789"); + assert_eq!(format_number_with_commas(1000000000), "1,000,000,000"); + } +} diff --git a/src/bin/get-custom-params.rs b/src/bin/get-custom-params.rs index ac5df47..d730788 100644 --- a/src/bin/get-custom-params.rs +++ b/src/bin/get-custom-params.rs @@ -1,3 +1,5 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + //! This is a simple program to get custom CRC parameters from the command line. use std::env; diff --git a/src/crc32/fusion/aarch64.rs b/src/crc32/fusion/aarch64.rs deleted file mode 100644 index c9f0207..0000000 --- a/src/crc32/fusion/aarch64.rs +++ /dev/null @@ -1,1073 +0,0 @@ -//! Provides CRC-32/ISCSI and CRC-32/ISO-HDLC calculations using a fusion of native CLMUL -//! instructions and native CRC calculation instructions on aarch64. -//! -//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ -//! -//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -//! with the help of Claude.ai using: -//! -//! ./generate -i neon -p crc32c -a v12e_v1 -//! ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 -//! ./generate -i neon -p crc32 -a v12e_v1 -//! -//! Modified as necessary for this Rust implementation. -//! -//! MIT licensed. 
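The module removed below hard-wired its implementation choice at compile time: `#[cfg(target_feature = "sha3")]` decided whether the EOR3 kernels existed at all, so a binary built without `-C target-feature=+sha3` could never reach them, even on a CPU that supports SHA3. A minimal sketch of the runtime dispatch that replaces this pattern, assuming illustrative names (`crc32_iscsi_dispatch` and the `crc32_iscsi_software` fallback are hypothetical, not the crate's actual API):

#[cfg(target_arch = "aarch64")]
fn crc32_iscsi_dispatch(crc: u32, data: &[u8]) -> u32 {
    const LARGE_BUFFER_THRESHOLD: usize = 1024;
    // Queried on the running CPU, independent of compile-time target features.
    let has_crc_pmull = std::arch::is_aarch64_feature_detected!("crc")
        && std::arch::is_aarch64_feature_detected!("aes");
    let has_eor3 = std::arch::is_aarch64_feature_detected!("sha3");

    if has_crc_pmull && has_eor3 && data.len() > LARGE_BUFFER_THRESHOLD {
        // SAFETY: crc, aes, and sha3 support was verified at runtime above.
        unsafe { crc32_iscsi_eor3_v9s3x2e_s3(crc, data.as_ptr(), data.len()) }
    } else if has_crc_pmull {
        // SAFETY: crc and aes support was verified at runtime above.
        unsafe { crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) }
    } else {
        // Hypothetical table-based fallback for CPUs without CRC/PMULL.
        crc32_iscsi_software(crc, data)
    }
}

One binary can then degrade gracefully across CPU families instead of failing, or silently leaving performance on the table, when build-time and run-time features disagree.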
- -#![cfg(target_arch = "aarch64")] - -use std::arch::aarch64::*; - -/// Safe wrapper for CRC32 iSCSI calculation -#[inline] -#[cfg(target_feature = "sha3")] -pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { - unsafe { - const LARGE_BUFFER_THRESHOLD: usize = 1024; - - // Select implementation based on buffer size - if data.len() <= LARGE_BUFFER_THRESHOLD { - crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) - } else { - crc32_iscsi_eor3_v9s3x2e_s3(crc, data.as_ptr(), data.len()) - } - } -} - -#[inline] -#[cfg(not(target_feature = "sha3"))] -pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { - unsafe { crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) } -} - -/// Safe wrapper for CRC32 ISO-HDLC calculation -#[inline] -#[cfg(target_feature = "sha3")] -pub fn crc32_iso_hdlc(crc: u32, data: &[u8]) -> u32 { - unsafe { - const LARGE_BUFFER_THRESHOLD: usize = 1024; - - // Select implementation based on buffer size - if data.len() <= LARGE_BUFFER_THRESHOLD { - crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) - } else { - crc32_iso_hdlc_eor3_v9s3x2e_s3(crc, data.as_ptr(), data.len()) - } - } -} - -#[inline] -#[cfg(not(target_feature = "sha3"))] -pub fn crc32_iso_hdlc(crc: u32, data: &[u8]) -> u32 { - unsafe { crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) } -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn clmul_lo_eor3(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { - // Polynomial multiply low parts - convert u128 result to uint64x2_t - let result = vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)); - vreinterpretq_u64_p128(result) -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn clmul_hi_eor3(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { - // Polynomial multiply high parts - convert u128 result to uint64x2_t - let result = vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)); - vreinterpretq_u64_p128(result) -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn clmul_scalar(a: u32, b: u32) -> uint64x2_t { - // Polynomial multiply scalars - convert u128 result to uint64x2_t - let result = vmull_p64(a as u64, b as u64); - vreinterpretq_u64_p128(result) -} - -// x^n mod P, in log(n) time -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn xnmodp_crc32_iscsi(mut n: u64) -> u32 { - let mut stack = !1u64; - let mut acc: u32; - let mut low: u32; - - while n > 191 { - stack = (stack << 1) + (n & 1); - n = (n >> 1) - 16; - } - stack = !stack; - acc = 0x80000000u32 >> (n & 31); - n >>= 5; - - while n > 0 { - // ARM CRC32 instruction - acc = unsafe { __crc32cw(acc, 0) }; - n -= 1; - } - - while { - low = (stack & 1) as u32; - stack >>= 1; - stack != 0 - } { - unsafe { - // Convert to polynomial type and square it - let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); - let squared = vmull_p8(x, x); - let y = vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); - acc = __crc32cd(0, y << low); - } - } - acc -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn crc_shift_iscsi(crc: u32, nbytes: usize) -> uint64x2_t { - clmul_scalar(crc, xnmodp_crc32_iscsi((nbytes * 8 - 33) as u64)) -} - -// x^n mod P, in log(n) time -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn xnmodp_iso_hdlc(mut n: u64) -> u32 { - let mut stack = !1u64; - let mut acc: u32; - let mut low: u32; - - while n > 191 { - stack = (stack << 1) + (n & 1); - n = (n >> 1) - 16; - } 
- stack = !stack; - acc = 0x80000000u32 >> (n & 31); - n >>= 5; - - while n > 0 { - // ARM CRC32 instruction (ISO-HDLC uses standard CRC32, not CRC32C) - acc = unsafe { __crc32w(acc, 0) }; - n -= 1; - } - - while { - low = (stack & 1) as u32; - stack >>= 1; - stack != 0 - } { - unsafe { - // Convert to polynomial type and square it - let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); - let squared = vmull_p8(x, x); - let y = vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); - acc = __crc32d(0, y << low); - } - } - acc -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn crc_shift_iso_hdlc(crc: u32, nbytes: usize) -> uint64x2_t { - clmul_scalar(crc, xnmodp_iso_hdlc((nbytes * 8 - 33) as u64)) -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i neon_eor3 -p crc32c -a v9s3x2e_s3 -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes,sha3")] -unsafe fn crc32_iscsi_eor3_v9s3x2e_s3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = __crc32cb(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 192 { - let end = buf.add(len); - let blk = len / 192; - let klen = blk * 16; - let buf2 = buf.add(klen * 3); - let limit = buf.add(klen).sub(32); - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // First vector chunk - let mut x0 = vld1q_u64(buf2 as *const u64); - let mut x1 = vld1q_u64(buf2.add(16) as *const u64); - let mut x2 = vld1q_u64(buf2.add(32) as *const u64); - let mut x3 = vld1q_u64(buf2.add(48) as *const u64); - let mut x4 = vld1q_u64(buf2.add(64) as *const u64); - let mut x5 = vld1q_u64(buf2.add(80) as *const u64); - let mut x6 = vld1q_u64(buf2.add(96) as *const u64); - let mut x7 = vld1q_u64(buf2.add(112) as *const u64); - let mut x8 = vld1q_u64(buf2.add(128) as *const u64); - - let k_vals: [u64; 2] = [0x7e908048, 0xc96cfdc0]; - let mut k = vld1q_u64(k_vals.as_ptr()); - let mut buf2 = buf2.add(144); - - // Main loop - while buf <= limit { - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y1 = clmul_lo_eor3(x1, k); - x1 = clmul_hi_eor3(x1, k); - let y2 = clmul_lo_eor3(x2, k); - x2 = clmul_hi_eor3(x2, k); - let y3 = clmul_lo_eor3(x3, k); - x3 = clmul_hi_eor3(x3, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - let y5 = clmul_lo_eor3(x5, k); - x5 = clmul_hi_eor3(x5, k); - let y6 = clmul_lo_eor3(x6, k); - x6 = clmul_hi_eor3(x6, k); - let y7 = clmul_lo_eor3(x7, k); - x7 = clmul_hi_eor3(x7, k); - let y8 = clmul_lo_eor3(x8, k); - x8 = clmul_hi_eor3(x8, k); - - x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); - x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); - x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); - x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); - x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); - x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); - x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); - x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); - x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); - - crc0 = __crc32cd(crc0, *(buf as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); - crc2 = 
__crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - - buf = buf.add(16); - buf2 = buf2.add(144); - } - - // Reduce x0 ... x8 to just x0 - let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x1); - x1 = x2; - x2 = x3; - x3 = x4; - x4 = x5; - x5 = x6; - x6 = x7; - x7 = x8; - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y2 = clmul_lo_eor3(x2, k); - x2 = clmul_hi_eor3(x2, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - let y6 = clmul_lo_eor3(x6, k); - x6 = clmul_hi_eor3(x6, k); - - x0 = veor3q_u64(x0, y0, x1); - x2 = veor3q_u64(x2, y2, x3); - x4 = veor3q_u64(x4, y4, x5); - x6 = veor3q_u64(x6, y6, x7); - - let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - - x0 = veor3q_u64(x0, y0, x2); - x4 = veor3q_u64(x4, y4, x6); - - let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x4); - - // Final scalar chunk - crc0 = __crc32cd(crc0, *(buf as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - - let vc0 = crc_shift_iscsi(crc0, klen * 2 + blk * 144); - let vc1 = crc_shift_iscsi(crc1, klen + blk * 144); - let vc2 = crc_shift_iscsi(crc2, blk * 144); - let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1)); - - buf = buf2; - len = end.offset_from(buf) as usize; - } - - if len >= 32 { - let klen = ((len - 8) / 24) * 8; - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // Main loop - loop { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); - buf = buf.add(8); - len -= 24; - if len < 32 { - break; - } - } - - let vc0 = crc_shift_iscsi(crc0, klen * 2 + 8); - let vc1 = crc_shift_iscsi(crc1, klen + 8); - let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); - - // Final 8 bytes - buf = buf.add(klen * 2); - crc0 = crc2; - crc0 = __crc32cd(crc0, *(buf as *const u64) ^ vc); - buf = buf.add(8); - len -= 8; - } - - while len >= 8 { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - while len > 0 { - crc0 = __crc32cb(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -#[inline] -#[target_feature(enable = "aes")] -unsafe fn clmul_lo_e(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { - // Polynomial multiply low parts and XOR with c - let mul_result = vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)); - let mul_vec = vreinterpretq_u64_p128(mul_result); - veorq_u64(mul_vec, c) -} - -#[inline] -#[target_feature(enable = "aes")] -unsafe fn clmul_hi_e(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { - // 
Polynomial multiply high parts and XOR with c - let mul_result = vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)); - let mul_vec = vreinterpretq_u64_p128(mul_result); - veorq_u64(mul_vec, c) -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i neon -p crc32c -a v12e_v1 -#[inline] -#[target_feature(enable = "aes")] -unsafe fn crc32_iscsi_v12e_v1(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = __crc32cb(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 192 { - let end = buf.add(len); - let limit = buf.add(len - 192); - - // First vector chunk - let mut x0 = vld1q_u64(buf as *const u64); - let mut x1 = vld1q_u64(buf.add(16) as *const u64); - let mut x2 = vld1q_u64(buf.add(32) as *const u64); - let mut x3 = vld1q_u64(buf.add(48) as *const u64); - let mut x4 = vld1q_u64(buf.add(64) as *const u64); - let mut x5 = vld1q_u64(buf.add(80) as *const u64); - let mut x6 = vld1q_u64(buf.add(96) as *const u64); - let mut x7 = vld1q_u64(buf.add(112) as *const u64); - let mut x8 = vld1q_u64(buf.add(128) as *const u64); - let mut x9 = vld1q_u64(buf.add(144) as *const u64); - let mut x10 = vld1q_u64(buf.add(160) as *const u64); - let mut x11 = vld1q_u64(buf.add(176) as *const u64); - - let k_vals: [u64; 2] = [0xa87ab8a8, 0xab7aff2a]; - let mut k = vld1q_u64(k_vals.as_ptr()); - - // Create CRC vector and XOR with first vector - let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); - x0 = veorq_u64(crc_vec, x0); - buf = buf.add(192); - - // Main loop - while buf <= limit { - let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); - x0 = clmul_hi_e(x0, k, y0); - let y1 = clmul_lo_e(x1, k, vld1q_u64(buf.add(16) as *const u64)); - x1 = clmul_hi_e(x1, k, y1); - let y2 = clmul_lo_e(x2, k, vld1q_u64(buf.add(32) as *const u64)); - x2 = clmul_hi_e(x2, k, y2); - let y3 = clmul_lo_e(x3, k, vld1q_u64(buf.add(48) as *const u64)); - x3 = clmul_hi_e(x3, k, y3); - let y4 = clmul_lo_e(x4, k, vld1q_u64(buf.add(64) as *const u64)); - x4 = clmul_hi_e(x4, k, y4); - let y5 = clmul_lo_e(x5, k, vld1q_u64(buf.add(80) as *const u64)); - x5 = clmul_hi_e(x5, k, y5); - let y6 = clmul_lo_e(x6, k, vld1q_u64(buf.add(96) as *const u64)); - x6 = clmul_hi_e(x6, k, y6); - let y7 = clmul_lo_e(x7, k, vld1q_u64(buf.add(112) as *const u64)); - x7 = clmul_hi_e(x7, k, y7); - let y8 = clmul_lo_e(x8, k, vld1q_u64(buf.add(128) as *const u64)); - x8 = clmul_hi_e(x8, k, y8); - let y9 = clmul_lo_e(x9, k, vld1q_u64(buf.add(144) as *const u64)); - x9 = clmul_hi_e(x9, k, y9); - let y10 = clmul_lo_e(x10, k, vld1q_u64(buf.add(160) as *const u64)); - x10 = clmul_hi_e(x10, k, y10); - let y11 = clmul_lo_e(x11, k, vld1q_u64(buf.add(176) as *const u64)); - x11 = clmul_hi_e(x11, k, y11); - buf = buf.add(192); - } - - // Reduce x0 ... 
x11 to just x0 - let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x1); - x0 = clmul_hi_e(x0, k, y0); - let y2 = clmul_lo_e(x2, k, x3); - x2 = clmul_hi_e(x2, k, y2); - let y4 = clmul_lo_e(x4, k, x5); - x4 = clmul_hi_e(x4, k, y4); - let y6 = clmul_lo_e(x6, k, x7); - x6 = clmul_hi_e(x6, k, y6); - let y8 = clmul_lo_e(x8, k, x9); - x8 = clmul_hi_e(x8, k, y8); - let y10 = clmul_lo_e(x10, k, x11); - x10 = clmul_hi_e(x10, k, y10); - - let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x2); - x0 = clmul_hi_e(x0, k, y0); - let y4 = clmul_lo_e(x4, k, x6); - x4 = clmul_hi_e(x4, k, y4); - let y8 = clmul_lo_e(x8, k, x10); - x8 = clmul_hi_e(x8, k, y8); - - let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x4); - x0 = clmul_hi_e(x0, k, y0); - x4 = x8; - let y0 = clmul_lo_e(x0, k, x4); - x0 = clmul_hi_e(x0, k, y0); - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - len = end.offset_from(buf) as usize; - } - - if len >= 16 { - // First vector chunk - let mut x0 = vld1q_u64(buf as *const u64); - - let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; - let k = vld1q_u64(k_vals.as_ptr()); - - // Create CRC vector and XOR with first vector - let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); - x0 = veorq_u64(crc_vec, x0); - buf = buf.add(16); - len -= 16; - - // Main loop - while len >= 16 { - let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); - x0 = clmul_hi_e(x0, k, y0); - buf = buf.add(16); - len -= 16; - } - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - } - - while len >= 8 { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - while len > 0 { - crc0 = __crc32cb(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes,sha3")] -unsafe fn crc32_iso_hdlc_eor3_v9s3x2e_s3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = __crc32b(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = __crc32d(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 192 { - let end = buf.add(len); - let blk = len / 192; - let klen = blk * 16; - let buf2 = buf.add(klen * 3); - let limit = buf.add(klen).sub(32); - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // First vector chunk - let mut x0 = vld1q_u64(buf2 as *const u64); - let mut x1 = vld1q_u64(buf2.add(16) as *const u64); - let mut x2 = vld1q_u64(buf2.add(32) as *const u64); - let mut x3 = vld1q_u64(buf2.add(48) as *const u64); - let mut x4 = vld1q_u64(buf2.add(64) as *const u64); - let mut x5 = vld1q_u64(buf2.add(80) as *const u64); - let mut x6 = vld1q_u64(buf2.add(96) as *const u64); - let mut x7 = vld1q_u64(buf2.add(112) as *const u64); - let mut x8 = vld1q_u64(buf2.add(128) as *const u64); - - // ISO-HDLC specific constants - let k_vals: [u64; 2] = [0x26b70c3d, 0x3f41287a]; - let mut k = 
vld1q_u64(k_vals.as_ptr()); - let mut buf2 = buf2.add(144); - - // Main loop - while buf <= limit { - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y1 = clmul_lo_eor3(x1, k); - x1 = clmul_hi_eor3(x1, k); - let y2 = clmul_lo_eor3(x2, k); - x2 = clmul_hi_eor3(x2, k); - let y3 = clmul_lo_eor3(x3, k); - x3 = clmul_hi_eor3(x3, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - let y5 = clmul_lo_eor3(x5, k); - x5 = clmul_hi_eor3(x5, k); - let y6 = clmul_lo_eor3(x6, k); - x6 = clmul_hi_eor3(x6, k); - let y7 = clmul_lo_eor3(x7, k); - x7 = clmul_hi_eor3(x7, k); - let y8 = clmul_lo_eor3(x8, k); - x8 = clmul_hi_eor3(x8, k); - - x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); - x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); - x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); - x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); - x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); - x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); - x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); - x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); - x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); - - crc0 = __crc32d(crc0, *(buf as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - - buf = buf.add(16); - buf2 = buf2.add(144); - } - - // Reduce x0 ... x8 to just x0 - let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x1); - x1 = x2; - x2 = x3; - x3 = x4; - x4 = x5; - x5 = x6; - x6 = x7; - x7 = x8; - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y2 = clmul_lo_eor3(x2, k); - x2 = clmul_hi_eor3(x2, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - let y6 = clmul_lo_eor3(x6, k); - x6 = clmul_hi_eor3(x6, k); - - x0 = veor3q_u64(x0, y0, x1); - x2 = veor3q_u64(x2, y2, x3); - x4 = veor3q_u64(x4, y4, x5); - x6 = veor3q_u64(x6, y6, x7); - - let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - - x0 = veor3q_u64(x0, y0, x2); - x4 = veor3q_u64(x4, y4, x6); - - let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x4); - - // Final scalar chunk - crc0 = __crc32d(crc0, *(buf as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - - let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + blk * 144); - let vc1 = crc_shift_iso_hdlc(crc1, klen + blk * 144); - let vc2 = crc_shift_iso_hdlc(crc2, blk * 144); - let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); - - buf = buf2; - len = 
end.offset_from(buf) as usize; - } - - if len >= 32 { - let klen = ((len - 8) / 24) * 8; - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // Main loop - loop { - crc0 = __crc32d(crc0, *(buf as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); - buf = buf.add(8); - len -= 24; - if len < 32 { - break; - } - } - - let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + 8); - let vc1 = crc_shift_iso_hdlc(crc1, klen + 8); - let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); - - // Final 8 bytes - buf = buf.add(klen * 2); - crc0 = crc2; - crc0 = __crc32d(crc0, *(buf as *const u64) ^ vc); - buf = buf.add(8); - len -= 8; - } - - while len >= 8 { - crc0 = __crc32d(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - while len > 0 { - crc0 = __crc32b(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i neon -p crc32 -a v12e_v1 -#[inline] -#[target_feature(enable = "aes")] -unsafe fn crc32_iso_hdlc_v12e_v1(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = __crc32b(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = __crc32d(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 192 { - let end = buf.add(len); - let limit = buf.add(len - 192); - - // First vector chunk - let mut x0 = vld1q_u64(buf as *const u64); - let mut x1 = vld1q_u64(buf.add(16) as *const u64); - let mut x2 = vld1q_u64(buf.add(32) as *const u64); - let mut x3 = vld1q_u64(buf.add(48) as *const u64); - let mut x4 = vld1q_u64(buf.add(64) as *const u64); - let mut x5 = vld1q_u64(buf.add(80) as *const u64); - let mut x6 = vld1q_u64(buf.add(96) as *const u64); - let mut x7 = vld1q_u64(buf.add(112) as *const u64); - let mut x8 = vld1q_u64(buf.add(128) as *const u64); - let mut x9 = vld1q_u64(buf.add(144) as *const u64); - let mut x10 = vld1q_u64(buf.add(160) as *const u64); - let mut x11 = vld1q_u64(buf.add(176) as *const u64); - - // ISO-HDLC specific constants for small implementation - let k_vals: [u64; 2] = [0x596c8d81, 0xf5e48c85]; - let mut k = vld1q_u64(k_vals.as_ptr()); - - // Create CRC vector and XOR with first vector - let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); - x0 = veorq_u64(crc_vec, x0); - buf = buf.add(192); - - // Main loop - while buf <= limit { - let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); - x0 = clmul_hi_e(x0, k, y0); - let y1 = clmul_lo_e(x1, k, vld1q_u64(buf.add(16) as *const u64)); - x1 = clmul_hi_e(x1, k, y1); - let y2 = clmul_lo_e(x2, k, vld1q_u64(buf.add(32) as *const u64)); - x2 = clmul_hi_e(x2, k, y2); - let y3 = clmul_lo_e(x3, k, vld1q_u64(buf.add(48) as *const u64)); - x3 = clmul_hi_e(x3, k, y3); - let y4 = clmul_lo_e(x4, k, vld1q_u64(buf.add(64) as *const u64)); - x4 = clmul_hi_e(x4, k, y4); - let y5 = clmul_lo_e(x5, k, vld1q_u64(buf.add(80) as *const u64)); - x5 = clmul_hi_e(x5, k, y5); - let y6 = clmul_lo_e(x6, k, vld1q_u64(buf.add(96) as *const u64)); - x6 = clmul_hi_e(x6, k, y6); - let y7 = clmul_lo_e(x7, k, vld1q_u64(buf.add(112) as *const u64)); - x7 = clmul_hi_e(x7, k, y7); - let y8 = clmul_lo_e(x8, k, vld1q_u64(buf.add(128) as *const u64)); - x8 = clmul_hi_e(x8, k, y8); - let y9 = clmul_lo_e(x9, k, vld1q_u64(buf.add(144) as *const u64)); - x9 = 
clmul_hi_e(x9, k, y9); - let y10 = clmul_lo_e(x10, k, vld1q_u64(buf.add(160) as *const u64)); - x10 = clmul_hi_e(x10, k, y10); - let y11 = clmul_lo_e(x11, k, vld1q_u64(buf.add(176) as *const u64)); - x11 = clmul_hi_e(x11, k, y11); - buf = buf.add(192); - } - - // Reduce x0 ... x11 to just x0 - let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x1); - x0 = clmul_hi_e(x0, k, y0); - let y2 = clmul_lo_e(x2, k, x3); - x2 = clmul_hi_e(x2, k, y2); - let y4 = clmul_lo_e(x4, k, x5); - x4 = clmul_hi_e(x4, k, y4); - let y6 = clmul_lo_e(x6, k, x7); - x6 = clmul_hi_e(x6, k, y6); - let y8 = clmul_lo_e(x8, k, x9); - x8 = clmul_hi_e(x8, k, y8); - let y10 = clmul_lo_e(x10, k, x11); - x10 = clmul_hi_e(x10, k, y10); - - let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x2); - x0 = clmul_hi_e(x0, k, y0); - let y4 = clmul_lo_e(x4, k, x6); - x4 = clmul_hi_e(x4, k, y4); - let y8 = clmul_lo_e(x8, k, x10); - x8 = clmul_hi_e(x8, k, y8); - - let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x4); - x0 = clmul_hi_e(x0, k, y0); - x4 = x8; - let y0 = clmul_lo_e(x0, k, x4); - x0 = clmul_hi_e(x0, k, y0); - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - len = end.offset_from(buf) as usize; - } - - if len >= 16 { - // First vector chunk - let mut x0 = vld1q_u64(buf as *const u64); - - let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; - let k = vld1q_u64(k_vals.as_ptr()); - - // Create CRC vector and XOR with first vector - let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); - x0 = veorq_u64(crc_vec, x0); - buf = buf.add(16); - len -= 16; - - // Main loop - while len >= 16 { - let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); - x0 = clmul_hi_e(x0, k, y0); - buf = buf.add(16); - len -= 16; - } - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - } - - while len >= 8 { - crc0 = __crc32d(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - while len > 0 { - crc0 = __crc32b(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::test::consts::TEST_CHECK_STRING; - use crc::{Crc, Table}; - use rand::{rng, Rng}; - - const RUST_CRC32_ISO_HDLC: Crc<u32, Table<16>> = - Crc::<u32, Table<16>>::new(&crc::CRC_32_ISO_HDLC); - - const RUST_CRC32_ISCSI: Crc<u32, Table<16>> = Crc::<u32, Table<16>>::new(&crc::CRC_32_ISCSI); - - #[test] - fn test_crc32_iso_hdlc_check() { - assert_eq!( - crc32_iso_hdlc(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, - 0xcbf43926 - ); - } - - #[test] - fn test_crc32_iso_hdlc_small_all_lengths() { - for len in 1..=255 { - crc32_iso_hdlc_random(len) - } - } - - #[test] - fn test_crc32_iso_hdlc_medium_lengths() { - // Test each length from 256 to 1024, which should fold and include handling remainders - for len in 256..=1024 { - crc32_iso_hdlc_random(len) - } - } - - #[test] - fn test_crc32_iso_hdlc_large_lengths() { - // Test 1 MiB just before, at, and just after the folding boundaries - for len in 1048575..1048577 { - crc32_iso_hdlc_random(len) - } - } - - #[test] - fn test_crc32_iscsi_check() { - assert_eq!( - crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, - 0xe3069283 - ); - } - - #[test] - fn test_crc32_iscsi_small_all_lengths() { - for len in 1..=255 { - crc32_iscsi_random(len); - } -
} - - #[test] - fn test_crc32_iscsi_medium_lengths() { - // Test each length from 256 to 1024, which should fold and include handling remainders - for len in 256..=1024 { - crc32_iscsi_random(len); - } - } - - #[test] - fn test_crc32_iscsi_large_lengths() { - // Test 1 MiB just before, at, and just after the folding boundaries - for len in 1048575..1048577 { - crc32_iscsi_random(len); - } - } - - #[cfg(target_feature = "sha3")] - fn crc32_iso_hdlc_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISO_HDLC.checksum(&data); - - assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iso_hdlc_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - - assert_eq!( - crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } - - #[cfg(not(target_feature = "sha3"))] - fn crc32_iso_hdlc_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISO_HDLC.checksum(&data); - - assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } - - #[cfg(target_feature = "sha3")] - fn crc32_iscsi_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISCSI.checksum(&data); - - assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iscsi_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - - assert_eq!( - crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } - - #[cfg(not(target_feature = "sha3"))] - fn crc32_iscsi_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISCSI.checksum(&data); - - assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } -} diff --git a/src/crc32/fusion/aarch64/iscsi/crc_pmull.rs b/src/crc32/fusion/aarch64/iscsi/crc_pmull.rs new file mode 100644 index 0000000..1bb2473 --- /dev/null +++ b/src/crc32/fusion/aarch64/iscsi/crc_pmull.rs @@ -0,0 +1,179 @@ +//! Provides CRC-32/ISCSI calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "aarch64")] + +use crate::crc32::fusion::aarch64::{clmul_hi_and_xor, clmul_lo_and_xor}; +use std::arch::aarch64::{ + __crc32cb, __crc32cd, veorq_u64, vgetq_lane_u64, vld1q_u64, vmovq_n_u64, vsetq_lane_u64, +}; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i neon -p crc32c -a v12e_v1 +/// +/// Modified as necessary for this Rust implementation. 
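Unlike the deleted module, the kernel below carries `#[target_feature(enable = "crc,aes")]` with no compile-time `cfg` gate, so it is always compiled and the caller must prove at runtime that the CPU actually has those features before invoking it. A hedged call-site sketch, assuming an illustrative wrapper name (`crc32_iscsi_pmull_checked` is not the crate's actual entry point):

#[cfg(target_arch = "aarch64")]
pub fn crc32_iscsi_pmull_checked(crc: u32, data: &[u8]) -> Option<u32> {
    if std::arch::is_aarch64_feature_detected!("crc")
        && std::arch::is_aarch64_feature_detected!("aes")
    {
        // SAFETY: the required CPU features were just verified at runtime.
        Some(unsafe { crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) })
    } else {
        None
    }
}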
+#[inline] +#[target_feature(enable = "crc,aes")] +pub(crate) unsafe fn crc32_iscsi_v12e_v1(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let limit = buf.add(len - 192); + + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + let mut x1 = vld1q_u64(buf.add(16) as *const u64); + let mut x2 = vld1q_u64(buf.add(32) as *const u64); + let mut x3 = vld1q_u64(buf.add(48) as *const u64); + let mut x4 = vld1q_u64(buf.add(64) as *const u64); + let mut x5 = vld1q_u64(buf.add(80) as *const u64); + let mut x6 = vld1q_u64(buf.add(96) as *const u64); + let mut x7 = vld1q_u64(buf.add(112) as *const u64); + let mut x8 = vld1q_u64(buf.add(128) as *const u64); + let mut x9 = vld1q_u64(buf.add(144) as *const u64); + let mut x10 = vld1q_u64(buf.add(160) as *const u64); + let mut x11 = vld1q_u64(buf.add(176) as *const u64); + + let k_vals: [u64; 2] = [0xa87ab8a8, 0xab7aff2a]; + let mut k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(192); + + // Main loop + while buf <= limit { + let y0 = clmul_lo_and_xor(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_and_xor(x0, k, y0); + let y1 = clmul_lo_and_xor(x1, k, vld1q_u64(buf.add(16) as *const u64)); + x1 = clmul_hi_and_xor(x1, k, y1); + let y2 = clmul_lo_and_xor(x2, k, vld1q_u64(buf.add(32) as *const u64)); + x2 = clmul_hi_and_xor(x2, k, y2); + let y3 = clmul_lo_and_xor(x3, k, vld1q_u64(buf.add(48) as *const u64)); + x3 = clmul_hi_and_xor(x3, k, y3); + let y4 = clmul_lo_and_xor(x4, k, vld1q_u64(buf.add(64) as *const u64)); + x4 = clmul_hi_and_xor(x4, k, y4); + let y5 = clmul_lo_and_xor(x5, k, vld1q_u64(buf.add(80) as *const u64)); + x5 = clmul_hi_and_xor(x5, k, y5); + let y6 = clmul_lo_and_xor(x6, k, vld1q_u64(buf.add(96) as *const u64)); + x6 = clmul_hi_and_xor(x6, k, y6); + let y7 = clmul_lo_and_xor(x7, k, vld1q_u64(buf.add(112) as *const u64)); + x7 = clmul_hi_and_xor(x7, k, y7); + let y8 = clmul_lo_and_xor(x8, k, vld1q_u64(buf.add(128) as *const u64)); + x8 = clmul_hi_and_xor(x8, k, y8); + let y9 = clmul_lo_and_xor(x9, k, vld1q_u64(buf.add(144) as *const u64)); + x9 = clmul_hi_and_xor(x9, k, y9); + let y10 = clmul_lo_and_xor(x10, k, vld1q_u64(buf.add(160) as *const u64)); + x10 = clmul_hi_and_xor(x10, k, y10); + let y11 = clmul_lo_and_xor(x11, k, vld1q_u64(buf.add(176) as *const u64)); + x11 = clmul_hi_and_xor(x11, k, y11); + buf = buf.add(192); + } + + // Reduce x0 ... 
x11 to just x0 + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x1); + x0 = clmul_hi_and_xor(x0, k, y0); + let y2 = clmul_lo_and_xor(x2, k, x3); + x2 = clmul_hi_and_xor(x2, k, y2); + let y4 = clmul_lo_and_xor(x4, k, x5); + x4 = clmul_hi_and_xor(x4, k, y4); + let y6 = clmul_lo_and_xor(x6, k, x7); + x6 = clmul_hi_and_xor(x6, k, y6); + let y8 = clmul_lo_and_xor(x8, k, x9); + x8 = clmul_hi_and_xor(x8, k, y8); + let y10 = clmul_lo_and_xor(x10, k, x11); + x10 = clmul_hi_and_xor(x10, k, y10); + + let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x2); + x0 = clmul_hi_and_xor(x0, k, y0); + let y4 = clmul_lo_and_xor(x4, k, x6); + x4 = clmul_hi_and_xor(x4, k, y4); + let y8 = clmul_lo_and_xor(x8, k, x10); + x8 = clmul_hi_and_xor(x8, k, y8); + + let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x4); + x0 = clmul_hi_and_xor(x0, k, y0); + x4 = x8; + let y0 = clmul_lo_and_xor(x0, k, x4); + x0 = clmul_hi_and_xor(x0, k, y0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); + len = end.offset_from(buf) as usize; + } + + if len >= 16 { + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + let k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(16); + len -= 16; + + // Main loop + while len >= 16 { + let y0 = clmul_lo_and_xor(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_and_xor(x0, k, y0); + buf = buf.add(16); + len -= 16; + } + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); + } + + while len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/aarch64/iscsi/crc_pmull_sha3.rs b/src/crc32/fusion/aarch64/iscsi/crc_pmull_sha3.rs new file mode 100644 index 0000000..137cb5c --- /dev/null +++ b/src/crc32/fusion/aarch64/iscsi/crc_pmull_sha3.rs @@ -0,0 +1,266 @@ +//! Provides CRC-32/ISCSI calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "aarch64")] + +use crate::crc32::fusion::aarch64::{clmul_hi, clmul_lo, clmul_scalar}; +use std::arch::aarch64::{ + __crc32cb, __crc32cd, __crc32cw, uint64x2_t, veor3q_u64, veorq_u64, vgetq_lane_u64, vld1q_u64, + vmov_n_u64, vmull_p8, vreinterpret_p8_u64, vreinterpretq_u64_p16, +}; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i neon_eor3 -p crc32c -a v9s3x2e_s3 +/// +/// Modified as necessary for this Rust implementation. 
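The EOR3 variant below earns its own runtime-selected path because `veor3q_u64` performs a three-way XOR in a single SHA3 EOR3 instruction, where ordinary NEON needs two dependent EORs. A minimal equivalence sketch (the module and helper names are illustrative):

#[cfg(target_arch = "aarch64")]
mod eor3_equivalence {
    use std::arch::aarch64::{uint64x2_t, veorq_u64};

    // Computes a ^ b ^ c with two chained EORs; veor3q_u64 produces the same
    // result in one EOR3 instruction when the sha3 feature is available.
    pub unsafe fn xor3_without_eor3(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
        veorq_u64(veorq_u64(a, b), c)
    }
}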
+#[inline] +#[target_feature(enable = "crc,aes,sha3")] +pub(crate) unsafe fn crc32_iscsi_eor3_v9s3x2e_s3( + mut crc0: u32, + mut buf: *const u8, + mut len: usize, +) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let blk = len / 192; + let klen = blk * 16; + let buf2 = buf.add(klen * 3); + let limit = buf.add(klen).sub(32); + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk + let mut x0 = vld1q_u64(buf2 as *const u64); + let mut x1 = vld1q_u64(buf2.add(16) as *const u64); + let mut x2 = vld1q_u64(buf2.add(32) as *const u64); + let mut x3 = vld1q_u64(buf2.add(48) as *const u64); + let mut x4 = vld1q_u64(buf2.add(64) as *const u64); + let mut x5 = vld1q_u64(buf2.add(80) as *const u64); + let mut x6 = vld1q_u64(buf2.add(96) as *const u64); + let mut x7 = vld1q_u64(buf2.add(112) as *const u64); + let mut x8 = vld1q_u64(buf2.add(128) as *const u64); + + let k_vals: [u64; 2] = [0x7e908048, 0xc96cfdc0]; + let mut k = vld1q_u64(k_vals.as_ptr()); + let mut buf2 = buf2.add(144); + + // Main loop + while buf <= limit { + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y1 = clmul_lo(x1, k); + x1 = clmul_hi(x1, k); + let y2 = clmul_lo(x2, k); + x2 = clmul_hi(x2, k); + let y3 = clmul_lo(x3, k); + x3 = clmul_hi(x3, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + let y5 = clmul_lo(x5, k); + x5 = clmul_hi(x5, k); + let y6 = clmul_lo(x6, k); + x6 = clmul_hi(x6, k); + let y7 = clmul_lo(x7, k); + x7 = clmul_hi(x7, k); + let y8 = clmul_lo(x8, k); + x8 = clmul_hi(x8, k); + + x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); + x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); + x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); + x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); + x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); + x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); + x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); + x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); + x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); + + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + buf = buf.add(16); + buf2 = buf2.add(144); + } + + // Reduce x0 ... 
x8 to just x0 + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, x1); + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + x7 = x8; + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y2 = clmul_lo(x2, k); + x2 = clmul_hi(x2, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + let y6 = clmul_lo(x6, k); + x6 = clmul_hi(x6, k); + + x0 = veor3q_u64(x0, y0, x1); + x2 = veor3q_u64(x2, y2, x3); + x4 = veor3q_u64(x4, y4, x5); + x6 = veor3q_u64(x6, y6, x7); + + let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + + x0 = veor3q_u64(x0, y0, x2); + x4 = veor3q_u64(x4, y4, x6); + + let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, x4); + + // Final scalar chunk + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + let vc0 = crc_shift_iscsi(crc0, klen * 2 + blk * 144); + let vc1 = crc_shift_iscsi(crc1, klen + blk * 144); + let vc2 = crc_shift_iscsi(crc2, blk * 144); + let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1)); + + buf = buf2; + len = end.offset_from(buf) as usize; + } + + if len >= 32 { + let klen = ((len - 8) / 24) * 8; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // Main loop + loop { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + buf = buf.add(8); + len -= 24; + if len < 32 { + break; + } + } + + let vc0 = crc_shift_iscsi(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi(crc1, klen + 8); + let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = __crc32cd(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + while len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn crc_shift_iscsi(crc: u32, nbytes: usize) -> uint64x2_t { + clmul_scalar(crc, xnmodp_crc32_iscsi((nbytes * 8 - 33) as u64)) +} + +// x^n mod P, in log(n) time +#[inline] +#[target_feature(enable = "crc,aes")] +unsafe fn xnmodp_crc32_iscsi(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) - 16; + } + stack = !stack; + acc = 0x80000000u32 >> (n & 31); + n >>= 5; + + while n > 0 { + // ARM CRC32 instruction + acc = __crc32cw(acc, 0); + n -= 1; + } + + while { + low = (stack & 1) as u32; + stack >>= 1; + stack != 0 + } { + // Convert to polynomial type and square it + let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); + let squared = vmull_p8(x, x); + let y = 
vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); + acc = __crc32cd(0, y << low); + } + acc +} diff --git a/src/crc32/fusion/aarch64/iscsi/mod.rs b/src/crc32/fusion/aarch64/iscsi/mod.rs new file mode 100644 index 0000000..653ea8a --- /dev/null +++ b/src/crc32/fusion/aarch64/iscsi/mod.rs @@ -0,0 +1,9 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides aarch64-specific implementations of CRC-32/ISCSI calculations using +//! fusion techniques. + +#![cfg(target_arch = "aarch64")] + +pub(crate) mod crc_pmull; +pub(crate) mod crc_pmull_sha3; diff --git a/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull.rs b/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull.rs new file mode 100644 index 0000000..b57a959 --- /dev/null +++ b/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull.rs @@ -0,0 +1,184 @@ +//! Provides CRC-32/ISO-HDLC calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "aarch64")] + +use crate::crc32::fusion::aarch64::{clmul_hi_and_xor, clmul_lo_and_xor}; +use std::arch::aarch64::{ + __crc32b, __crc32d, veorq_u64, vgetq_lane_u64, vld1q_u64, vmovq_n_u64, vsetq_lane_u64, +}; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i neon -p crc32 -a v12e_v1 +/// +/// Modified as necessary for this Rust implementation. +#[inline] +#[target_feature(enable = "crc,aes")] +pub(crate) unsafe fn crc32_iso_hdlc_v12e_v1( + mut crc0: u32, + mut buf: *const u8, + mut len: usize, +) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let limit = buf.add(len - 192); + + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + let mut x1 = vld1q_u64(buf.add(16) as *const u64); + let mut x2 = vld1q_u64(buf.add(32) as *const u64); + let mut x3 = vld1q_u64(buf.add(48) as *const u64); + let mut x4 = vld1q_u64(buf.add(64) as *const u64); + let mut x5 = vld1q_u64(buf.add(80) as *const u64); + let mut x6 = vld1q_u64(buf.add(96) as *const u64); + let mut x7 = vld1q_u64(buf.add(112) as *const u64); + let mut x8 = vld1q_u64(buf.add(128) as *const u64); + let mut x9 = vld1q_u64(buf.add(144) as *const u64); + let mut x10 = vld1q_u64(buf.add(160) as *const u64); + let mut x11 = vld1q_u64(buf.add(176) as *const u64); + + // ISO-HDLC specific constants for small implementation + let k_vals: [u64; 2] = [0x596c8d81, 0xf5e48c85]; + let mut k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(192); + + // Main loop + while buf <= limit { + let y0 = clmul_lo_and_xor(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_and_xor(x0, k, y0); + let y1 = clmul_lo_and_xor(x1, k, vld1q_u64(buf.add(16) as *const u64)); + x1 = clmul_hi_and_xor(x1, k, y1); + let y2 = 
clmul_lo_and_xor(x2, k, vld1q_u64(buf.add(32) as *const u64)); + x2 = clmul_hi_and_xor(x2, k, y2); + let y3 = clmul_lo_and_xor(x3, k, vld1q_u64(buf.add(48) as *const u64)); + x3 = clmul_hi_and_xor(x3, k, y3); + let y4 = clmul_lo_and_xor(x4, k, vld1q_u64(buf.add(64) as *const u64)); + x4 = clmul_hi_and_xor(x4, k, y4); + let y5 = clmul_lo_and_xor(x5, k, vld1q_u64(buf.add(80) as *const u64)); + x5 = clmul_hi_and_xor(x5, k, y5); + let y6 = clmul_lo_and_xor(x6, k, vld1q_u64(buf.add(96) as *const u64)); + x6 = clmul_hi_and_xor(x6, k, y6); + let y7 = clmul_lo_and_xor(x7, k, vld1q_u64(buf.add(112) as *const u64)); + x7 = clmul_hi_and_xor(x7, k, y7); + let y8 = clmul_lo_and_xor(x8, k, vld1q_u64(buf.add(128) as *const u64)); + x8 = clmul_hi_and_xor(x8, k, y8); + let y9 = clmul_lo_and_xor(x9, k, vld1q_u64(buf.add(144) as *const u64)); + x9 = clmul_hi_and_xor(x9, k, y9); + let y10 = clmul_lo_and_xor(x10, k, vld1q_u64(buf.add(160) as *const u64)); + x10 = clmul_hi_and_xor(x10, k, y10); + let y11 = clmul_lo_and_xor(x11, k, vld1q_u64(buf.add(176) as *const u64)); + x11 = clmul_hi_and_xor(x11, k, y11); + buf = buf.add(192); + } + + // Reduce x0 ... x11 to just x0 + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x1); + x0 = clmul_hi_and_xor(x0, k, y0); + let y2 = clmul_lo_and_xor(x2, k, x3); + x2 = clmul_hi_and_xor(x2, k, y2); + let y4 = clmul_lo_and_xor(x4, k, x5); + x4 = clmul_hi_and_xor(x4, k, y4); + let y6 = clmul_lo_and_xor(x6, k, x7); + x6 = clmul_hi_and_xor(x6, k, y6); + let y8 = clmul_lo_and_xor(x8, k, x9); + x8 = clmul_hi_and_xor(x8, k, y8); + let y10 = clmul_lo_and_xor(x10, k, x11); + x10 = clmul_hi_and_xor(x10, k, y10); + + let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x2); + x0 = clmul_hi_and_xor(x0, k, y0); + let y4 = clmul_lo_and_xor(x4, k, x6); + x4 = clmul_hi_and_xor(x4, k, y4); + let y8 = clmul_lo_and_xor(x8, k, x10); + x8 = clmul_hi_and_xor(x8, k, y8); + + let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x4); + x0 = clmul_hi_and_xor(x0, k, y0); + x4 = x8; + let y0 = clmul_lo_and_xor(x0, k, x4); + x0 = clmul_hi_and_xor(x0, k, y0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); + len = end.offset_from(buf) as usize; + } + + if len >= 16 { + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + let k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(16); + len -= 16; + + // Main loop + while len >= 16 { + let y0 = clmul_lo_and_xor(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_and_xor(x0, k, y0); + buf = buf.add(16); + len -= 16; + } + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); + } + + while len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull_sha3.rs b/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull_sha3.rs new file mode 100644 index 0000000..3483774 --- /dev/null +++ 
b/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull_sha3.rs @@ -0,0 +1,267 @@ +//! Provides CRC-32/ISO-HDLC calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "aarch64")] + +use crate::crc32::fusion::aarch64::{clmul_hi, clmul_lo, clmul_scalar}; +use std::arch::aarch64::{ + __crc32b, __crc32d, __crc32w, uint64x2_t, veor3q_u64, veorq_u64, vgetq_lane_u64, vld1q_u64, + vmov_n_u64, vmull_p8, vreinterpret_p8_u64, vreinterpretq_u64_p16, +}; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 +/// +/// Modified as necessary for this Rust implementation. +#[inline] +#[target_feature(enable = "crc,aes,sha3")] +pub(crate) unsafe fn crc32_iso_hdlc_eor3_v9s3x2e_s3( + mut crc0: u32, + mut buf: *const u8, + mut len: usize, +) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let blk = len / 192; + let klen = blk * 16; + let buf2 = buf.add(klen * 3); + let limit = buf.add(klen).sub(32); + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk + let mut x0 = vld1q_u64(buf2 as *const u64); + let mut x1 = vld1q_u64(buf2.add(16) as *const u64); + let mut x2 = vld1q_u64(buf2.add(32) as *const u64); + let mut x3 = vld1q_u64(buf2.add(48) as *const u64); + let mut x4 = vld1q_u64(buf2.add(64) as *const u64); + let mut x5 = vld1q_u64(buf2.add(80) as *const u64); + let mut x6 = vld1q_u64(buf2.add(96) as *const u64); + let mut x7 = vld1q_u64(buf2.add(112) as *const u64); + let mut x8 = vld1q_u64(buf2.add(128) as *const u64); + + // ISO-HDLC specific constants + let k_vals: [u64; 2] = [0x26b70c3d, 0x3f41287a]; + let mut k = vld1q_u64(k_vals.as_ptr()); + let mut buf2 = buf2.add(144); + + // Main loop + while buf <= limit { + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y1 = clmul_lo(x1, k); + x1 = clmul_hi(x1, k); + let y2 = clmul_lo(x2, k); + x2 = clmul_hi(x2, k); + let y3 = clmul_lo(x3, k); + x3 = clmul_hi(x3, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + let y5 = clmul_lo(x5, k); + x5 = clmul_hi(x5, k); + let y6 = clmul_lo(x6, k); + x6 = clmul_hi(x6, k); + let y7 = clmul_lo(x7, k); + x7 = clmul_hi(x7, k); + let y8 = clmul_lo(x8, k); + x8 = clmul_hi(x8, k); + + x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); + x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); + x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); + x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); + x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); + x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); + x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); + x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); + x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); + + crc0 = 
__crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + buf = buf.add(16); + buf2 = buf2.add(144); + } + + // Reduce x0 ... x8 to just x0 + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, x1); + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + x7 = x8; + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y2 = clmul_lo(x2, k); + x2 = clmul_hi(x2, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + let y6 = clmul_lo(x6, k); + x6 = clmul_hi(x6, k); + + x0 = veor3q_u64(x0, y0, x1); + x2 = veor3q_u64(x2, y2, x3); + x4 = veor3q_u64(x4, y4, x5); + x6 = veor3q_u64(x6, y6, x7); + + let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + + x0 = veor3q_u64(x0, y0, x2); + x4 = veor3q_u64(x4, y4, x6); + + let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, x4); + + // Final scalar chunk + crc0 = __crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + blk * 144); + let vc1 = crc_shift_iso_hdlc(crc1, klen + blk * 144); + let vc2 = crc_shift_iso_hdlc(crc2, blk * 144); + let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); + + buf = buf2; + len = end.offset_from(buf) as usize; + } + + if len >= 32 { + let klen = ((len - 8) / 24) * 8; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // Main loop + loop { + crc0 = __crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + buf = buf.add(8); + len -= 24; + if len < 32 { + break; + } + } + + let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + 8); + let vc1 = crc_shift_iso_hdlc(crc1, klen + 8); + let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = __crc32d(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + while len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn crc_shift_iso_hdlc(crc: u32, nbytes: usize) -> uint64x2_t { + clmul_scalar(crc, xnmodp_iso_hdlc((nbytes * 8 - 33) as u64)) +} + +// x^n mod P, in log(n) time +#[inline] +#[target_feature(enable = "crc,aes")] +unsafe fn xnmodp_iso_hdlc(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) 
- 16;
+    }
+    stack = !stack;
+    acc = 0x80000000u32 >> (n & 31);
+    n >>= 5;
+
+    while n > 0 {
+        // ARM CRC32 instruction (ISO-HDLC uses standard CRC32, not CRC32C)
+        acc = __crc32w(acc, 0);
+        n -= 1;
+    }
+
+    while {
+        low = (stack & 1) as u32;
+        stack >>= 1;
+        stack != 0
+    } {
+        // Convert to polynomial type and square it
+        let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64));
+        let squared = vmull_p8(x, x);
+        let y = vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0);
+        acc = __crc32d(0, y << low);
+    }
+    acc
+}
diff --git a/src/crc32/fusion/aarch64/iso_hdlc/mod.rs b/src/crc32/fusion/aarch64/iso_hdlc/mod.rs
new file mode 100644
index 0000000..2f3db58
--- /dev/null
+++ b/src/crc32/fusion/aarch64/iso_hdlc/mod.rs
@@ -0,0 +1,9 @@
+// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0.
+
+//! This module provides aarch64-specific implementations of CRC-32/ISO-HDLC calculations using
+//! fusion techniques.
+
+#![cfg(target_arch = "aarch64")]
+
+pub(crate) mod crc_pmull;
+pub(crate) mod crc_pmull_sha3;
diff --git a/src/crc32/fusion/aarch64/mod.rs b/src/crc32/fusion/aarch64/mod.rs
new file mode 100644
index 0000000..de15ab4
--- /dev/null
+++ b/src/crc32/fusion/aarch64/mod.rs
@@ -0,0 +1,275 @@
+//! Provides CRC-32/ISCSI and CRC-32/ISO-HDLC calculations using a fusion of native CLMUL
+//! instructions and native CRC calculation instructions on aarch64.
+//!
+//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/
+//!
+//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/
+//! with the help of Claude.ai.
+//!
+//! Modified as necessary for this Rust implementation.
+//!
+//! MIT licensed.
+
+#![cfg(target_arch = "aarch64")]
+
+mod iscsi;
+mod iso_hdlc;
+
+use std::arch::aarch64::*;
+use std::arch::is_aarch64_feature_detected;
+
+use iscsi::crc_pmull::crc32_iscsi_v12e_v1;
+use iscsi::crc_pmull_sha3::crc32_iscsi_eor3_v9s3x2e_s3;
+use iso_hdlc::crc_pmull::crc32_iso_hdlc_v12e_v1;
+use iso_hdlc::crc_pmull_sha3::crc32_iso_hdlc_eor3_v9s3x2e_s3;
+
+#[inline(always)]
+pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 {
+    if is_aarch64_feature_detected!("sha3") {
+        unsafe { crc32_iscsi_aes_sha3(crc, data) }
+    } else {
+        unsafe { crc32_iscsi_aes(crc, data) }
+    }
+}
+
+#[inline(always)]
+pub fn crc32_iso_hdlc(crc: u32, data: &[u8]) -> u32 {
+    if is_aarch64_feature_detected!("sha3") {
+        unsafe { crc32_iso_hdlc_aes_sha3(crc, data) }
+    } else {
+        unsafe { crc32_iso_hdlc_aes(crc, data) }
+    }
+}
+
+/// Dispatch wrapper for CRC-32/ISCSI that selects an implementation by buffer
+/// size; callers must ensure the crc, aes, and sha3 CPU features are available.
+#[inline]
+#[target_feature(enable = "crc,aes,sha3")]
+unsafe fn crc32_iscsi_aes_sha3(crc: u32, data: &[u8]) -> u32 {
+    unsafe {
+        const LARGE_BUFFER_THRESHOLD: usize = 1024;
+
+        // Select implementation based on buffer size
+        if data.len() <= LARGE_BUFFER_THRESHOLD {
+            crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len())
+        } else {
+            crc32_iscsi_eor3_v9s3x2e_s3(crc, data.as_ptr(), data.len())
+        }
+    }
+}
+
+#[inline]
+#[target_feature(enable = "crc,aes")]
+unsafe fn crc32_iscsi_aes(crc: u32, data: &[u8]) -> u32 {
+    unsafe { crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) }
+}
+
+/// Dispatch wrapper for CRC-32/ISO-HDLC that selects an implementation by buffer
+/// size; callers must ensure the crc, aes, and sha3 CPU features are available.
+#[inline]
+#[target_feature(enable = "crc,aes,sha3")]
+unsafe fn crc32_iso_hdlc_aes_sha3(crc: u32, data: &[u8]) -> u32 {
+    unsafe {
+        const LARGE_BUFFER_THRESHOLD: usize = 1024;
+
+        // Select implementation based on buffer size
+        if data.len() <= LARGE_BUFFER_THRESHOLD {
+            crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len())
+        } else {
+            crc32_iso_hdlc_eor3_v9s3x2e_s3(crc,
data.as_ptr(), data.len()) + } + } +} + +#[inline] +#[target_feature(enable = "crc,aes")] +unsafe fn crc32_iso_hdlc_aes(crc: u32, data: &[u8]) -> u32 { + unsafe { crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) } +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn clmul_lo(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + // Polynomial multiply low parts - convert u128 result to uint64x2_t + let result = vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)); + vreinterpretq_u64_p128(result) +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn clmul_hi(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + // Polynomial multiply high parts - convert u128 result to uint64x2_t + let result = vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)); + vreinterpretq_u64_p128(result) +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn clmul_scalar(a: u32, b: u32) -> uint64x2_t { + // Polynomial multiply scalars - convert u128 result to uint64x2_t + let result = vmull_p64(a as u64, b as u64); + vreinterpretq_u64_p128(result) +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn clmul_lo_and_xor(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + veorq_u64(clmul_lo(a, b), c) +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn clmul_hi_and_xor(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + veorq_u64(clmul_hi(a, b), c) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test::consts::TEST_CHECK_STRING; + use crc::{Crc, Table}; + use rand::{rng, Rng}; + + const RUST_CRC32_ISO_HDLC: Crc> = + Crc::>::new(&crc::CRC_32_ISO_HDLC); + + const RUST_CRC32_ISCSI: Crc> = Crc::>::new(&crc::CRC_32_ISCSI); + + #[test] + fn test_crc32_iso_hdlc_check() { + assert_eq!( + crc32_iso_hdlc(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, + 0xcbf43926 + ); + } + + #[test] + fn test_crc32_iso_hdlc_small_all_lengths() { + for len in 1..=255 { + crc32_iso_hdlc_random(len) + } + } + + #[test] + fn test_crc32_iso_hdlc_medium_lengths() { + // Test each length from 256 to 1024, which should fold and include handling remainders + for len in 256..=1024 { + crc32_iso_hdlc_random(len) + } + } + + #[test] + fn test_crc32_iso_hdlc_large_lengths() { + // Test 1 MiB just before, at, and just after the folding boundaries + for len in 1048575..1048577 { + crc32_iso_hdlc_random(len) + } + } + + #[test] + fn test_crc32_iscsi_check() { + assert_eq!( + crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, + 0xe3069283 + ); + } + + #[test] + fn test_crc32_iscsi_small_all_lengths() { + for len in 1..=255 { + crc32_iscsi_random(len); + } + } + + #[test] + fn test_crc32_iscsi_medium_lengths() { + // Test each length from 256 to 1024, which should fold and include handling remainders + for len in 256..=1024 { + crc32_iscsi_random(len); + } + } + + #[test] + fn test_crc32_iscsi_large_lengths() { + // Test 1 MiB just before, at, and just after the folding boundaries + for len in 1048575..1048577 { + crc32_iscsi_random(len); + } + } + + #[cfg(target_feature = "sha3")] + fn crc32_iso_hdlc_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISO_HDLC.checksum(&data); + + assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iso_hdlc_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + + assert_eq!( + crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + #[cfg(not(target_feature = 
"sha3"))] + fn crc32_iso_hdlc_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISO_HDLC.checksum(&data); + + assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + #[cfg(target_feature = "sha3")] + fn crc32_iscsi_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISCSI.checksum(&data); + + assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iscsi_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + + assert_eq!( + crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + #[cfg(not(target_feature = "sha3"))] + fn crc32_iscsi_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISCSI.checksum(&data); + + assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } +} diff --git a/src/crc32/fusion/mod.rs b/src/crc32/fusion/mod.rs index d75a64b..774ce3b 100644 --- a/src/crc32/fusion/mod.rs +++ b/src/crc32/fusion/mod.rs @@ -11,24 +11,23 @@ mod aarch64; mod x86; +/// Only AArch64 has native CRC-32/ISO-HDLC instructions #[inline(always)] -#[allow(unused)] +#[cfg(target_arch = "aarch64")] pub(crate) fn crc32_iso_hdlc(state: u32, data: &[u8]) -> u32 { - #[cfg(target_arch = "aarch64")] - return aarch64::crc32_iso_hdlc(state, data); - - #[cfg(not(target_arch = "aarch64"))] - panic!("CRC-32/ISO-HDLC with fusion is only supported on AArch64 architecture"); + aarch64::crc32_iso_hdlc(state, data) } +/// Both AArch64 and x86 have native CRC-32/ISCSI instructions #[inline(always)] pub(crate) fn crc32_iscsi(state: u32, data: &[u8]) -> u32 { #[cfg(target_arch = "aarch64")] - return aarch64::crc32_iscsi(state, data); - - #[cfg(target_arch = "x86_64")] - return x86::crc32_iscsi(state, data); + { + aarch64::crc32_iscsi(state, data) + } - #[cfg(all(not(target_arch = "aarch64"), not(target_arch = "x86_64")))] - panic!("CRC-32/ISCSI with fusion is only supported on AArch64 and X86_64 architectures"); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + x86::crc32_iscsi(state, data) + } } diff --git a/src/crc32/fusion/x86.rs b/src/crc32/fusion/x86.rs deleted file mode 100644 index c288072..0000000 --- a/src/crc32/fusion/x86.rs +++ /dev/null @@ -1,740 +0,0 @@ -//! Provides CRC-32/ISCSI and CRC-32/ISO-HDLC calculations using a fusion of native CLMUL -//! instructions and native CRC calculation instructions on x86_64. -//! -//! https://www.corsix.org/content/fast-crc32c-4k -//! https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq -//! -//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -//! with the help of Claude.ai using: -//! -//! ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 -//! ./generate -i avx512 -p crc32c -a v4s3x3 -//! ./generate -i sse -p crc32c -a v4s3x3 -//! -//! Modified as necessary for this Rust implementation. -//! -//! MIT licensed. 
- -#![cfg(target_arch = "x86_64")] - -use std::arch::x86_64::*; - -/// Safe wrapper for CRC32 iSCSI calculation using AVX-512 -#[rustversion::before(1.89)] -#[inline(always)] -pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { - unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } -} - -#[rustversion::since(1.89)] -#[inline(always)] -pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { - if is_x86_feature_detected!("vpclmulqdq") - && is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("avx512vl") - { - unsafe { - return crc32_iscsi_avx512_vpclmulqdq_v3x2(crc, data.as_ptr(), data.len()); - } - } - - if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { - unsafe { - return crc32_iscsi_avx512_v4s3x3(crc, data.as_ptr(), data.len()); - } - } - - unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } -} - -#[rustversion::since(1.89)] -#[inline] -#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq")] -unsafe fn clmul_lo_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { - _mm512_clmulepi64_epi128(a, b, 0) -} - -#[rustversion::since(1.89)] -#[inline] -#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq")] -unsafe fn clmul_hi_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { - _mm512_clmulepi64_epi128(a, b, 17) -} - -#[inline] -#[target_feature(enable = "pclmulqdq")] -unsafe fn clmul_lo_sse(a: __m128i, b: __m128i) -> __m128i { - _mm_clmulepi64_si128(a, b, 0) -} - -#[inline] -#[target_feature(enable = "pclmulqdq")] -unsafe fn clmul_hi_sse(a: __m128i, b: __m128i) -> __m128i { - _mm_clmulepi64_si128(a, b, 17) -} - -#[inline] -#[target_feature(enable = "pclmulqdq")] -unsafe fn clmul_scalar_sse(a: u32, b: u32) -> __m128i { - _mm_clmulepi64_si128(_mm_cvtsi32_si128(a as i32), _mm_cvtsi32_si128(b as i32), 0) -} - -// x^n mod P, in log(n) time -#[target_feature(enable = "sse4.2,pclmulqdq")] -unsafe fn xnmodp_iscsi_sse(mut n: u64) -> u32 { - let mut stack = !1u64; - let mut acc: u32; - let mut low: u32; - - while n > 191 { - stack = (stack << 1) + (n & 1); - n = (n >> 1) - 16; - } - stack = !stack; - acc = 0x80000000u32 >> (n & 31); - n >>= 5; - - while n > 0 { - // Use hardware CRC32C instruction - acc = _mm_crc32_u32(acc, 0); - n -= 1; - } - - while { - low = (stack & 1) as u32; - stack >>= 1; - stack != 0 - } { - let x = _mm_cvtsi32_si128(acc as i32); - let y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0)) as u64; - acc = _mm_crc32_u64(0, y << low) as u32; - } - acc -} - -#[inline] -#[target_feature(enable = "pclmulqdq")] -unsafe fn crc_shift_iscsi_sse(crc: u32, nbytes: usize) -> __m128i { - clmul_scalar_sse(crc, xnmodp_iscsi_sse((nbytes * 8 - 33) as u64)) -} - -#[inline] -#[target_feature(enable = "sse4.1")] -unsafe fn mm_extract_epi64(val: __m128i, idx: i32) -> u64 { - if idx == 0 { - _mm_cvtsi128_si64(val) as u64 - } else { - _mm_cvtsi128_si64(_mm_srli_si128(val, 8)) as u64 - } -} - -#[inline] -#[target_feature(enable = "sse4.2")] -unsafe fn mm_crc32_u64(crc: u32, val: u64) -> u32 { - _mm_crc32_u64(crc.into(), val) as u32 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 -#[rustversion::since(1.89)] -#[inline] -#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq,sse4.2")] -pub unsafe fn crc32_iscsi_avx512_vpclmulqdq_v3x2( - mut crc0: u32, - mut buf: *const u8, - mut len: usize, -) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = _mm_crc32_u8(crc0, 
*buf); - buf = buf.add(1); - len -= 1; - } - - // Align to 64-byte boundary (cache line) - while (buf as usize & 56) != 0 && len >= 8 { - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - buf = buf.add(8); - len -= 8; - } - - if len >= 384 { - // First vector chunk - load three 512-bit vectors (192 bytes total) - let mut x0 = _mm512_loadu_si512(buf as *const __m512i); - let mut x1 = _mm512_loadu_si512(buf.add(64) as *const __m512i); - let mut x2 = _mm512_loadu_si512(buf.add(128) as *const __m512i); - - // Create the multiplication constant vector - // Pattern: [0xa87ab8a8, 0, 0xab7aff2a, 0] repeated across all 128-bit lanes - let k_128 = _mm_setr_epi32(0xa87ab8a8u32 as i32, 0, 0xab7aff2au32 as i32, 0); - let mut k = _mm512_broadcast_i32x4(k_128); - - // XOR the CRC into the first vector's low 32 bits - let crc_vec = _mm512_castsi128_si512(_mm_cvtsi32_si128(crc0 as i32)); - x0 = _mm512_xor_si512(crc_vec, x0); - - // First round of polynomial multiplication - let mut y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - let mut y1 = clmul_lo_avx512_vpclmulqdq(x1, k); - x1 = clmul_hi_avx512_vpclmulqdq(x1, k); - let mut y2 = clmul_lo_avx512_vpclmulqdq(x2, k); - x2 = clmul_hi_avx512_vpclmulqdq(x2, k); - - // XOR with next chunk of data using ternary logic (A XOR B XOR C) - // 0x96 = A XOR B XOR C in ternary logic notation - x0 = _mm512_ternarylogic_epi64( - x0, - y0, - _mm512_loadu_si512(buf.add(192) as *const __m512i), - 0x96, - ); - x1 = _mm512_ternarylogic_epi64( - x1, - y1, - _mm512_loadu_si512(buf.add(256) as *const __m512i), - 0x96, - ); - x2 = _mm512_ternarylogic_epi64( - x2, - y2, - _mm512_loadu_si512(buf.add(320) as *const __m512i), - 0x96, - ); - - buf = buf.add(384); - len -= 384; - - // Main loop - process 384 bytes at a time - while len >= 384 { - // First folding step - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - y1 = clmul_lo_avx512_vpclmulqdq(x1, k); - x1 = clmul_hi_avx512_vpclmulqdq(x1, k); - y2 = clmul_lo_avx512_vpclmulqdq(x2, k); - x2 = clmul_hi_avx512_vpclmulqdq(x2, k); - - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512(buf as *const __m512i), 0x96); - x1 = _mm512_ternarylogic_epi64( - x1, - y1, - _mm512_loadu_si512(buf.add(64) as *const __m512i), - 0x96, - ); - x2 = _mm512_ternarylogic_epi64( - x2, - y2, - _mm512_loadu_si512(buf.add(128) as *const __m512i), - 0x96, - ); - - // Second folding step - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - y1 = clmul_lo_avx512_vpclmulqdq(x1, k); - x1 = clmul_hi_avx512_vpclmulqdq(x1, k); - y2 = clmul_lo_avx512_vpclmulqdq(x2, k); - x2 = clmul_hi_avx512_vpclmulqdq(x2, k); - - x0 = _mm512_ternarylogic_epi64( - x0, - y0, - _mm512_loadu_si512(buf.add(192) as *const __m512i), - 0x96, - ); - x1 = _mm512_ternarylogic_epi64( - x1, - y1, - _mm512_loadu_si512(buf.add(256) as *const __m512i), - 0x96, - ); - x2 = _mm512_ternarylogic_epi64( - x2, - y2, - _mm512_loadu_si512(buf.add(320) as *const __m512i), - 0x96, - ); - - buf = buf.add(384); - len -= 384; - } - - // Reduce x0, x1, x2 to just x0 - let k_128 = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); - k = _mm512_broadcast_i32x4(k_128); - - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); - x1 = x2; - - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); - - // Reduce 512 bits to 
128 bits - // Multiple reduction constants for different parts of the 512-bit vector - k = _mm512_setr_epi32( - 0x1c291d04u32 as i32, - 0, - 0xddc0152bu32 as i32, - 0, // Lane 0 - 0x3da6d0cbu32 as i32, - 0, - 0xba4fc28eu32 as i32, - 0, // Lane 1 - 0xf20c0dfeu32 as i32, - 0, - 0x493c7d27u32 as i32, - 0, // Lane 2 - 0, - 0, - 0, - 0, // Lane 3 (unused) - ); - - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - k = clmul_hi_avx512_vpclmulqdq(x0, k); - y0 = _mm512_xor_si512(y0, k); - - // Extract 128-bit lanes and combine them - let lane0 = _mm512_castsi512_si128(y0); - let lane1 = _mm512_extracti32x4_epi32(y0, 1); - let lane2 = _mm512_extracti32x4_epi32(y0, 2); - let lane3 = _mm512_extracti32x4_epi32(x0, 3); - - // Combine all lanes using ternary logic - let mut z0 = _mm_ternarylogic_epi64(lane0, lane1, lane2, 0x96); - z0 = _mm_xor_si128(z0, lane3); - - // Reduce 128 bits to 32 bits using CRC32 instructions - crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0) as u64) as u32; - crc0 = _mm_crc32_u64(crc0.into(), _mm_extract_epi64(z0, 1) as u64) as u32; - } - - // Process remaining 8-byte chunks - while len >= 8 { - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - buf = buf.add(8); - len -= 8; - } - - // Process remaining bytes - while len > 0 { - crc0 = _mm_crc32_u8(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i avx512 -p crc32c -a v4s3x3 -#[rustversion::since(1.89)] -#[inline] -#[target_feature(enable = "avx2,avx512f,avx512vl,pclmulqdq")] -pub unsafe fn crc32_iscsi_avx512_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary using hardware CRC32C instructions - while len > 0 && (buf as usize & 7) != 0 { - crc0 = _mm_crc32_u8(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - buf = buf.add(8); - len -= 8; - } - - if len >= 144 { - let blk = (len - 8) / 136; - let klen = blk * 24; - let buf2 = buf; - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // First vector chunk - load four 128-bit vectors (64 bytes total) - let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); - let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); - let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); - let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); - - // iSCSI-specific folding constant (different from ISO-HDLC) - let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); - - // XOR the CRC into the first vector's low 32 bits - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); - crc0 = 0; - - let mut buf2 = buf2.add(64); - len -= 136; - buf = buf.add(blk * 64); - - // Main loop - process 144 bytes at a time - while len >= 144 { - let y0 = clmul_lo_sse(x0, k); - x0 = clmul_hi_sse(x0, k); - let y1 = clmul_lo_sse(x1, k); - x1 = clmul_hi_sse(x1, k); - let y2 = clmul_lo_sse(x2, k); - x2 = clmul_hi_sse(x2, k); - let y3 = clmul_lo_sse(x3, k); - x3 = clmul_hi_sse(x3, k); - - // XOR with next chunk of data using ternary logic (A XOR B XOR C) - x0 = _mm_ternarylogic_epi64(x0, y0, _mm_loadu_si128(buf2 as *const __m128i), 0x96); - x1 = _mm_ternarylogic_epi64( - x1, - y1, - _mm_loadu_si128(buf2.add(16) as *const __m128i), - 0x96, - ); - x2 = _mm_ternarylogic_epi64( - x2, - y2, - _mm_loadu_si128(buf2.add(32) as *const __m128i), - 0x96, - ); - 
x3 = _mm_ternarylogic_epi64( - x3, - y3, - _mm_loadu_si128(buf2.add(48) as *const __m128i), - 0x96, - ); - - // Process scalar data in parallel using hardware CRC32C - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; - crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; - crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; - - buf = buf.add(24); - buf2 = buf2.add(64); - len -= 136; - } - - // Reduce x0 ... x3 to just x0 using iSCSI-specific constants - k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0); - - let y0 = clmul_lo_sse(x0, k); - x0 = clmul_hi_sse(x0, k); - let y2 = clmul_lo_sse(x2, k); - x2 = clmul_hi_sse(x2, k); - - x0 = _mm_ternarylogic_epi64(x0, y0, x1, 0x96); - x2 = _mm_ternarylogic_epi64(x2, y2, x3, 0x96); - - k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0); - - let y0 = clmul_lo_sse(x0, k); - x0 = clmul_hi_sse(x0, k); - x0 = _mm_ternarylogic_epi64(x0, y0, x2, 0x96); - - // Final scalar chunk - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; - crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; - crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; - buf = buf.add(24); - - let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8); - let vc1 = crc_shift_iscsi_sse(crc1, klen + 8); - let mut vc = _mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0) as u64; - - // Reduce 128 bits to 32 bits, and multiply by x^32 - let x0_low = _mm_extract_epi64(x0, 0) as u64; - let x0_high = _mm_extract_epi64(x0, 1) as u64; - let vec_crc = _mm_crc32_u64(_mm_crc32_u64(0, x0_low), x0_high); - vc ^= _mm_extract_epi64(crc_shift_iscsi_sse(vec_crc as u32, klen * 3 + 8), 0) as u64; - - // Final 8 bytes - buf = buf.add(klen * 2); - crc0 = crc2; - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64) ^ vc) as u32; - buf = buf.add(8); - len -= 8; - } - - // Process remaining 8-byte chunks using hardware CRC32C - while len >= 8 { - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - buf = buf.add(8); - len -= 8; - } - - // Process remaining bytes using hardware CRC32C - while len > 0 { - crc0 = _mm_crc32_u8(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i sse -p crc32c -a v4s3x3 -#[inline] -#[target_feature(enable = "sse4.2,pclmulqdq")] -pub unsafe fn crc32_iscsi_sse_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary 
using hardware CRC32C instructions - while len > 0 && (buf as usize & 7) != 0 { - crc0 = _mm_crc32_u8(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 144 { - let blk = (len - 8) / 136; - let klen = blk * 24; - let buf2 = buf; - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // First vector chunk - load four 128-bit vectors (64 bytes total) - let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); - let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); - let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); - let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); - - // iSCSI-specific folding constant (same as AVX-512 version) - let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); - - // XOR the CRC into the first vector's low 32 bits - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); - crc0 = 0; - - let mut buf2 = buf2.add(64); - len -= 136; - buf = buf.add(blk * 64); - - // Main loop - process 144 bytes at a time - while len >= 144 { - let mut y0 = clmul_lo_sse(x0, k); - x0 = clmul_hi_sse(x0, k); - let mut y1 = clmul_lo_sse(x1, k); - x1 = clmul_hi_sse(x1, k); - let mut y2 = clmul_lo_sse(x2, k); - x2 = clmul_hi_sse(x2, k); - let mut y3 = clmul_lo_sse(x3, k); - x3 = clmul_hi_sse(x3, k); - - // XOR operations using separate XOR instructions (no ternary logic in SSE) - y0 = _mm_xor_si128(y0, _mm_loadu_si128(buf2 as *const __m128i)); - x0 = _mm_xor_si128(x0, y0); - y1 = _mm_xor_si128(y1, _mm_loadu_si128(buf2.add(16) as *const __m128i)); - x1 = _mm_xor_si128(x1, y1); - y2 = _mm_xor_si128(y2, _mm_loadu_si128(buf2.add(32) as *const __m128i)); - x2 = _mm_xor_si128(x2, y2); - y3 = _mm_xor_si128(y3, _mm_loadu_si128(buf2.add(48) as *const __m128i)); - x3 = _mm_xor_si128(x3, y3); - - // Process scalar data in parallel using hardware CRC32C - crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); - crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64)); - crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64)); - crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64)); - crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64)); - crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64)); - - buf = buf.add(24); - buf2 = buf2.add(64); - len -= 136; - } - - // Reduce x0 ... 
x3 to just x0 using iSCSI-specific constants - k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0); - - let mut y0 = clmul_lo_sse(x0, k); - x0 = clmul_hi_sse(x0, k); - let mut y2 = clmul_lo_sse(x2, k); - x2 = clmul_hi_sse(x2, k); - - y0 = _mm_xor_si128(y0, x1); - x0 = _mm_xor_si128(x0, y0); - y2 = _mm_xor_si128(y2, x3); - x2 = _mm_xor_si128(x2, y2); - - k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0); - - y0 = clmul_lo_sse(x0, k); - x0 = clmul_hi_sse(x0, k); - y0 = _mm_xor_si128(y0, x2); - x0 = _mm_xor_si128(x0, y0); - - // Final scalar chunk - crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); - crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64)); - crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64)); - crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64)); - crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64)); - crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64)); - buf = buf.add(24); - - let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8); - let vc1 = crc_shift_iscsi_sse(crc1, klen + 8); - let mut vc = mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0); - - // Reduce 128 bits to 32 bits, and multiply by x^32 - // Extract the two 64-bit parts of x0 and combine them - let x0_low = mm_extract_epi64(x0, 0); - let x0_high = mm_extract_epi64(x0, 1); - let x0_combined = mm_extract_epi64( - crc_shift_iscsi_sse(mm_crc32_u64(mm_crc32_u64(0, x0_low), x0_high), klen * 3 + 8), - 0, - ); - vc ^= x0_combined; - - // Final 8 bytes - buf = buf.add(klen * 2); - crc0 = crc2; - crc0 = mm_crc32_u64(crc0, *(buf as *const u64) ^ vc); - buf = buf.add(8); - len -= 8; - } - - // Process remaining 8-byte chunks using hardware CRC32C - while len >= 8 { - crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - // Process remaining bytes using hardware CRC32C - while len > 0 { - crc0 = _mm_crc32_u8(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::test::consts::TEST_CHECK_STRING; - use crc::{Crc, Table}; - use rand::{rng, Rng}; - - const RUST_CRC32_ISCSI: Crc> = Crc::>::new(&crc::CRC_32_ISCSI); - - #[test] - fn test_crc32_iscsi_check() { - assert_eq!( - crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, - 0xe3069283 - ); - } - - #[test] - fn test_crc32_iscsi_small_all_lengths() { - for len in 1..=255 { - test_crc32_iscsi_random(len); - } - } - - #[test] - fn test_crc32_iscsi_medium_lengths() { - // Test each length from 256 to 1024, which should fold and include handling remainders - for len in 256..=1024 { - test_crc32_iscsi_random(len); - } - } - - #[test] - fn test_crc32_iscsi_large_lengths() { - // Test 1 MiB just before, at, and just after the folding boundaries - for len in 1048575..1048577 { - test_crc32_iscsi_random(len); - } - } - - #[rustversion::since(1.89)] - fn test_crc32_iscsi_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISCSI.checksum(&data); - - assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - if is_x86_feature_detected!("vpclmulqdq") - && is_x86_feature_detected!("avx512vl") - && is_x86_feature_detected!("avx512f") - { - assert_eq!( - crc32_iscsi_avx512_vpclmulqdq_v3x2(0xffffffff, data.as_ptr(), data.len()) - ^ 0xffffffff, - checksum - 
); - } - - if is_x86_feature_detected!("avx512vl") - && is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("pclmulqdq") - { - assert_eq!( - crc32_iscsi_avx512_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - - assert_eq!( - crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } - - #[rustversion::before(1.89)] - fn test_crc32_iscsi_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISCSI.checksum(&data); - - assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } -} diff --git a/src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs b/src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs new file mode 100644 index 0000000..4bd0c42 --- /dev/null +++ b/src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs @@ -0,0 +1,168 @@ +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! MIT licensed. + +#![cfg(target_arch = "x86_64")] + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i avx512 -p crc32c -a v4s3x3 +/// +/// Modified as necessary for this Rust implementation. +#[rustversion::since(1.89)] +#[inline] +#[target_feature(enable = "avx512vl,pclmulqdq")] +pub unsafe fn crc32_iscsi_avx512_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + use crate::fusion::x86::*; + + // Align to 8-byte boundary using hardware CRC32C instructions + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + if len >= 144 { + let blk = (len - 8) / 136; + let klen = blk * 24; + let buf2 = buf; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk - load four 128-bit vectors (64 bytes total) + let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); + let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); + let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); + let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); + + // iSCSI-specific folding constant (different from ISO-HDLC) + let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + + // XOR the CRC into the first vector's low 32 bits + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); + crc0 = 0; + + let mut buf2 = buf2.add(64); + len -= 136; + buf = buf.add(blk * 64); + + // Main loop - process 144 bytes at a time + while len >= 144 { + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let y1 = clmul_lo_sse(x1, k); + x1 = clmul_hi_sse(x1, k); + let y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + let y3 = clmul_lo_sse(x3, k); + x3 = clmul_hi_sse(x3, k); + + // XOR with next chunk of data using ternary logic (A XOR B XOR C) + x0 = _mm_ternarylogic_epi64(x0, y0, _mm_loadu_si128(buf2 as *const __m128i), 0x96); + x1 = _mm_ternarylogic_epi64( + x1, + y1, + _mm_loadu_si128(buf2.add(16) as *const __m128i), + 0x96, + ); + x2 = _mm_ternarylogic_epi64( + x2, + y2, + _mm_loadu_si128(buf2.add(32) as *const __m128i), + 0x96, + ); + x3 = _mm_ternarylogic_epi64( + x3, + y3, + 
_mm_loadu_si128(buf2.add(48) as *const __m128i), + 0x96, + ); + + // Process scalar data in parallel using hardware CRC32C + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; + + buf = buf.add(24); + buf2 = buf2.add(64); + len -= 136; + } + + // Reduce x0 ... x3 to just x0 using iSCSI-specific constants + k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0); + + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + + x0 = _mm_ternarylogic_epi64(x0, y0, x1, 0x96); + x2 = _mm_ternarylogic_epi64(x2, y2, x3, 0x96); + + k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0); + + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + x0 = _mm_ternarylogic_epi64(x0, y0, x2, 0x96); + + // Final scalar chunk + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; + buf = buf.add(24); + + let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi_sse(crc1, klen + 8); + let mut vc = _mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0) as u64; + + // Reduce 128 bits to 32 bits, and multiply by x^32 + let x0_low = _mm_extract_epi64(x0, 0) as u64; + let x0_high = _mm_extract_epi64(x0, 1) as u64; + let vec_crc = _mm_crc32_u64(_mm_crc32_u64(0, x0_low), x0_high); + vc ^= _mm_extract_epi64(crc_shift_iscsi_sse(vec_crc as u32, klen * 3 + 8), 0) as u64; + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64) ^ vc) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining 8-byte chunks using hardware CRC32C + while len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes using hardware CRC32C + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs b/src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs new file mode 100644 index 0000000..1b1f2cd --- /dev/null +++ b/src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs @@ -0,0 +1,208 @@ +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! 
MIT licensed. + +#![cfg(target_arch = "x86_64")] + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 +/// +/// Modified as necessary for this Rust implementation. +#[rustversion::since(1.89)] +#[inline] +#[target_feature(enable = "avx512vl,vpclmulqdq")] +pub unsafe fn crc32_iscsi_avx512_vpclmulqdq_v3x2( + mut crc0: u32, + mut buf: *const u8, + mut len: usize, +) -> u32 { + use crate::fusion::x86::*; + + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Align to 64-byte boundary (cache line) + while (buf as usize & 56) != 0 && len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + if len >= 384 { + // First vector chunk - load three 512-bit vectors (192 bytes total) + let mut x0 = _mm512_loadu_si512(buf as *const __m512i); + let mut x1 = _mm512_loadu_si512(buf.add(64) as *const __m512i); + let mut x2 = _mm512_loadu_si512(buf.add(128) as *const __m512i); + + // Create the multiplication constant vector + // Pattern: [0xa87ab8a8, 0, 0xab7aff2a, 0] repeated across all 128-bit lanes + let k_128 = _mm_setr_epi32(0xa87ab8a8u32 as i32, 0, 0xab7aff2au32 as i32, 0); + let mut k = _mm512_broadcast_i32x4(k_128); + + // XOR the CRC into the first vector's low 32 bits + let crc_vec = _mm512_castsi128_si512(_mm_cvtsi32_si128(crc0 as i32)); + x0 = _mm512_xor_si512(crc_vec, x0); + + // First round of polynomial multiplication + let mut y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + let mut y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + let mut y2 = clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + // XOR with next chunk of data using ternary logic (A XOR B XOR C) + // 0x96 = A XOR B XOR C in ternary logic notation + x0 = _mm512_ternarylogic_epi64( + x0, + y0, + _mm512_loadu_si512(buf.add(192) as *const __m512i), + 0x96, + ); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(256) as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(320) as *const __m512i), + 0x96, + ); + + buf = buf.add(384); + len -= 384; + + // Main loop - process 384 bytes at a time + while len >= 384 { + // First folding step + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + y2 = clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512(buf as *const __m512i), 0x96); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(64) as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(128) as *const __m512i), + 0x96, + ); + + // Second folding step + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + y2 = clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + x0 = _mm512_ternarylogic_epi64( + x0, + y0, + _mm512_loadu_si512(buf.add(192) as *const __m512i), + 0x96, + ); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(256) 
as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(320) as *const __m512i), + 0x96, + ); + + buf = buf.add(384); + len -= 384; + } + + // Reduce x0, x1, x2 to just x0 + let k_128 = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + k = _mm512_broadcast_i32x4(k_128); + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); + x1 = x2; + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); + + // Reduce 512 bits to 128 bits + // Multiple reduction constants for different parts of the 512-bit vector + k = _mm512_setr_epi32( + 0x1c291d04u32 as i32, + 0, + 0xddc0152bu32 as i32, + 0, // Lane 0 + 0x3da6d0cbu32 as i32, + 0, + 0xba4fc28eu32 as i32, + 0, // Lane 1 + 0xf20c0dfeu32 as i32, + 0, + 0x493c7d27u32 as i32, + 0, // Lane 2 + 0, + 0, + 0, + 0, // Lane 3 (unused) + ); + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + k = clmul_hi_avx512_vpclmulqdq(x0, k); + y0 = _mm512_xor_si512(y0, k); + + // Extract 128-bit lanes and combine them + let lane0 = _mm512_castsi512_si128(y0); + let lane1 = _mm512_extracti32x4_epi32(y0, 1); + let lane2 = _mm512_extracti32x4_epi32(y0, 2); + let lane3 = _mm512_extracti32x4_epi32(x0, 3); + + // Combine all lanes using ternary logic + let mut z0 = _mm_ternarylogic_epi64(lane0, lane1, lane2, 0x96); + z0 = _mm_xor_si128(z0, lane3); + + // Reduce 128 bits to 32 bits using CRC32 instructions + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0) as u64) as u32; + crc0 = _mm_crc32_u64(crc0.into(), _mm_extract_epi64(z0, 1) as u64) as u32; + } + + // Process remaining 8-byte chunks + while len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/x86/iscsi/mod.rs b/src/crc32/fusion/x86/iscsi/mod.rs new file mode 100644 index 0000000..eaffcde --- /dev/null +++ b/src/crc32/fusion/x86/iscsi/mod.rs @@ -0,0 +1,10 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides x86-specific implementations of CRC-32/ISCSI calculations using +//! fusion techniques. + +#![cfg(any(target_arch = "x86", target_arch = "x86_64"))] + +pub(crate) mod avx512_pclmulqdq; +pub(crate) mod avx512_vpclmulqdq; +pub(crate) mod sse_pclmulqdq; diff --git a/src/crc32/fusion/x86/iscsi/sse_pclmulqdq.rs b/src/crc32/fusion/x86/iscsi/sse_pclmulqdq.rs new file mode 100644 index 0000000..bda9ae8 --- /dev/null +++ b/src/crc32/fusion/x86/iscsi/sse_pclmulqdq.rs @@ -0,0 +1,169 @@ +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! MIT licensed. + +#![cfg(any(target_arch = "x86_64", target_arch = "x86"))] + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; + +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +use crate::fusion::x86::*; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i sse -p crc32c -a v4s3x3 +/// +/// Modified as necessary for this Rust implementation. 
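+///
+/// # Safety
+///
+/// Callers must ensure the CPU supports SSE4.2 and PCLMULQDQ (normally
+/// guaranteed by the wrapper layer's runtime feature detection), and that
+/// `buf` is valid for reads of `len` bytes.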
+#[inline] +#[target_feature(enable = "sse4.2,pclmulqdq")] +pub unsafe fn crc32_iscsi_sse_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary using hardware CRC32C instructions + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 144 { + let blk = (len - 8) / 136; + let klen = blk * 24; + let buf2 = buf; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk - load four 128-bit vectors (64 bytes total) + let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); + let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); + let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); + let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); + + // iSCSI-specific folding constant (same as AVX-512 version) + let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + + // XOR the CRC into the first vector's low 32 bits + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); + crc0 = 0; + + let mut buf2 = buf2.add(64); + len -= 136; + buf = buf.add(blk * 64); + + // Main loop - process 144 bytes at a time + while len >= 144 { + let mut y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let mut y1 = clmul_lo_sse(x1, k); + x1 = clmul_hi_sse(x1, k); + let mut y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + let mut y3 = clmul_lo_sse(x3, k); + x3 = clmul_hi_sse(x3, k); + + // XOR operations using separate XOR instructions (no ternary logic in SSE) + y0 = _mm_xor_si128(y0, _mm_loadu_si128(buf2 as *const __m128i)); + x0 = _mm_xor_si128(x0, y0); + y1 = _mm_xor_si128(y1, _mm_loadu_si128(buf2.add(16) as *const __m128i)); + x1 = _mm_xor_si128(x1, y1); + y2 = _mm_xor_si128(y2, _mm_loadu_si128(buf2.add(32) as *const __m128i)); + x2 = _mm_xor_si128(x2, y2); + y3 = _mm_xor_si128(y3, _mm_loadu_si128(buf2.add(48) as *const __m128i)); + x3 = _mm_xor_si128(x3, y3); + + // Process scalar data in parallel using hardware CRC32C + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64)); + + buf = buf.add(24); + buf2 = buf2.add(64); + len -= 136; + } + + // Reduce x0 ... 
x3 to just x0 using iSCSI-specific constants + k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0); + + let mut y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let mut y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + + y0 = _mm_xor_si128(y0, x1); + x0 = _mm_xor_si128(x0, y0); + y2 = _mm_xor_si128(y2, x3); + x2 = _mm_xor_si128(x2, y2); + + k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0); + + y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + y0 = _mm_xor_si128(y0, x2); + x0 = _mm_xor_si128(x0, y0); + + // Final scalar chunk + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64)); + buf = buf.add(24); + + let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi_sse(crc1, klen + 8); + let mut vc = mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + // Extract the two 64-bit parts of x0 and combine them + let x0_low = mm_extract_epi64(x0, 0); + let x0_high = mm_extract_epi64(x0, 1); + let x0_combined = mm_extract_epi64( + crc_shift_iscsi_sse(mm_crc32_u64(mm_crc32_u64(0, x0_low), x0_high), klen * 3 + 8), + 0, + ); + vc ^= x0_combined; + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = mm_crc32_u64(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + // Process remaining 8-byte chunks using hardware CRC32C + while len >= 8 { + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes using hardware CRC32C + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/x86/mod.rs b/src/crc32/fusion/x86/mod.rs new file mode 100644 index 0000000..bea8c9e --- /dev/null +++ b/src/crc32/fusion/x86/mod.rs @@ -0,0 +1,289 @@ +//! Provides CRC-32/ISCSI calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on x86 / x86_64. +//! +//! https://www.corsix.org/content/fast-crc32c-4k +//! https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. 
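+//!
+//! Illustrative usage sketch of this module's entry point (0xe3069283 is the
+//! standard CRC-32/ISCSI check value for b"123456789", as asserted by the
+//! tests below):
+//!
+//! ```ignore
+//! let crc = crc32_iscsi(0xffffffff, b"123456789") ^ 0xffffffff;
+//! assert_eq!(crc, 0xe3069283);
+//! ```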
+ +#![cfg(any(target_arch = "x86", target_arch = "x86_64"))] + +mod iscsi; + +use iscsi::sse_pclmulqdq::crc32_iscsi_sse_v4s3x3; + +#[cfg(target_arch = "x86_64")] +#[rustversion::since(1.89)] +use iscsi::avx512_pclmulqdq::crc32_iscsi_avx512_v4s3x3; +#[cfg(target_arch = "x86_64")] +#[rustversion::since(1.89)] +use iscsi::avx512_vpclmulqdq::crc32_iscsi_avx512_vpclmulqdq_v3x2; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +/// Safe wrapper for CRC32 iSCSI calculation using AVX-512 +/// CRC32 iSCSI calculation for Rust versions before 1.89 +/// +/// +/// This function is called by the wrapper layer after feature detection has been performed. +/// For older Rust versions, only SSE implementation is available. +#[rustversion::before(1.89)] +#[inline(always)] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + // Only SSE implementation is available for Rust versions before 1.89 + // Runtime feature detection is handled by the wrapper layer + unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } +} + +/// CRC32 iSCSI calculation using the highest available instruction set +/// +/// This function is called by the wrapper layer after feature detection has been performed. +/// The wrapper layer ensures that only the appropriate implementation is called based on +/// cached feature detection results, removing runtime checks from the hot path. +#[rustversion::since(1.89)] +#[inline(always)] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + #[cfg(target_arch = "x86_64")] + { + // AVX512 + VPCLMULQDQ + + if is_x86_feature_detected!("avx512vl") && is_x86_feature_detected!("vpclmulqdq") { + unsafe { + return crc32_iscsi_avx512_vpclmulqdq_v3x2(crc, data.as_ptr(), data.len()); + } + } + + // AVX512 + if is_x86_feature_detected!("avx512vl") { + unsafe { + return crc32_iscsi_avx512_v4s3x3(crc, data.as_ptr(), data.len()); + } + } + } + + // Fallback to SSE implementation + unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } +} + +#[rustversion::since(1.89)] +#[cfg(target_arch = "x86_64")] +#[inline] +#[target_feature(enable = "avx512vl,vpclmulqdq")] +unsafe fn clmul_lo_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { + _mm512_clmulepi64_epi128(a, b, 0) +} + +#[rustversion::since(1.89)] +#[cfg(target_arch = "x86_64")] +#[inline] +#[target_feature(enable = "avx512vl,vpclmulqdq")] +unsafe fn clmul_hi_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { + _mm512_clmulepi64_epi128(a, b, 17) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_lo_sse(a: __m128i, b: __m128i) -> __m128i { + _mm_clmulepi64_si128(a, b, 0) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_hi_sse(a: __m128i, b: __m128i) -> __m128i { + _mm_clmulepi64_si128(a, b, 17) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_scalar_sse(a: u32, b: u32) -> __m128i { + _mm_clmulepi64_si128(_mm_cvtsi32_si128(a as i32), _mm_cvtsi32_si128(b as i32), 0) +} + +// x^n mod P, in log(n) time +#[target_feature(enable = "sse4.2,pclmulqdq")] +unsafe fn xnmodp_iscsi_sse(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) - 16; + } + stack = !stack; + acc = 0x80000000u32 >> (n & 31); + n >>= 5; + + while n > 0 { + // Use hardware CRC32C instruction + acc = _mm_crc32_u32(acc, 0); + n -= 1; + } + + while { + low = (stack & 1) as u32; + stack >>= 1; + stack != 0 + } { + let x = 
_mm_cvtsi32_si128(acc as i32);
+        let clmul_result = _mm_clmulepi64_si128(x, x, 0);
+        let y = mm_extract_epi64(clmul_result, 0);
+        acc = mm_crc32_u64(0, y << low);
+    }
+    acc
+}
+
+#[inline]
+#[target_feature(enable = "pclmulqdq")]
+unsafe fn crc_shift_iscsi_sse(crc: u32, nbytes: usize) -> __m128i {
+    clmul_scalar_sse(crc, xnmodp_iscsi_sse((nbytes * 8 - 33) as u64))
+}
+
+#[inline]
+#[target_feature(enable = "sse4.1")]
+unsafe fn mm_extract_epi64(val: __m128i, idx: i32) -> u64 {
+    #[cfg(target_arch = "x86_64")]
+    {
+        if idx == 0 {
+            _mm_cvtsi128_si64(val) as u64
+        } else {
+            _mm_cvtsi128_si64(_mm_srli_si128(val, 8)) as u64
+        }
+    }
+    #[cfg(target_arch = "x86")]
+    {
+        // On 32-bit x86, extract two 32-bit values and combine them
+        let shifted = if idx == 0 {
+            val
+        } else {
+            _mm_srli_si128(val, 8)
+        };
+        let low = _mm_cvtsi128_si32(shifted) as u32;
+        let high = _mm_cvtsi128_si32(_mm_srli_si128(shifted, 4)) as u32;
+        (low as u64) | ((high as u64) << 32)
+    }
+}
+
+#[inline]
+#[target_feature(enable = "sse4.2")]
+unsafe fn mm_crc32_u64(crc: u32, val: u64) -> u32 {
+    #[cfg(target_arch = "x86_64")]
+    {
+        _mm_crc32_u64(crc.into(), val) as u32
+    }
+    #[cfg(target_arch = "x86")]
+    {
+        // On 32-bit x86, process 64-bit value as two 32-bit CRC operations
+        let low = val as u32;
+        let high = (val >> 32) as u32;
+        let crc = _mm_crc32_u32(crc, low);
+        _mm_crc32_u32(crc, high)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::test::consts::TEST_CHECK_STRING;
+    use crc::{Crc, Table};
+    use rand::{rng, Rng};
+
+    const RUST_CRC32_ISCSI: Crc<u32, Table<16>> = Crc::<u32, Table<16>>::new(&crc::CRC_32_ISCSI);
+
+    #[test]
+    fn test_crc32_iscsi_check() {
+        assert_eq!(
+            crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff,
+            0xe3069283
+        );
+    }
+
+    #[test]
+    fn test_crc32_iscsi_small_all_lengths() {
+        for len in 1..=255 {
+            test_crc32_iscsi_random(len);
+        }
+    }
+
+    #[test]
+    fn test_crc32_iscsi_medium_lengths() {
+        // Test each length from 256 to 1024, which should fold and include handling remainders
+        for len in 256..=1024 {
+            test_crc32_iscsi_random(len);
+        }
+    }
+
+    #[test]
+    fn test_crc32_iscsi_large_lengths() {
+        // Test 1 MiB just before, at, and just after the folding boundaries
+        for len in 1048575..=1048577 {
+            test_crc32_iscsi_random(len);
+        }
+    }
+
+    #[rustversion::since(1.89)]
+    fn test_crc32_iscsi_random(len: usize) {
+        let mut data = vec![0u8; len];
+        rng().fill(&mut data[..]);
+
+        let checksum = RUST_CRC32_ISCSI.checksum(&data);
+
+        assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum);
+
+        unsafe {
+            #[cfg(target_arch = "x86_64")]
+            {
+                if is_x86_feature_detected!("vpclmulqdq")
+                    && is_x86_feature_detected!("avx512vl")
+                    && is_x86_feature_detected!("avx512f")
+                {
+                    assert_eq!(
+                        crc32_iscsi_avx512_vpclmulqdq_v3x2(0xffffffff, data.as_ptr(), data.len())
+                            ^ 0xffffffff,
+                        checksum
+                    );
+                }
+
+                if is_x86_feature_detected!("avx512vl")
+                    && is_x86_feature_detected!("avx512f")
+                    && is_x86_feature_detected!("pclmulqdq")
+                {
+                    assert_eq!(
+                        crc32_iscsi_avx512_v4s3x3(0xffffffff, data.as_ptr(), data.len())
+                            ^ 0xffffffff,
+                        checksum
+                    );
+                }
+            }
+
+            assert_eq!(
+                crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff,
+                checksum
+            );
+        }
+    }
+
+    #[rustversion::before(1.89)]
+    fn test_crc32_iscsi_random(len: usize) {
+        let mut data = vec![0u8; len];
+        rng().fill(&mut data[..]);
+
+        let checksum = RUST_CRC32_ISCSI.checksum(&data);
+
+        assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum);
+
+        unsafe {
+            assert_eq!(
+                crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff,
+                checksum
+            );
+        }
+    }
+}
diff --git a/src/crc32/mod.rs b/src/crc32/mod.rs
index 518f5f2..78b2c45 100644
--- a/src/crc32/mod.rs
+++ b/src/crc32/mod.rs
@@ -5,5 +5,5 @@
 pub mod algorithm;
 pub mod consts;
 
-#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
 pub(crate) mod fusion;
diff --git a/src/feature_detection.rs b/src/feature_detection.rs
new file mode 100644
index 0000000..d77a8fb
--- /dev/null
+++ b/src/feature_detection.rs
@@ -0,0 +1,1238 @@
+// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0.
+
+//! Feature detection system for safe and efficient hardware acceleration across different
+//! platforms.
+
+use std::sync::OnceLock;
+
+/// Global ArchOps instance cache - initialized once based on feature detection results
+static ARCH_OPS_INSTANCE: OnceLock<ArchOpsInstance> = OnceLock::new();
+
+/// Performance tiers representing different hardware capability levels
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[allow(dead_code)] // Some variants may not be constructed on all target architectures
+pub enum PerformanceTier {
+    // AArch64 tiers
+    AArch64AesSha3,
+    AArch64Aes,
+
+    // x86_64 tiers
+    X86_64Avx512Vpclmulqdq,
+    X86_64Avx512Pclmulqdq,
+    X86_64SsePclmulqdq,
+
+    // x86 tiers
+    X86SsePclmulqdq,
+
+    // Fallback
+    SoftwareTable,
+}
+
+/// Architecture-specific capabilities
+#[derive(Debug, Clone, Copy)]
+#[allow(dead_code)] // Some fields may not be read on all target architectures
+pub struct ArchCapabilities {
+    // AArch64 features
+    pub has_aes: bool,  // provides PMULL support for CRC calculations (NEON is implicit)
+    pub has_sha3: bool, // requires 'aes', provides EOR3 for XOR3 operations
+
+    // x86/x86_64 features
+    pub has_sse41: bool,
+    pub has_pclmulqdq: bool,
+    pub has_avx512vl: bool, // implicitly enables avx512f, has XOR3 operations
+    pub has_vpclmulqdq: bool,
+
+    // Rust version gates
+    pub rust_version_supports_avx512: bool,
+}
+
+/// Helper function to convert a performance tier to a human-readable target string
+/// Format: {architecture}-{intrinsics-family}-{intrinsics-features}
+#[inline(always)]
+fn tier_to_target_string(tier: PerformanceTier) -> String {
+    match tier {
+        PerformanceTier::AArch64AesSha3 => "aarch64-neon-pmull-sha3".to_string(),
+        PerformanceTier::AArch64Aes => "aarch64-neon-pmull".to_string(),
+        PerformanceTier::X86_64Avx512Vpclmulqdq => "x86_64-avx512-vpclmulqdq".to_string(),
+        PerformanceTier::X86_64Avx512Pclmulqdq => "x86_64-avx512-pclmulqdq".to_string(),
+        PerformanceTier::X86_64SsePclmulqdq => "x86_64-sse-pclmulqdq".to_string(),
+        PerformanceTier::X86SsePclmulqdq => "x86-sse-pclmulqdq".to_string(),
+        PerformanceTier::SoftwareTable => "software-fallback-tables".to_string(),
+    }
+}
+
+/// Detect architecture-specific capabilities combining compile-time and runtime checks
+///
+/// # Safety
+/// Uses runtime feature detection which may access CPU-specific registers
+unsafe fn detect_arch_capabilities() -> ArchCapabilities {
+    #[cfg(target_arch = "aarch64")]
+    {
+        detect_aarch64_features()
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    {
+        detect_x86_features()
+    }
+
+    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))]
+    {
+        // Other architectures use software fallback
+        ArchCapabilities {
+            has_aes: false,
+            has_sha3: false,
+            has_sse41: false,
+            has_pclmulqdq: false,
+            has_avx512vl: false,
+            has_vpclmulqdq: false,
+            rust_version_supports_avx512: false,
+        }
+    }
+}
+
+/// AArch64-specific 
feature detection +/// +/// Note: NEON is always available on AArch64 and is implicitly enabled by AES support. +/// AES support provides the PMULL instructions needed for CRC calculations. +#[inline(always)] +#[cfg(target_arch = "aarch64")] +unsafe fn detect_aarch64_features() -> ArchCapabilities { + use std::arch::is_aarch64_feature_detected; + + // AES is available on essentially all Aarch64 CPUs and provides the PMULL instructions + let has_aes = is_aarch64_feature_detected!("aes"); + + // SHA3 is available on modern Aarch64 CPUs, and provides the EOR3 instruction for efficient + // XOR3 operations. + let has_sha3 = is_aarch64_feature_detected!("sha3"); + + ArchCapabilities { + has_aes, + has_sha3, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + } +} + +/// x86/x86_64-specific feature detection +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +unsafe fn detect_x86_features() -> ArchCapabilities { + use std::arch::is_x86_feature_detected; + + // Check Rust version support for VPCLMULQDQ (requires 1.89+) + let rust_version_supports_avx512 = check_rust_version_supports_avx512(); + + // SSE 4.1 and PCLMULQDQ support are the baseline for hardware acceleration + let has_sse41 = is_x86_feature_detected!("sse4.1"); + let has_pclmulqdq = has_sse41 && is_x86_feature_detected!("pclmulqdq"); + + // After Rust 1.89, AVX-512VL and VPCLMULQDQ can be used if available + let has_avx512vl = + has_pclmulqdq && rust_version_supports_avx512 && is_x86_feature_detected!("avx512vl"); + let has_vpclmulqdq = + has_avx512vl && rust_version_supports_avx512 && is_x86_feature_detected!("vpclmulqdq"); + + ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41, + has_pclmulqdq, + has_avx512vl, + has_vpclmulqdq, + rust_version_supports_avx512, + } +} + +/// Check if the current Rust version supports VPCLMULQDQ intrinsics +/// VPCLMULQDQ intrinsics were stabilized in Rust 1.89 +#[rustversion::since(1.89)] +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(crate) fn check_rust_version_supports_avx512() -> bool { + true +} + +/// Check if the current Rust version supports VPCLMULQDQ intrinsics +/// VPCLMULQDQ intrinsics were stabilized in Rust 1.89 +#[rustversion::before(1.89)] +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(crate) fn check_rust_version_supports_avx512() -> bool { + false +} + +/// Select the appropriate performance tier based on detected capabilities +#[inline(always)] +#[allow(unused)] +pub(crate) fn select_performance_tier(capabilities: &ArchCapabilities) -> PerformanceTier { + #[cfg(target_arch = "aarch64")] + { + if capabilities.has_sha3 && capabilities.has_aes { + return PerformanceTier::AArch64AesSha3; + } + + if capabilities.has_aes { + return PerformanceTier::AArch64Aes; + } + } + + #[cfg(target_arch = "x86_64")] + { + if capabilities.has_vpclmulqdq { + return PerformanceTier::X86_64Avx512Vpclmulqdq; + } + if capabilities.has_avx512vl { + return PerformanceTier::X86_64Avx512Pclmulqdq; + } + if capabilities.has_pclmulqdq { + return PerformanceTier::X86_64SsePclmulqdq; + } + } + + #[cfg(target_arch = "x86")] + { + if capabilities.has_pclmulqdq { + return PerformanceTier::X86SsePclmulqdq; + } + } + + // Fallback to software implementation + PerformanceTier::SoftwareTable +} + +/// Enum that holds the different ArchOps implementations for compile-time dispatch +/// This avoids the need for trait objects while 
still providing factory-based selection +#[rustversion::since(1.89)] +#[derive(Debug, Clone, Copy)] +pub enum ArchOpsInstance { + #[cfg(target_arch = "aarch64")] + Aarch64Aes(crate::arch::aarch64::aes::Aarch64AesOps), + #[cfg(target_arch = "aarch64")] + Aarch64AesSha3(crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops), + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + X86SsePclmulqdq(crate::arch::x86::sse::X86SsePclmulqdqOps), + #[cfg(target_arch = "x86_64")] + X86_64Avx512Pclmulqdq(crate::arch::x86_64::avx512::X86_64Avx512PclmulqdqOps), + #[cfg(target_arch = "x86_64")] + X86_64Avx512Vpclmulqdq(crate::arch::x86_64::avx512_vpclmulqdq::X86_64Avx512VpclmulqdqOps), + /// Software fallback - no ArchOps struct needed + SoftwareFallback, +} + +#[rustversion::before(1.89)] +#[derive(Debug, Clone, Copy)] +pub enum ArchOpsInstance { + #[cfg(target_arch = "aarch64")] + Aarch64Aes(crate::arch::aarch64::aes::Aarch64AesOps), + #[cfg(target_arch = "aarch64")] + Aarch64AesSha3(crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops), + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + X86SsePclmulqdq(crate::arch::x86::sse::X86SsePclmulqdqOps), + /// Software fallback - no ArchOps struct needed + SoftwareFallback, +} + +impl ArchOpsInstance { + #[inline(always)] + #[rustversion::since(1.89)] + pub fn get_tier(&self) -> PerformanceTier { + match self { + #[cfg(target_arch = "aarch64")] + ArchOpsInstance::Aarch64Aes(_) => PerformanceTier::AArch64Aes, + #[cfg(target_arch = "aarch64")] + ArchOpsInstance::Aarch64AesSha3(_) => PerformanceTier::AArch64AesSha3, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + ArchOpsInstance::X86SsePclmulqdq(_) => PerformanceTier::X86SsePclmulqdq, + #[cfg(target_arch = "x86_64")] + ArchOpsInstance::X86_64Avx512Pclmulqdq(_) => PerformanceTier::X86_64Avx512Pclmulqdq, + #[cfg(target_arch = "x86_64")] + ArchOpsInstance::X86_64Avx512Vpclmulqdq(_) => PerformanceTier::X86_64Avx512Vpclmulqdq, + ArchOpsInstance::SoftwareFallback => PerformanceTier::SoftwareTable, + } + } + + #[inline(always)] + #[rustversion::before(1.89)] + pub fn get_tier(&self) -> PerformanceTier { + match self { + #[cfg(target_arch = "aarch64")] + ArchOpsInstance::Aarch64Aes(_) => PerformanceTier::AArch64Aes, + #[cfg(target_arch = "aarch64")] + ArchOpsInstance::Aarch64AesSha3(_) => PerformanceTier::AArch64AesSha3, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + ArchOpsInstance::X86SsePclmulqdq(_) => PerformanceTier::X86SsePclmulqdq, + ArchOpsInstance::SoftwareFallback => PerformanceTier::SoftwareTable, + } + } + + /// Get a human-readable target string describing the active configuration + #[inline(always)] + pub fn get_target_string(&self) -> String { + tier_to_target_string(self.get_tier()) + } +} + +/// Get the global ArchOps instance (thread-safe, initialized once based on feature detection) +/// +/// This function provides access to the cached ArchOps instance that was selected based on +/// feature detection results at library initialization time, eliminating runtime feature +/// detection overhead from hot paths. +pub fn get_arch_ops() -> &'static ArchOpsInstance { + ARCH_OPS_INSTANCE.get_or_init(create_arch_ops) +} + +/// Factory function that creates the appropriate ArchOps struct based on cached feature detection +/// +/// This function uses the cached feature detection results to select the optimal +/// architecture-specific implementation at library initialization time, eliminating +/// runtime feature detection overhead from hot paths. 
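+///
+/// External callers never invoke this directly; they go through [`get_arch_ops`], which
+/// memoizes the result in a `OnceLock`. A hypothetical dispatch site (illustrative only,
+/// not a real call site in this crate) would look like:
+///
+/// ```ignore
+/// // Feature detection ran at most once, at first use; this is just a cached read.
+/// match crate::feature_detection::get_arch_ops() {
+///     ArchOpsInstance::SoftwareFallback => { /* table-driven update */ }
+///     _ => { /* hand the buffer to the SIMD implementation selected at init */ }
+/// }
+/// ```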
+fn create_arch_ops() -> ArchOpsInstance { + let capabilities = unsafe { detect_arch_capabilities() }; + let tier = select_performance_tier(&capabilities); + + create_arch_ops_from_tier(tier) +} + +/// Helper function to create ArchOpsInstance from a performance tier for Rust 1.89+ (when AVX512 +/// stabilized) +#[rustversion::since(1.89)] +fn create_arch_ops_from_tier(tier: PerformanceTier) -> ArchOpsInstance { + match tier { + #[cfg(target_arch = "aarch64")] + PerformanceTier::AArch64AesSha3 => { + use crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops; + ArchOpsInstance::Aarch64AesSha3(Aarch64AesSha3Ops::new()) + } + #[cfg(target_arch = "aarch64")] + PerformanceTier::AArch64Aes => { + use crate::arch::aarch64::aes::Aarch64AesOps; + ArchOpsInstance::Aarch64Aes(Aarch64AesOps) + } + #[cfg(target_arch = "x86_64")] + PerformanceTier::X86_64Avx512Vpclmulqdq => { + use crate::arch::x86_64::avx512_vpclmulqdq::X86_64Avx512VpclmulqdqOps; + ArchOpsInstance::X86_64Avx512Vpclmulqdq(X86_64Avx512VpclmulqdqOps::new()) + } + #[cfg(target_arch = "x86_64")] + PerformanceTier::X86_64Avx512Pclmulqdq => { + use crate::arch::x86_64::avx512::X86_64Avx512PclmulqdqOps; + ArchOpsInstance::X86_64Avx512Pclmulqdq(X86_64Avx512PclmulqdqOps::new()) + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + PerformanceTier::X86_64SsePclmulqdq | PerformanceTier::X86SsePclmulqdq => { + create_x86_sse_pclmulqdq_ops() + } + PerformanceTier::SoftwareTable => { + // Use software fallback + ArchOpsInstance::SoftwareFallback + } + // Handle cases where the performance tier doesn't match the current architecture + _ => { + // This can happen when a tier is selected for a different architecture + // Fall back to software implementation + ArchOpsInstance::SoftwareFallback + } + } +} + +/// Helper function to create ArchOpsInstance from a performance tier for Rust <1.89 (before AVX512 +/// stabilized) +#[rustversion::before(1.89)] +fn create_arch_ops_from_tier(tier: PerformanceTier) -> ArchOpsInstance { + match tier { + #[cfg(target_arch = "aarch64")] + PerformanceTier::AArch64AesSha3 => { + use crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops; + ArchOpsInstance::Aarch64AesSha3(Aarch64AesSha3Ops::new()) + } + #[cfg(target_arch = "aarch64")] + PerformanceTier::AArch64Aes => { + use crate::arch::aarch64::aes::Aarch64AesOps; + ArchOpsInstance::Aarch64Aes(Aarch64AesOps) + } + #[cfg(target_arch = "x86_64")] + PerformanceTier::X86_64Avx512Vpclmulqdq => { + // VPCLMULQDQ and AVX512 not available in older Rust versions, fall back to SSE + create_x86_sse_pclmulqdq_ops() + } + #[cfg(target_arch = "x86_64")] + PerformanceTier::X86_64Avx512Pclmulqdq => { + // AVX512 not available in older Rust versions, fall back to SSE + create_x86_sse_pclmulqdq_ops() + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + PerformanceTier::X86_64SsePclmulqdq | PerformanceTier::X86SsePclmulqdq => { + create_x86_sse_pclmulqdq_ops() + } + PerformanceTier::SoftwareTable => { + // Use software fallback + ArchOpsInstance::SoftwareFallback + } + // Handle cases where the performance tier doesn't match the current architecture + _ => { + // This can happen when a tier is selected for a different architecture + // Fall back to software implementation + ArchOpsInstance::SoftwareFallback + } + } +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn create_x86_sse_pclmulqdq_ops() -> ArchOpsInstance { + use crate::arch::x86::sse::X86SsePclmulqdqOps; + ArchOpsInstance::X86SsePclmulqdq(X86SsePclmulqdqOps) +} +/// Test-specific tier selection that 
works across all architectures for comprehensive testing +#[cfg(test)] +pub fn select_performance_tier_for_test(capabilities: &ArchCapabilities) -> PerformanceTier { + // AArch64 tier selection - SHA3 requires AES to be available + if capabilities.has_sha3 && capabilities.has_aes { + return PerformanceTier::AArch64AesSha3; + } + + if capabilities.has_aes { + return PerformanceTier::AArch64Aes; + } + + // x86_64 tier selection - VPCLMULQDQ requires AVX512VL + if capabilities.has_vpclmulqdq + && capabilities.has_avx512vl + && capabilities.rust_version_supports_avx512 + { + return PerformanceTier::X86_64Avx512Vpclmulqdq; + } + + // AVX512VL requires PCLMULQDQ and SSE4.1 + if capabilities.has_avx512vl + && capabilities.has_pclmulqdq + && capabilities.rust_version_supports_avx512 + { + return PerformanceTier::X86_64Avx512Pclmulqdq; + } + + // PCLMULQDQ requires SSE4.1 + if capabilities.has_pclmulqdq && capabilities.has_sse41 { + return PerformanceTier::X86_64SsePclmulqdq; + } + + // Fallback to software implementation + PerformanceTier::SoftwareTable +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn test_rust_version_check() { + let supports_vpclmulqdq = check_rust_version_supports_avx512(); + + // Should return a boolean without panicking + let _ = supports_vpclmulqdq; + } + + #[test] + fn test_aarch64_tier_selection() { + // Test that aarch64 tier selection follows the expected hierarchy + + // Test SHA3 + AES (highest tier) - NEON is implicit with AES + let capabilities_sha3 = ArchCapabilities { + has_aes: true, + has_sha3: true, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_sha3), + PerformanceTier::AArch64AesSha3 + ); + + // Test AES only (baseline tier) - NEON is implicit with AES + let capabilities_aes = ArchCapabilities { + has_aes: true, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_aes), + PerformanceTier::AArch64Aes + ); + + // Test missing AES (should fall back to software) + let capabilities_no_aes = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_no_aes), + PerformanceTier::SoftwareTable + ); + } + + #[test] + fn test_aarch64_feature_hierarchy() { + // Test that AArch64 feature hierarchy is properly maintained + // NEON is always available on AArch64 and implicit with AES + // AES provides PMULL instructions, SHA3 provides EOR3 + + // Create test capabilities with AES support (NEON is implicit) + let capabilities_with_aes = ArchCapabilities { + has_aes: true, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + + // AES support means we have PMULL instructions available for CRC calculations + assert!(capabilities_with_aes.has_aes); + + // SHA3 requires AES to be available first + let capabilities_with_sha3 = ArchCapabilities { + has_aes: true, + has_sha3: true, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: 
false, + }; + + assert!(capabilities_with_sha3.has_aes); + assert!(capabilities_with_sha3.has_sha3); + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_x86_64_tier_selection() { + // Test that x86_64 tier selection follows the expected hierarchy + + // Test VPCLMULQDQ + AVX512 (highest tier) on Rust 1.89+ + let capabilities_vpclmulqdq = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: true, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_vpclmulqdq), + PerformanceTier::X86_64Avx512Vpclmulqdq + ); + + // Test AVX512 + PCLMULQDQ (mid-tier) on Rust 1.89+ + let capabilities_avx512 = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: false, + rust_version_supports_avx512: true, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_avx512), + PerformanceTier::X86_64Avx512Pclmulqdq + ); + + // Test VPCLMULQDQ + AVX512 (highest tier) on Rust < 1.89 + let capabilities_vpclmulqdq = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_vpclmulqdq), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Test AVX512 + PCLMULQDQ (mid-tier) on Rust < 1.89 + let capabilities_avx512 = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_avx512), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Test SSE + PCLMULQDQ (baseline tier) + let capabilities_sse = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_sse), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Test missing PCLMULQDQ (should fall back to software) + let capabilities_no_pclmul = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_no_pclmul), + PerformanceTier::SoftwareTable + ); + } + + #[test] + #[cfg(target_arch = "x86")] + fn test_x86_tier_selection() { + // Test that x86 (32-bit) tier selection works correctly + + // Test SSE + PCLMULQDQ (only available tier for x86) + let capabilities_sse = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_sse), + PerformanceTier::X86_64SsePclmulqdq + ); + + // For x86 32-bit testing, we need a special case since AVX512VL indicates x86_64 + // Create capabilities without AVX512VL to simulate x86 32-bit + let capabilities_x86_sse = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, // No AVX512 on 32-bit x86 + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + // This should select x86_64 
tier since we're testing the general case + assert_eq!( + select_performance_tier_for_test(&capabilities_x86_sse), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Test missing PCLMULQDQ (should fall back to software) + let capabilities_no_pclmul = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_no_pclmul), + PerformanceTier::SoftwareTable + ); + } + + #[test] + fn test_x86_feature_hierarchy() { + // Test that x86 feature hierarchy is properly maintained + // SSE4.1 is required for PCLMULQDQ + // AVX512VL requires PCLMULQDQ + // VPCLMULQDQ requires AVX512VL and Rust 1.89+ + + // Test feature dependencies are enforced + let capabilities_full = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: true, + }; + + // All x86 features should be available when hierarchy is satisfied + assert!(capabilities_full.has_sse41); + assert!(capabilities_full.has_pclmulqdq); + assert!(capabilities_full.has_avx512vl); + assert!(capabilities_full.has_vpclmulqdq); + assert!(capabilities_full.rust_version_supports_avx512); + } + + #[test] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn test_rust_version_gating() { + // Test that VPCLMULQDQ is properly gated by Rust version + let rust_support = check_rust_version_supports_avx512(); + + // Should return a boolean based on Rust version + // This will be true for Rust 1.89+ and false for earlier versions + assert!(rust_support == true || rust_support == false); + } + + // Mock tests for compile-time and runtime feature agreement scenarios + mod mock_feature_agreement_tests { + use super::*; + + #[test] + fn test_rust_version_gating_scenarios() { + // Test VPCLMULQDQ with different Rust version scenarios + + // All features available but Rust version too old + let capabilities_old_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, // Hardware supports it + rust_version_supports_avx512: false, // But Rust version is too old + }; + + // Should not select VPCLMULQDQ or AVX512 tiers due to Rust version constraint + let tier = select_performance_tier_for_test(&capabilities_old_rust); + assert_ne!(tier, PerformanceTier::X86_64Avx512Vpclmulqdq); + assert_ne!(tier, PerformanceTier::X86_64Avx512Pclmulqdq); + assert_eq!(tier, PerformanceTier::X86_64SsePclmulqdq); + } + + #[test] + fn test_feature_dependency_validation() { + // Test that feature dependencies are properly validated + + // SHA3 without AES should not be possible + let invalid_sha3_caps = ArchCapabilities { + has_aes: false, // Missing required dependency + has_sha3: true, // This should be impossible in real detection + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + + // Should fall back to software since AES is required for SHA3 + assert_eq!( + select_performance_tier_for_test(&invalid_sha3_caps), + PerformanceTier::SoftwareTable + ); + + // VPCLMULQDQ without AVX512VL should not be possible + let invalid_vpclmul_caps = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, // Missing required dependency + 
has_vpclmulqdq: true, // This should be impossible in real detection + rust_version_supports_avx512: true, + }; + + // Should fall back to SSE tier since AVX512VL is required for VPCLMULQDQ + assert_eq!( + select_performance_tier_for_test(&invalid_vpclmul_caps), + PerformanceTier::X86_64SsePclmulqdq + ); + } + } + + // Comprehensive tier selection tests across different hardware configurations + mod tier_selection_comprehensive_tests { + use super::*; + + #[test] + fn test_all_aarch64_tier_combinations() { + // Test all possible AArch64 capability combinations + + // No features - software fallback + let no_features = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&no_features), + PerformanceTier::SoftwareTable + ); + + // AES only - baseline AArch64 tier + let aes_only = ArchCapabilities { + has_aes: true, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&aes_only), + PerformanceTier::AArch64Aes + ); + + // AES + SHA3 - highest AArch64 tier + let aes_sha3 = ArchCapabilities { + has_aes: true, + has_sha3: true, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&aes_sha3), + PerformanceTier::AArch64AesSha3 + ); + } + + #[test] + fn test_all_x86_64_tier_combinations() { + // Test all possible x86_64 capability combinations + + // No features - software fallback + let no_features = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&no_features), + PerformanceTier::SoftwareTable + ); + + // SSE4.1 only - software fallback (PCLMULQDQ required) + let sse_only = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&sse_only), + PerformanceTier::SoftwareTable + ); + + // SSE4.1 + PCLMULQDQ - baseline x86_64 tier + let sse_pclmul = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&sse_pclmul), + PerformanceTier::X86_64SsePclmulqdq + ); + + // SSE4.1 + PCLMULQDQ + AVX512VL but old Rust - should fall back to SSE tier + let avx512_pclmul_old_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, // Old Rust version + }; + assert_eq!( + select_performance_tier_for_test(&avx512_pclmul_old_rust), + PerformanceTier::X86_64SsePclmulqdq + ); + + // SSE4.1 + PCLMULQDQ + AVX512VL with new Rust - mid-tier + let avx512_pclmul_new_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: false, + rust_version_supports_avx512: true, // New Rust version + }; + 
assert_eq!( + select_performance_tier_for_test(&avx512_pclmul_new_rust), + PerformanceTier::X86_64Avx512Pclmulqdq + ); + + // All features + old Rust - should fall back to SSE tier + let all_features_old_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: false, // Old Rust version + }; + assert_eq!( + select_performance_tier_for_test(&all_features_old_rust), + PerformanceTier::X86_64SsePclmulqdq + ); + + // All features + new Rust - highest tier + let all_features_new_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: true, // New Rust version + }; + assert_eq!( + select_performance_tier_for_test(&all_features_new_rust), + PerformanceTier::X86_64Avx512Vpclmulqdq + ); + } + + #[test] + fn test_x86_32bit_tier_combinations() { + // Test x86 (32-bit) capability combinations + + // No features - software fallback + let no_features = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&no_features), + PerformanceTier::SoftwareTable + ); + + // SSE4.1 + PCLMULQDQ - for x86 32-bit testing, we expect x86_64 tier in our test function + // since the test function doesn't distinguish between x86 and x86_64 architectures + let sse_pclmul = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, // AVX512 not available on 32-bit x86 + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + // The test function will return x86_64 tier since it doesn't distinguish architectures + assert_eq!( + select_performance_tier_for_test(&sse_pclmul), + PerformanceTier::X86_64SsePclmulqdq + ); + } + + #[test] + fn test_target_string_consistency() { + // Test that target strings are consistent with selected tiers + + let test_cases = [ + (PerformanceTier::AArch64AesSha3, "aarch64-neon-pmull-sha3"), + (PerformanceTier::AArch64Aes, "aarch64-neon-pmull"), + ( + PerformanceTier::X86_64Avx512Vpclmulqdq, + "x86_64-avx512-vpclmulqdq", + ), + ( + PerformanceTier::X86_64Avx512Pclmulqdq, + "x86_64-avx512-pclmulqdq", + ), + (PerformanceTier::X86_64SsePclmulqdq, "x86_64-sse-pclmulqdq"), + (PerformanceTier::X86SsePclmulqdq, "x86-sse-pclmulqdq"), + (PerformanceTier::SoftwareTable, "software-fallback-tables"), + ]; + + for (tier, expected_string) in test_cases { + assert_eq!(tier_to_target_string(tier), expected_string); + } + } + } + + // Tests for graceful degradation between performance tiers + mod graceful_degradation_tests { + use super::*; + + #[test] + fn test_aarch64_degradation_path() { + // Test the degradation path for AArch64: SHA3+AES -> AES -> Software + + // Start with highest tier capabilities + let mut capabilities = ArchCapabilities { + has_aes: true, + has_sha3: true, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + + // Should select highest tier + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::AArch64AesSha3 + ); + + // Remove SHA3 - should degrade to AES tier + capabilities.has_sha3 = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + 
PerformanceTier::AArch64Aes + ); + + // Remove AES - should degrade to software + capabilities.has_aes = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::SoftwareTable + ); + } + + #[test] + fn test_x86_64_degradation_path() { + // Test the degradation path for x86_64: VPCLMULQDQ -> AVX512 -> SSE -> Software + + // Start with highest tier capabilities + let mut capabilities = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: true, + }; + + // Should select highest tier + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::X86_64Avx512Vpclmulqdq + ); + + // Remove VPCLMULQDQ - should degrade to AVX512 tier + capabilities.has_vpclmulqdq = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::X86_64Avx512Pclmulqdq + ); + + // Remove AVX512VL - should degrade to SSE tier + capabilities.has_avx512vl = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Remove PCLMULQDQ - should degrade to software + capabilities.has_pclmulqdq = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::SoftwareTable + ); + + // Remove SSE4.1 - should still be software (already at lowest tier) + capabilities.has_sse41 = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::SoftwareTable + ); + } + + #[test] + fn test_rust_version_degradation() { + // Test degradation when Rust version doesn't support VPCLMULQDQ + + let capabilities_with_vpclmulqdq = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: false, // Old Rust version + }; + + // Should degrade from VPCLMULQDQ tier to SSE tier due to Rust version + assert_eq!( + select_performance_tier_for_test(&capabilities_with_vpclmulqdq), + PerformanceTier::X86_64SsePclmulqdq + ); + } + + #[test] + fn test_partial_feature_availability() { + // Test scenarios where only some features in a tier are available + + // AArch64: SHA3 available but AES not (impossible in real hardware, but test safety) + let aarch64_partial = ArchCapabilities { + has_aes: false, + has_sha3: true, // This would be impossible in real detection + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + // Should fall back to software since AES is required for SHA3 + assert_eq!( + select_performance_tier_for_test(&aarch64_partial), + PerformanceTier::SoftwareTable + ); + + // x86_64: VPCLMULQDQ available but AVX512VL not (impossible in real hardware) + let x86_64_partial = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, + has_vpclmulqdq: true, // This would be impossible in real detection + rust_version_supports_avx512: true, + }; + // Should fall back to SSE tier since AVX512VL is required for VPCLMULQDQ + assert_eq!( + select_performance_tier_for_test(&x86_64_partial), + PerformanceTier::X86_64SsePclmulqdq + ); + } + } +} + +#[cfg(test)] +mod software_fallback_tests { + use super::*; + #[test] + fn test_aarch64_without_aes_falls_back_to_software() { + // Test that AArch64 without AES support falls back to software implementation + let capabilities_no_aes 
= ArchCapabilities {
+            has_aes: false,  // No AES support
+            has_sha3: false, // SHA3 requires AES, so also false
+            has_sse41: false,
+            has_pclmulqdq: false,
+            has_avx512vl: false,
+            has_vpclmulqdq: false,
+            rust_version_supports_avx512: false,
+        };
+
+        let tier = select_performance_tier_for_test(&capabilities_no_aes);
+        assert_eq!(
+            tier,
+            PerformanceTier::SoftwareTable,
+            "AArch64 without AES should fall back to software implementation"
+        );
+    }
+
+    #[test]
+    fn test_x86_without_pclmulqdq_falls_back_to_software() {
+        // Test that x86 without SSE4.1/PCLMULQDQ falls back to software implementation
+        let capabilities_no_pclmul = ArchCapabilities {
+            has_aes: false,
+            has_sha3: false,
+            has_sse41: true,      // SSE4.1 available
+            has_pclmulqdq: false, // But PCLMULQDQ not available
+            has_avx512vl: false,
+            has_vpclmulqdq: false,
+            rust_version_supports_avx512: false,
+        };
+
+        let tier = select_performance_tier_for_test(&capabilities_no_pclmul);
+        assert_eq!(
+            tier,
+            PerformanceTier::SoftwareTable,
+            "x86 without PCLMULQDQ should fall back to software implementation"
+        );
+
+        // Test x86 without SSE4.1
+        let capabilities_no_sse = ArchCapabilities {
+            has_aes: false,
+            has_sha3: false,
+            has_sse41: false,     // No SSE4.1 support
+            has_pclmulqdq: false, // PCLMULQDQ requires SSE4.1
+            has_avx512vl: false,
+            has_vpclmulqdq: false,
+            rust_version_supports_avx512: false,
+        };
+
+        let tier = select_performance_tier_for_test(&capabilities_no_sse);
+        assert_eq!(
+            tier,
+            PerformanceTier::SoftwareTable,
+            "x86 without SSE4.1 should fall back to software implementation"
+        );
+    }
+
+    #[test]
+    fn test_conditional_compilation_coverage() {
+        // Test that software fallback is properly conditionally compiled
+        // This test ensures the conditional compilation logic is working correctly
+
+        // Software fallback should be available when needed
+        #[cfg(any(
+            not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")),
+            target_arch = "x86",
+            all(target_arch = "aarch64", not(target_feature = "aes"))
+        ))]
+        {
+            // Software fallback should be compiled in these cases
+            use crate::arch::software;
+            let _test_fn = software::update;
+            // This test passes if it compiles successfully
+        }
+
+        // For x86_64, software fallback should not be needed since SSE4.1/PCLMULQDQ are always available
+        // But it may still be compiled for testing purposes
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 21c4935..393edb0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -135,7 +135,7 @@ use crate::crc32::consts::{
     CRC32_ISCSI, CRC32_ISO_HDLC, CRC32_JAMCRC, CRC32_MEF, CRC32_MPEG_2, CRC32_XFER,
 };
 
-#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
 use crate::crc32::fusion;
 
 use crate::crc64::consts::{
@@ -155,6 +155,7 @@ mod consts;
 mod crc32;
 mod crc64;
 mod enums;
+mod feature_detection;
 mod ffi;
 mod generate;
 mod structs;
@@ -782,6 +783,19 @@ pub fn checksum_combine_with_params(
 
 /// Returns the target used to calculate the CRC checksum for the specified algorithm.
 ///
+/// This function provides visibility into the active performance tier being used for CRC calculations.
+/// The target string follows the format `{architecture}-{intrinsics-family}-{intrinsics-features}`,
+/// such as `aarch64-neon-pmull-sha3` or `x86_64-avx512-vpclmulqdq`. 
+///
+/// The performance tier system provides graceful degradation across different hardware capabilities:
+/// - **AArch64**: `aarch64-neon-pmull-sha3` (highest) → `aarch64-neon-pmull` (baseline)
+/// - **x86_64**: `x86_64-avx512-vpclmulqdq` (highest) → `x86_64-avx512-pclmulqdq` (mid) → `x86_64-sse-pclmulqdq` (baseline)
+/// - **x86**: `x86-sse-pclmulqdq` (baseline) → `software-fallback-tables` (fallback)
+/// - **Other architectures**: `software-fallback-tables`
+///
+/// The tier selection is deterministic and consistent across runs on the same hardware,
+/// combining compile-time and runtime feature detection for safety and optimal performance.
+///
 /// These strings are informational only, not stable, and shouldn't be relied on to match across
 /// versions.
 ///
@@ -790,9 +804,17 @@ pub fn checksum_combine_with_params(
 /// use crc_fast::{get_calculator_target, CrcAlgorithm::Crc32IsoHdlc};
 ///
 /// let target = get_calculator_target(Crc32IsoHdlc);
+/// println!("Using performance tier: {}", target);
+/// // Example outputs:
+/// // "aarch64-neon-pmull-sha3" - AArch64 with SHA3 and AES support
+/// // "x86_64-avx512-vpclmulqdq" - x86_64 with VPCLMULQDQ support
+/// // "x86_64-sse-pclmulqdq" - x86_64 baseline with SSE4.1 and PCLMULQDQ
 /// ```
 pub fn get_calculator_target(_algorithm: CrcAlgorithm) -> String {
-    arch::get_target()
+    use crate::feature_detection::get_arch_ops;
+
+    let arch_ops = get_arch_ops();
+    arch_ops.get_target_string()
 }
 
 /// Returns the calculator function and parameters for the specified CRC algorithm.
@@ -834,11 +856,15 @@ fn get_calculator_params(algorithm: CrcAlgorithm) -> (CalculatorFn, CrcParams) {
 #[inline(always)]
 fn crc32_iscsi_calculator(state: u64, data: &[u8], _params: CrcParams) -> u64 {
     // both aarch64 and x86 have native CRC-32/ISCSI support, so we can use fusion
-    #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))]
+    #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))]
     return fusion::crc32_iscsi(state as u32, data) as u64;
 
-    #[cfg(all(not(target_arch = "aarch64"), not(target_arch = "x86_64")))]
-    // fallback to traditional calculation if not aarch64 or x86_64
+    #[cfg(all(
+        not(target_arch = "aarch64"),
+        not(target_arch = "x86_64"),
+        not(target_arch = "x86")
+    ))]
+    // Fallback to traditional calculation for other architectures
     Calculator::calculate(state, data, _params)
 }
 
@@ -945,6 +971,64 @@ mod lib {
         );
     }
 
+    #[test]
+    fn test_get_calculator_target_format() {
+        let target = get_calculator_target(CrcAlgorithm::Crc32IsoHdlc);
+
+        // Target string should not be empty
+        assert!(!target.is_empty());
+
+        // Should follow the expected format with valid architecture prefixes
+        let valid_prefixes = ["aarch64-", "x86_64-", "x86-", "software-"];
+        assert!(
+            valid_prefixes
+                .iter()
+                .any(|prefix| target.starts_with(prefix)),
+            "Target '{}' should start with a valid architecture prefix",
+            target
+        );
+
+        // Should contain intrinsics family and features information
+        let parts: Vec<&str> = target.split('-').collect();
+        assert!(
+            parts.len() >= 3,
+            "Target '{}' should have at least 3 parts: architecture-family-features",
+            target
+        );
+    }
+
+    #[test]
+    fn test_get_calculator_target_consistency() {
+        // Multiple calls should return the same result (deterministic)
+        let target1 = get_calculator_target(CrcAlgorithm::Crc32IsoHdlc);
+        let target2 = get_calculator_target(CrcAlgorithm::Crc32Iscsi);
+        let target3 = get_calculator_target(CrcAlgorithm::Crc64Nvme);
+
+        assert_eq!(
+            target1, target2,
+            "Target should be consistent across 
different CRC-32 algorithms" + ); + assert_eq!( + target1, target3, + "Target should be consistent across CRC-32 and CRC-64 algorithms" + ); + } + + #[test] + fn test_get_calculator_target_uses_cached_detection() { + // This test verifies that the function uses cached feature detection + // by checking that multiple calls are consistent and don't perform + // redundant feature detection + + let target1 = get_calculator_target(CrcAlgorithm::Crc32IsoHdlc); + let target2 = get_calculator_target(CrcAlgorithm::Crc32IsoHdlc); + + assert_eq!( + target1, target2, + "Cached detection should return identical results" + ); + } + #[test] fn test_digest_updates_check() { for config in TEST_ALL_CONFIGS { diff --git a/src/test/future_proof.rs b/src/test/future_proof_tests.rs similarity index 100% rename from src/test/future_proof.rs rename to src/test/future_proof_tests.rs diff --git a/src/test/mod.rs b/src/test/mod.rs index 2b7b216..8b84fcc 100644 --- a/src/test/mod.rs +++ b/src/test/mod.rs @@ -7,7 +7,7 @@ pub(crate) mod consts; pub(crate) mod enums; -mod future_proof; +mod future_proof_tests; mod structs; /// Creates a new aligned data vector from the input slice for testing. diff --git a/tests/checksum_integration_tests.rs b/tests/checksum_integration_tests.rs new file mode 100644 index 0000000..2eab8dd --- /dev/null +++ b/tests/checksum_integration_tests.rs @@ -0,0 +1,237 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +use std::fs; +use std::process::Command; + +#[test] +fn test_benchmark_flag_parsing() { + let output = Command::new("cargo") + .args(&["run", "--bin", "checksum", "--", "-a", "CRC-32/ISCSI", "-b"]) + .output() + .expect("Failed to execute command"); + + assert!( + output.status.success(), + "Command should succeed with -b flag" + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Algorithm: CRC-32/ISCSI")); + assert!(stdout.contains("Throughput:")); + assert!(stdout.contains("GiB/s")); +} + +#[test] +fn test_benchmark_with_size_parameter() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "--size", + "1024", + ]) + .output() + .expect("Failed to execute command"); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Data Size: 1,024 bytes")); +} + +#[test] +fn test_benchmark_with_duration_parameter() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "--duration", + "1.0", + ]) + .output() + .expect("Failed to execute command"); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Duration: 1.")); +} + +#[test] +fn test_benchmark_invalid_size() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "--size", + "0", + ]) + .output() + .expect("Failed to execute command"); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("Size must be greater than 0")); +} + +#[test] +fn test_benchmark_invalid_duration() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "--duration", + "0", + ]) + .output() + .expect("Failed to execute command"); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); 
+ assert!(stderr.contains("Duration must be greater than 0")); +} + +#[test] +fn test_benchmark_with_file_input() { + // Create a temporary test file + let test_file = "test_benchmark_file.txt"; + fs::write(test_file, "Hello, benchmark world!").expect("Failed to create test file"); + + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "-f", + test_file, + "--duration", + "0.5", + ]) + .output() + .expect("Failed to execute command"); + + // Clean up + let _ = fs::remove_file(test_file); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Data Size: 23 bytes")); +} + +#[test] +fn test_benchmark_with_string_input() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "-s", + "test string", + "--duration", + "0.5", + ]) + .output() + .expect("Failed to execute command"); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Data Size: 11 bytes")); +} + +#[test] +fn test_benchmark_different_algorithms() { + let algorithms = ["CRC-32/ISCSI", "CRC-64/NVME"]; + + for algorithm in &algorithms { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + algorithm, + "-b", + "--duration", + "0.5", + ]) + .output() + .expect("Failed to execute command"); + + assert!( + output.status.success(), + "Algorithm {} should work", + algorithm + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains(&format!("Algorithm: {}", algorithm))); + } +} + +#[test] +fn test_benchmark_size_without_benchmark_flag() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "--size", + "1024", + ]) + .output() + .expect("Failed to execute command"); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("--size and --duration can only be used with -b flag")); +} + +#[test] +fn test_benchmark_nonexistent_file() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "-f", + "nonexistent_file.txt", + ]) + .output() + .expect("Failed to execute command"); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("File not found")); +} From d9b8291c3bf1d9350fdd2cb0c095a826a0c92fa1 Mon Sep 17 00:00:00 2001 From: Don MacAskill Date: Thu, 30 Oct 2025 13:03:57 -0700 Subject: [PATCH 2/2] =?UTF-8?q?Improve=20docs=20(from=20GitHub=20Copilot?= =?UTF-8?q?=E2=80=99s=20suggestions)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs | 2 ++ src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs | 3 +++ src/crc32/fusion/x86/mod.rs | 6 +++--- src/feature_detection.rs | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs b/src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs index 4bd0c42..b3bc0ba 100644 --- a/src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs +++ b/src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs @@ -11,6 +11,8 @@ /// ./generate -i avx512 -p crc32c -a v4s3x3 /// /// Modified as necessary for this Rust implementation. 
+///
+/// Uses AVX-512 instructions, so only available since Rust 1.89 (when AVX-512 stabilized)
 #[rustversion::since(1.89)]
 #[inline]
 #[target_feature(enable = "avx512vl,pclmulqdq")]
diff --git a/src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs b/src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs
index 1b1f2cd..56a7cb0 100644
--- a/src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs
+++ b/src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs
@@ -11,6 +11,9 @@
 /// ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2
 ///
 /// Modified as necessary for this Rust implementation.
+///
+/// Uses AVX-512 VPCLMULQDQ instructions, so only available since Rust 1.89 (when AVX-512
+/// stabilized)
 #[rustversion::since(1.89)]
 #[inline]
 #[target_feature(enable = "avx512vl,vpclmulqdq")]
diff --git a/src/crc32/fusion/x86/mod.rs b/src/crc32/fusion/x86/mod.rs
index bea8c9e..a8043b3 100644
--- a/src/crc32/fusion/x86/mod.rs
+++ b/src/crc32/fusion/x86/mod.rs
@@ -28,8 +28,7 @@ use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
 
-/// Safe wrapper for CRC32 iSCSI calculation using AVX-512
-/// CRC32 iSCSI calculation for Rust versions before 1.89
+/// CRC32 iSCSI calculation for Rust versions before 1.89 (pre-AVX-512 support)
 ///
 ///
 /// This function is called by the wrapper layer after feature detection has been performed.
@@ -42,7 +41,8 @@ pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 {
     unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) }
 }
 
-/// CRC32 iSCSI calculation using the highest available instruction set
+/// CRC32 iSCSI calculation using the highest available instruction set on Rust 1.89+
+/// (with AVX-512 support)
 ///
 /// This function is called by the wrapper layer after feature detection has been performed.
 /// The wrapper layer ensures that only the appropriate implementation is called based on
diff --git a/src/feature_detection.rs b/src/feature_detection.rs
index d77a8fb..b170a74 100644
--- a/src/feature_detection.rs
+++ b/src/feature_detection.rs
@@ -100,7 +100,7 @@ unsafe fn detect_arch_capabilities() -> ArchCapabilities {
 unsafe fn detect_aarch64_features() -> ArchCapabilities {
     use std::arch::is_aarch64_feature_detected;
 
-    // AES is available on essentially all Aarch64 CPUs and provides the PMULL instructions
+    // AES is available on essentially all AArch64 CPUs and provides the PMULL instructions
     let has_aes = is_aarch64_feature_detected!("aes");
 
     // SHA3 is available on modern Aarch64 CPUs, and provides the EOR3 instruction for efficient