diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 180d113..3b93e76 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -6,14 +6,45 @@ on: workflow_dispatch: jobs: - test-accelerated: - name: Test accelerated (aarch64, x86_64) + test-aarch64: + name: Test aarch64 strategy: matrix: - os: [ubuntu-latest, ubuntu-22.04-arm, ubuntu-24.04-arm, macos-latest] + os: [ubuntu-22.04-arm, ubuntu-24.04-arm, macos-14, macos-15, macos-26, macos-latest, windows-11-arm] rust-toolchain: - "1.81" # minimum for this crate - - "1.89" # when VPCLMULQDQ was stabilized + - "1.89" # when AVX-512 VPCLMULQDQ was stabilized + - "stable" + - "nightly" + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 # not pinning to commit hash since this is a GitHub action, which we trust + - uses: actions-rust-lang/setup-rust-toolchain@9d7e65c320fdb52dcd45ffaa68deb6c02c8754d9 # v1.12.0 + with: + toolchain: ${{ matrix.rust-toolchain }} + components: rustfmt, clippy + cache-key: ${{ matrix.os }}-${{ matrix.rust-toolchain }} + - name: Check + run: cargo check + - name: Architecture check + run: cargo run --bin arch-check + - if: ${{ matrix.rust-toolchain != 'nightly' }} + name: Format + run: cargo fmt -- --check + - if: ${{ matrix.rust-toolchain != 'nightly' }} + name: Clippy + run: cargo clippy + - name: Test + run: cargo test + + test-x86_64: + name: Test x86_64 + strategy: + matrix: + os: [ ubuntu-latest, ubuntu-22.04, ubuntu-24.04, macos-13, macos-15-intel, windows-2022, windows-2025, windows-latest ] + rust-toolchain: + - "1.81" # minimum for this crate + - "1.89" # when AVX-512 VPCLMULQDQ was stabilized - "stable" - "nightly" runs-on: ${{ matrix.os }} @@ -38,14 +69,14 @@ jobs: run: cargo test test-x86: - name: Test accelerated (x86) + name: Test x86 runs-on: ubuntu-latest strategy: matrix: target: [i586-unknown-linux-gnu, i686-unknown-linux-gnu] rust-toolchain: - "1.81" # minimum for this crate - - "1.89" # when VPCLMULQDQ was stabilized + - "1.89" # when AVX-512 VPCLMULQDQ was stabilized - "stable" - "nightly" steps: @@ -71,7 +102,7 @@ jobs: target: [powerpc-unknown-linux-gnu, powerpc64-unknown-linux-gnu] rust-toolchain: - "1.81" # minimum for this crate - - "1.89" # when VPCLMULQDQ was stabilized + - "1.89" # when AVX-512 VPCLMULQDQ was stabilized - "stable" - "nightly" steps: diff --git a/.gitignore b/.gitignore index 24335e6..d97ced5 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ /test/test_*.bin .idea .DS_Store -.git \ No newline at end of file +.git +.vscode \ No newline at end of file diff --git a/.kiro/specs/checksum-benchmark-option/design.md b/.kiro/specs/checksum-benchmark-option/design.md new file mode 100644 index 0000000..26cf154 --- /dev/null +++ b/.kiro/specs/checksum-benchmark-option/design.md @@ -0,0 +1,202 @@ +# Design Document + +## Overview + +This design extends the existing `bin/checksum.rs` tool with benchmark functionality through a new `-b` flag. The benchmark mode will measure CRC performance using either user-provided data (files/strings) or randomly generated data, reporting throughput in GiB/s along with the acceleration target used. + +The design maintains backward compatibility while adding a clean benchmark interface that leverages existing patterns from the `benches/benchmark.rs` implementation. 
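+
+As a concrete illustration, here are a few hypothetical invocations (flag spellings follow the Architecture section below; the binary name is an assumption):
+
+```
+checksum -a CRC-32/ISCSI -b                            # defaults: 1 MiB random data, 10 s
+checksum -a CRC-64/NVME -b --size 4194304              # 4 MiB of random data
+checksum -a CRC-32/ISCSI -b -f big.bin --duration 5.0  # file I/O + checksum stack
+```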
+
+## Architecture
+
+### Command Line Interface
+
+The tool will extend the existing argument parsing to support:
+- `-b`: Enable benchmark mode
+- `--size <bytes>`: Specify data size for random generation (when no file/string provided)
+- `--duration <seconds>`: Benchmark duration as floating-point seconds (default: 10.0)
+- Existing `-a <algorithm>`: CRC algorithm (required in benchmark mode)
+- Existing `-f <file>` or `-s <string>`: Optional data source for benchmarking
+
+### Data Flow
+
+```
+User Input → Argument Parsing → Mode Detection → Benchmark Execution → Results Display
+                                      ↓
+                           [Normal Checksum Mode]
+                                      ↓
+                           [Existing Functionality]
+```
+
+In benchmark mode:
+1. Parse and validate benchmark parameters
+2. Determine data source (file, string, or generated)
+3. For string/generated data, load or generate the test data once; for file data, use the file path directly
+4. Run the benchmark loop for the specified duration using the appropriate checksum function
+5. Calculate and display results
+
+## Components and Interfaces
+
+### Enhanced Config Structure
+
+```rust
+#[derive(Debug)]
+struct Config {
+    algorithm: String,
+    file: Option<String>,
+    string: Option<String>,
+    format: OutputFormat,
+    benchmark: Option<BenchmarkConfig>,
+}
+
+#[derive(Debug)]
+struct BenchmarkConfig {
+    size: Option<u64>,
+    duration: f64,
+}
+```
+
+### Benchmark Execution Module
+
+```rust
+enum BenchmarkData {
+    InMemory(Vec<u8>),
+    File(String),
+}
+
+struct BenchmarkRunner {
+    algorithm: CrcAlgorithm,
+    data: BenchmarkData,
+    duration: f64,
+}
+
+impl BenchmarkRunner {
+    fn new(algorithm: CrcAlgorithm, data: BenchmarkData, duration: f64) -> Self
+    fn run(&self) -> BenchmarkResult
+}
+
+struct BenchmarkResult {
+    iterations: u64,
+    elapsed_seconds: f64,
+    throughput_gibs: f64,
+    time_per_iteration_nanos: f64,
+    acceleration_target: String,
+    data_size: u64,
+}
+```
+
+### Data Generation
+
+The benchmark will reuse the random data generation pattern from `benches/benchmark.rs`:
+
+```rust
+fn generate_random_data(size: usize) -> Vec<u8> {
+    use rand::RngCore; // fill_bytes() comes from the RngCore trait
+    let mut rng = rand::rng();
+    let mut buf = vec![0u8; size];
+    rng.fill_bytes(&mut buf);
+    buf
+}
+```
+
+## Data Models
+
+### Input Data Sources
+
+1. **File Input**: Use the `checksum_file()` function to benchmark the entire file I/O and checksum stack
+2. **String Input**: Use the string bytes directly with the in-memory `checksum()` function
+3. **Generated Data**: Create random data of the specified size using `rand::RngCore::fill_bytes()` and use the in-memory `checksum()` function
+
+### Benchmark Metrics
+
+- **Iterations**: Number of checksum calculations performed
+- **Elapsed Time**: Actual benchmark duration in seconds
+- **Throughput**: Calculated as `(data_size * iterations) / elapsed_time / (1024^3)` GiB/s; see the worked example below
+- **Acceleration Target**: Result from `crc_fast::get_calculator_target(algorithm)`
+
+## Error Handling
+
+### Validation Errors
+
+- Invalid algorithm names (reuse existing validation)
+- Invalid size parameters (non-positive values)
+- Invalid duration parameters (non-positive values)
+- File read errors (reuse existing error handling)
+
+### Runtime Errors
+
+- Memory allocation failures for large data sizes
+- Timer precision issues (fall back to alternative timing methods)
+
+### Error Messages
+
+All errors will follow the existing pattern of displaying the error message followed by usage information.
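+
+## Worked Example: Metric Calculation
+
+To make the Benchmark Metrics above concrete, this minimal sketch derives the reported numbers from the raw loop measurements (the function name is hypothetical; the formulas are the ones defined under Benchmark Metrics):
+
+```rust
+/// Derive throughput (GiB/s) and time per iteration (nanoseconds) from the
+/// raw measurements captured by the timing loop in Implementation Notes.
+fn compute_metrics(iterations: u64, elapsed_seconds: f64, data_size: u64) -> (f64, f64) {
+    let bytes_processed = data_size as f64 * iterations as f64;
+    // GiB/s: bytes per second divided by 1024^3
+    let throughput_gibs = bytes_processed / elapsed_seconds / (1024.0 * 1024.0 * 1024.0);
+    // Average wall-clock cost of a single checksum call
+    let time_per_iteration_nanos = elapsed_seconds * 1e9 / iterations as f64;
+    (throughput_gibs, time_per_iteration_nanos)
+}
+```
+
+For example, 467,661 iterations over 1 MiB in 10.00 seconds works out to about 45.67 GiB/s and roughly 21.4 μs per iteration, matching the sample output under Implementation Notes.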
+
+## Testing Strategy
+
+### Unit Tests
+
+- Argument parsing validation for benchmark flags
+- BenchmarkConfig creation and validation
+- Data generation with various sizes
+- Throughput calculation accuracy
+
+### Integration Tests
+
+- End-to-end benchmark execution with different algorithms
+- File and string input handling in benchmark mode
+- Error handling for invalid parameters
+- Backward compatibility verification
+
+### Performance Validation
+
+- Verify benchmark results are reasonable (within expected ranges)
+- Compare with existing `benches/benchmark.rs` results for consistency
+- Test with various data sizes to ensure linear scaling
+
+## Implementation Notes
+
+### Timing Mechanism
+
+Use `std::time::Instant` for high-precision timing, with different approaches for different data sources:
+
+```rust
+let start = std::time::Instant::now();
+let mut iterations = 0u64;
+
+while start.elapsed().as_secs_f64() < duration {
+    match &self.data {
+        BenchmarkData::InMemory(data) => {
+            std::hint::black_box(checksum(algorithm, data));
+        }
+        BenchmarkData::File(filename) => {
+            std::hint::black_box(checksum_file(algorithm, filename, None).unwrap());
+        }
+    }
+    iterations += 1;
+}
+
+let elapsed = start.elapsed().as_secs_f64();
+```
+
+### Memory Considerations
+
+- Pre-allocate test data once before the benchmark loop
+- Use `std::hint::black_box()` to prevent compiler optimizations
+- Consider memory alignment for optimal performance (optional enhancement)
+
+### Output Format
+
+```
+Algorithm: CRC-32/ISCSI
+Acceleration Target: aarch64-neon-sha3
+Data Size: 1,048,576 bytes (1.0 MiB)
+Duration: 10.00 seconds
+Iterations: 467,661
+Throughput: 45.67 GiB/s
+Time per iteration: 21.4 μs
+```
+
+### Default Values
+
+- **Size**: 1,048,576 bytes (1 MiB)
+- **Duration**: 10.0 seconds
+- **Algorithm**: Must be specified via `-a` flag (no default)
\ No newline at end of file
diff --git a/.kiro/specs/checksum-benchmark-option/requirements.md b/.kiro/specs/checksum-benchmark-option/requirements.md
new file mode 100644
index 0000000..4da3195
--- /dev/null
+++ b/.kiro/specs/checksum-benchmark-option/requirements.md
@@ -0,0 +1,52 @@
+# Requirements Document
+
+## Introduction
+
+This feature adds a simple benchmark option to the existing `bin/checksum.rs` tool via a command-line flag. The benchmark will allow users to test performance across different platforms using a single binary, reporting throughput in GiB/s and the acceleration target used. This enables cross-platform performance comparison without requiring a full development environment checkout.
+
+## Glossary
+
+- **Checksum_Tool**: The existing `bin/checksum.rs` binary application
+- **Benchmark_Mode**: A new operational mode that measures and reports performance metrics
+- **Acceleration_Target**: The hardware-specific optimization path returned by `get_calculator_target()`
+- **Throughput_Metric**: Performance measurement expressed in GiB/s (gibibytes per second)
+- **Test_Data**: Randomly generated byte array used for benchmark measurements
+- **Black_Box**: Rust's `std::hint::black_box()` function that prevents compiler optimizations during benchmarking
+
+## Requirements
+
+### Requirement 1
+
+**User Story:** As a developer, I want to run performance benchmarks from the checksum tool, so that I can compare CRC performance across different hardware platforms without setting up a full development environment.
+
+#### Acceptance Criteria
+
+1. WHEN the user provides a `-b` flag, THE Checksum_Tool SHALL enter Benchmark_Mode
+2.
WHILE in Benchmark_Mode, THE Checksum_Tool SHALL generate Test_Data of the specified size once and reuse it for all iterations +3. THE Checksum_Tool SHALL report Throughput_Metric in GiB/s format +4. THE Checksum_Tool SHALL display the Acceleration_Target used for the benchmark +5. THE Checksum_Tool SHALL use Black_Box to prevent compiler optimizations during measurement + +### Requirement 2 + +**User Story:** As a developer, I want to specify benchmark parameters, so that I can control the test conditions for consistent cross-platform comparisons. + +#### Acceptance Criteria + +1. WHEN the user provides `-a` parameter with `-b` flag, THE Checksum_Tool SHALL use the specified CRC algorithm for benchmarking +2. WHEN the user provides `--size` parameter, THE Checksum_Tool SHALL generate Test_Data of the specified byte size +3. WHEN the user provides `--duration` parameter, THE Checksum_Tool SHALL run the benchmark for the specified number of seconds +4. WHERE no benchmark parameters are provided, THE Checksum_Tool SHALL use default values of 1 MiB for size and 10 seconds for duration +5. THE Checksum_Tool SHALL validate all benchmark parameter values before starting the benchmark + +### Requirement 3 + +**User Story:** As a developer, I want the benchmark to support both file/string input and generated data, so that I can benchmark with specific data or use random data for consistent testing. + +#### Acceptance Criteria + +1. WHEN the user provides `-b` with `-f` or `-s` flags, THE Checksum_Tool SHALL use the file or string content as Test_Data for benchmarking +2. WHEN the user provides `-b` with `--size` parameter but no `-f` or `-s` flags, THE Checksum_Tool SHALL generate random Test_Data of the specified size +3. IF the user provides `-b` without any data source or size specification, THEN THE Checksum_Tool SHALL generate random Test_Data using the default size +4. THE Checksum_Tool SHALL display appropriate usage information when benchmark parameters are invalid +5. THE Checksum_Tool SHALL maintain backward compatibility with existing checksum functionality \ No newline at end of file diff --git a/.kiro/specs/checksum-benchmark-option/tasks.md b/.kiro/specs/checksum-benchmark-option/tasks.md new file mode 100644 index 0000000..492dd08 --- /dev/null +++ b/.kiro/specs/checksum-benchmark-option/tasks.md @@ -0,0 +1,49 @@ +# Implementation Plan + +- [x] 1. Extend command line argument parsing for benchmark options + - Add `-b` flag to enable benchmark mode in the argument parser + - Add `--size` parameter for specifying random data size + - Add `--duration` parameter for benchmark duration (floating-point seconds) + - Update the `Config` struct to include optional `BenchmarkConfig` + - Update usage/help text to include new benchmark options + - _Requirements: 1.1, 2.1, 2.2, 2.3, 2.4_ + +- [x] 2. Implement benchmark data structures and validation + - Create `BenchmarkConfig` struct with size and duration fields + - Create `BenchmarkData` enum to handle in-memory vs file data sources + - Create `BenchmarkRunner` struct with algorithm, data, and duration + - Create `BenchmarkResult` struct with all metrics including time per iteration + - Add validation logic for benchmark parameters (positive values) + - _Requirements: 2.5, 3.4_ + +- [x] 3. 
Implement benchmark execution logic + - Create benchmark runner with timing loop using `std::time::Instant` + - Implement separate execution paths for in-memory data vs file data + - Use `std::hint::black_box()` to prevent compiler optimizations + - Calculate throughput in GiB/s and time per iteration with appropriate units + - Integrate `get_calculator_target()` for acceleration target reporting + - _Requirements: 1.2, 1.3, 1.4, 1.5_ + +- [x] 4. Implement data source handling + - Add random data generation function using `rand::RngCore::fill_bytes()` + - Implement logic to determine data source (file, string, or generated) + - Handle file size detection for throughput calculations + - Create `BenchmarkData` instances based on user input + - _Requirements: 3.1, 3.2, 3.3_ + +- [x] 5. Integrate benchmark mode into main application flow + - Modify main function to detect benchmark mode and route accordingly + - Ensure mutual exclusivity validation between benchmark and normal modes + - Add benchmark result formatting and display + - Update error handling to include benchmark-specific errors + - Maintain backward compatibility with existing functionality + - _Requirements: 3.4, 3.5_ + +- [x] 6. Add comprehensive testing for benchmark functionality + - Write unit tests for argument parsing with benchmark flags + - Test benchmark parameter validation (invalid sizes, durations) + - Test data source selection logic (file vs string vs generated) + - Test benchmark execution with different algorithms + - Verify throughput calculation accuracy + - Test error handling for invalid benchmark configurations + - _Requirements: All requirements_ \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index f6ff87c..e48ecb2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "aho-corasick" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" dependencies = [ "memchr", ] @@ -19,9 +19,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.6.20" +version = "0.6.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", "anstyle-parse", @@ -34,9 +34,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.11" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" [[package]] name = "anstyle-parse" @@ -53,7 +53,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" dependencies = [ - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -64,7 +64,7 @@ checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -75,9 +75,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "bitflags" -version = "2.9.3" +version = "2.10.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34efbcccd345379ca2868b2b2c9d3782e9cc58ba87bc7d79d5b53d9c9ae6f25d" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" [[package]] name = "block-buffer" @@ -102,9 +102,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cbindgen" -version = "0.29.0" +version = "0.29.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "975982cdb7ad6a142be15bdf84aea7ec6a9e5d4d797c004d43185b24cfe4e684" +checksum = "befbfd072a8e81c02f8c507aefce431fe5e7d051f83d48a23ffc9b9fe5a11799" dependencies = [ "clap", "heck", @@ -121,9 +121,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "ciborium" @@ -154,18 +154,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.46" +version = "4.5.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c5e4fcf9c21d2e544ca1ee9d8552de13019a42aa7dbf32747fa7aaf1df76e57" +checksum = "4c26d721170e0295f191a69bd9a1f93efcdb0aff38684b61ab5750468972e5f5" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.46" +version = "4.5.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fecb53a0e6fcfb055f686001bc2e2592fa527efaf38dbe81a6a9563562e57d41" +checksum = "75835f0c7bf681bfd05abe44e965760fea999a5286c6eb2d59883634fd02011a" dependencies = [ "anstream", "anstyle", @@ -175,9 +175,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" +checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "colorchoice" @@ -208,6 +208,7 @@ dependencies = [ "crc", "criterion", "digest", + "indexmap", "rand", "regex", "rustversion", @@ -311,12 +312,12 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -327,9 +328,9 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "generic-array" -version = "0.14.7" +version = "0.14.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2" dependencies = [ "typenum", "version_check", @@ -337,31 +338,32 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", "r-efi", - "wasi", + "wasip2", ] [[package]] name = "half" 
-version = "2.6.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", + "zerocopy", ] [[package]] name = "hashbrown" -version = "0.15.5" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" [[package]] name = "heck" @@ -371,9 +373,9 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "indexmap" -version = "2.11.0" +version = "2.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2481980430f9f78649238835720ddccc57e52df14ffce1c6f37391d61b563e9" +checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" dependencies = [ "equivalent", "hashbrown", @@ -381,9 +383,9 @@ dependencies = [ [[package]] name = "is_terminal_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" @@ -402,9 +404,9 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "js-sys" -version = "0.3.77" +version = "0.3.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" dependencies = [ "once_cell", "wasm-bindgen", @@ -412,27 +414,27 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.175" +version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" [[package]] name = "linux-raw-sys" -version = "0.9.4" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "log" -version = "0.4.27" +version = "0.4.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" [[package]] name = "memchr" -version = "2.7.5" +version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" [[package]] name = "num-traits" @@ -451,9 +453,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "once_cell_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "oorandom" @@ 
-500,18 +502,18 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.101" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.40" +version = "1.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" dependencies = [ "proc-macro2", ] @@ -573,9 +575,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.2" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -585,9 +587,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.10" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -596,21 +598,21 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.6" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "rustix" -version = "1.0.8" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ "bitflags", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -636,18 +638,28 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.219" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -656,23 +668,24 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.143" +version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a" +checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" dependencies = [ "itoa", "memchr", "ryu", "serde", + "serde_core", ] 
[[package]] name = "serde_spanned" -version = "0.6.9" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +checksum = "e24345aa0fe688594e73770a5f6d1b216508b4f93484c0026d521acd30134392" dependencies = [ - "serde", + "serde_core", ] [[package]] @@ -683,9 +696,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.106" +version = "2.0.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" dependencies = [ "proc-macro2", "quote", @@ -694,15 +707,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.21.0" +version = "3.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ "fastrand", "getrandom", "once_cell", "rustix", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -717,56 +730,54 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.23" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +checksum = "f0dc8b1fb61449e27716ec0e1bdf0f6b8f3e8f6b05391e8497b8b6d7804ea6d8" dependencies = [ - "serde", + "indexmap", + "serde_core", "serde_spanned", "toml_datetime", - "toml_edit", + "toml_parser", + "toml_writer", + "winnow", ] [[package]] name = "toml_datetime" -version = "0.6.11" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +checksum = "f2cdb639ebbc97961c51720f858597f7f24c4fc295327923af55b74c3c724533" dependencies = [ - "serde", + "serde_core", ] [[package]] -name = "toml_edit" -version = "0.22.27" +name = "toml_parser" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +checksum = "c0cbe268d35bdb4bb5a56a2de88d0ad0eb70af5384a99d648cd4b3d04039800e" dependencies = [ - "indexmap", - "serde", - "serde_spanned", - "toml_datetime", - "toml_write", "winnow", ] [[package]] -name = "toml_write" -version = "0.1.2" +name = "toml_writer" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" +checksum = "df8b2b54733674ad286d16267dcfc7a71ed5c776e4ac7aa3c3e2561f7c637bf2" [[package]] name = "typenum" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" [[package]] name = "unicode-ident" -version = "1.0.18" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" [[package]] name = "utf8parse" @@ -791,45 +802,32 @@ dependencies = [ ] [[package]] -name = "wasi" -version = "0.14.3+wasi-0.2.4" +name = "wasip2" +version = 
"1.0.1+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a51ae83037bdd272a9e28ce236db8c07016dd0d50c27038b3f407533c030c95" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" dependencies = [ "wit-bindgen", ] [[package]] name = "wasm-bindgen" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60" dependencies = [ "cfg-if", "once_cell", "rustversion", "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" -dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -837,31 +835,31 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc" dependencies = [ + "bumpalo", "proc-macro2", "quote", "syn", - "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76" dependencies = [ "unicode-ident", ] [[package]] name = "web-sys" -version = "0.3.77" +version = "0.3.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +checksum = "3a1f95c0d03a47f4ae1f7a64643a6bb97465d9b740f0fa8f90ea33915c99a9a1" dependencies = [ "js-sys", "wasm-bindgen", @@ -869,18 +867,18 @@ dependencies = [ [[package]] name = "winapi-util" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0978bf7171b3d90bac376700cb56d606feb40f251a475a5d6634613564460b22" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys", + "windows-sys 0.61.2", ] [[package]] name = "windows-link" -version = "0.1.3" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" [[package]] name = "windows-sys" @@ -891,11 +889,20 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-targets" -version = "0.53.3" 
+version = "0.53.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ "windows-link", "windows_aarch64_gnullvm", @@ -910,81 +917,78 @@ dependencies = [ [[package]] name = "windows_aarch64_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" [[package]] name = "windows_aarch64_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" [[package]] name = "windows_i686_gnu" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" [[package]] name = "windows_i686_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" [[package]] name = "windows_i686_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" [[package]] name = "windows_x86_64_gnu" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" [[package]] name = "windows_x86_64_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" [[package]] name = "windows_x86_64_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" -dependencies = [ - "memchr", -] [[package]] name = "wit-bindgen" -version = "0.45.0" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "052283831dbae3d879dc7f51f3d92703a316ca49f91540417d38591826127814" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" [[package]] name = "zerocopy" -version = "0.8.26" +version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" +checksum = 
"0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.26" +version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" +checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 7108c11..128aef2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,9 +22,12 @@ bench = true crc = "3" digest = { version = "0.10", features = ["alloc"] } rand = "0.9" -regex = "1.11" +regex = "1.12" rustversion = "1.0" +# constrain indexmap (transitive) to a version compatible with Rust 1.81.0 +indexmap = { version = ">=2.11.0, <2.12.0", optional = true } + [dev-dependencies] criterion = "0.7" cbindgen = "0.29" diff --git a/README.md b/README.md index 368c157..75e9499 100644 --- a/README.md +++ b/README.md @@ -339,73 +339,62 @@ This is a summary of the performance for the most important and popular CRC chec AKA `crc32c` in many, but not all, implementations. -| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | -|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~49 | ~111 | -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~18 | ~52 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~23 | ~54 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~20 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~19 | ~39 | -| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~17 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~49 | ~99 | -| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~56 | ~94 | +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~52 | ~111 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~24 | ~54 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~21 | ~53 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~11 | ~17 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~49 | ~99 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~56 | ~94 | ### CRC-32/ISO-HDLC (reflected) AKA `crc32` in many, but not all, implementations. 
-| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
-|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:|
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-248xl | avx512-vpclmulqdq | ~24 | ~110 |
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-248xl | sse-pclmulqdq | ~21 | ~28 |
-| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~24 | ~55 |
-| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~12 | ~14 |
-| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~19 | ~39 |
-| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~17 |
-| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~48 | ~98 |
-| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~56 | ~94 |
+| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
+|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:|
+| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~20 | ~88 |
+| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~15 | ~55 |
+| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~21 | ~53 |
+| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~11 | ~17 |
+| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~48 | ~98 |
+| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~56 | ~94 |
 
 ### CRC-64/NVME (reflected)
 
 [AWS S3's recommended checksum option](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html)
 
-| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
-|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:|
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~25 | ~110 |
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~21 | ~28 |
-| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~25 | ~55 |
-| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~14 |
-| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~20 | ~37 |
-| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~16 |
-| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~50 | ~72 |
-| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~52 | ~72 |
+| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
+|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:|
+| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~21 | ~110 |
+| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~16 | ~55 |
+| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~21 | ~41 |
+| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~11 | ~16 |
+| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~50 | ~72 |
+| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~52 | ~72 |
 
 ### CRC-32/BZIP2 (forward)
 
-| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) |
-|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:|
-| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl |
avx512-vpclmulqdq | ~23 | ~56 | -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~19 | ~28 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~21 | ~43 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~13 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~16 | ~32 | -| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~9 | ~14 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~41 | ~59 | -| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~47 | ~64 | +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~20 | ~56 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~14 | ~43 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~18 | ~40 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~9 | ~14 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~41 | ~59 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~47 | ~64 | ### CRC-64/ECMA-182 (forward) -| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | -|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~24 | ~56 | -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~19 | ~28 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~21 | ~43 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~13 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~18 | ~31 | -| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~9 | ~14 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~40 | ~59 | -| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~46 | ~61 | - +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq | ~21 | ~56 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq | ~14 | ~43 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-pmull-eor3 | ~19 | ~40 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pmull | ~9 | ~14 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-pmull-eor3 | ~40 | ~59 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-pmull-eor3 | ~46 | ~61 | ## Other CRC widths @@ -422,6 +411,7 @@ PRs welcome! ## References +* [Catalogue of parametrised CRC algorithms](https://reveng.sourceforge.io/crc-catalogue/all.htm) * [crc32-fast](https://crates.io/crates/crc32fast) Original `CRC-32/ISO-HDLC` (`crc32`) implementation in `Rust`. * [crc64-fast](https://github.com/tikv/crc64fast) Original `CRC-64/XZ` implementation in `Rust`. * [crc64fast-nvme](https://github.com/awesomized/crc64fast-nvme) Original `CRC-64/NVME` implementation in `Rust`. 
diff --git a/benches/benchmark.rs b/benches/benchmark.rs index ecec9d8..82ee13a 100644 --- a/benches/benchmark.rs +++ b/benches/benchmark.rs @@ -31,11 +31,12 @@ pub const SIZES: &[(&str, i32); 2] = &[ ]; // these are the most important algorithms in popular use, with forward/reflected coverage -pub const CRC32_ALGORITHMS: &[CrcAlgorithm; 3] = &[ +pub const CRC32_ALGORITHMS: &[CrcAlgorithm; 4] = &[ // benchmark both CRC-32/ISCSI and CRC-32/ISO-HDLC since they're special flowers with lots of // different acceleration targets. - CrcAlgorithm::Crc32Iscsi, // reflected - CrcAlgorithm::Crc32IsoHdlc, // reflected + CrcAlgorithm::Crc32Autosar, // reflected + CrcAlgorithm::Crc32Iscsi, // reflected, fusion + CrcAlgorithm::Crc32IsoHdlc, // reflected, fusion CrcAlgorithm::Crc32Bzip2, // forward ]; diff --git a/src/algorithm.rs b/src/algorithm.rs index fe1459b..e3a83b9 100644 --- a/src/algorithm.rs +++ b/src/algorithm.rs @@ -53,12 +53,7 @@ fn extract_keys_array(params: CrcParams) -> [u64; 23] { } /// Main entry point that works for both CRC-32 and CRC-64 -#[inline] -#[cfg_attr( - any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse3,sse4.1,pclmulqdq") -)] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] +#[inline(always)] pub unsafe fn update( state: W::Value, bytes: &[u8], diff --git a/src/arch/aarch64.rs b/src/arch/aarch64/aes.rs similarity index 86% rename from src/arch/aarch64.rs rename to src/arch/aarch64/aes.rs index 884ed82..8b9abb7 100644 --- a/src/arch/aarch64.rs +++ b/src/arch/aarch64/aes.rs @@ -1,16 +1,22 @@ // Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. -//! This module provides AArch64-specific implementations of the ArchOps trait. +//! This module provides AArch64-specific implementations of the ArchOps trait for architectures +//! with AES support. 
#![cfg(target_arch = "aarch64")] use crate::traits::ArchOps; use std::arch::aarch64::*; +// Tier-specific ArchOps implementations for AArch64 + +/// Base AArch64 implementation with AES+NEON optimizations +/// NEON is implicit with AES support on AArch64 #[derive(Debug, Copy, Clone)] -pub struct AArch64Ops; +pub struct Aarch64AesOps; -impl ArchOps for AArch64Ops { +// Base implementation for AArch64 AES tier +impl ArchOps for Aarch64AesOps { type Vector = uint8x16_t; #[inline] @@ -56,7 +62,6 @@ impl ArchOps for AArch64Ops { #[target_feature(enable = "neon")] unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { let x = vreinterpretq_u64_u8(vector); - [vgetq_lane_u64(x, 0), vgetq_lane_u64(x, 1)] } @@ -64,7 +69,6 @@ impl ArchOps for AArch64Ops { #[target_feature(enable = "neon")] unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { let x = vreinterpretq_p64_u8(vector); - [vgetq_lane_p64(x, 0), vgetq_lane_p64(x, 1)] } @@ -89,7 +93,6 @@ impl ArchOps for AArch64Ops { #[inline] #[target_feature(enable = "neon")] unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector { - // AArch64 uses vqtbl1q_u8 for byte shuffle vqtbl1q_u8(data, mask) } @@ -101,16 +104,13 @@ impl ArchOps for AArch64Ops { b: Self::Vector, mask: Self::Vector, ) -> Self::Vector { - // AArch64 needs explicit MSB mask creation and uses vbslq_u8 let msb_mask = vcltq_s8(vreinterpretq_s8_u8(mask), vdupq_n_s8(0)); - vbslq_u8(msb_mask, b, a) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 with 0 for shifting vextq_u8(vdupq_n_u8(0), vector, 8) } @@ -123,7 +123,6 @@ impl ArchOps for AArch64Ops { #[inline] #[target_feature(enable = "neon")] unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector { - // Create a mask based on MSB for AArch64 vcltq_s8(vreinterpretq_s8_u8(vector), vdupq_n_s8(0)) } @@ -136,14 +135,12 @@ impl ArchOps for AArch64Ops { #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 4) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 with 0 for shifting vextq_u8(vdupq_n_u8(0), vector, 12) // 16-4=12 } @@ -158,56 +155,48 @@ impl ArchOps for AArch64Ops { // For high=false, place in the low 32 bits of the low 64 bits result = vreinterpretq_u64_u32(vsetq_lane_u32(value, vreinterpretq_u32_u64(result), 0)); } - vreinterpretq_u8_u64(result) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 with 0 for shifting left vextq_u8(vdupq_n_u8(0), vector, 12) // 16-4=12 } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting right vextq_u8(vector, vdupq_n_u8(0), 4) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 8) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_5(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 5) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_6(&self, vector: Self::Vector) -> 
Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 6) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 7) } #[inline] #[target_feature(enable = "neon")] unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { - // AArch64 uses vextq_u8 for shifting vextq_u8(vector, vdupq_n_u8(0), 12) } @@ -216,7 +205,6 @@ impl ArchOps for AArch64Ops { unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { let low_32 = vgetq_lane_u32(vreinterpretq_u32_u8(vector), 0); let result = vsetq_lane_u32(low_32, vdupq_n_u32(0), 3); - vreinterpretq_u8_u32(result) } @@ -232,7 +220,6 @@ impl ArchOps for AArch64Ops { #[inline] #[target_feature(enable = "aes")] unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { - // Low 64 bits of a, high 64 bits of b let a_low = vgetq_lane_p64(vreinterpretq_p64_u8(a), 1); let b_high = vgetq_lane_p64(vreinterpretq_p64_u8(b), 0); vreinterpretq_u8_p128(vmull_p64(a_low, b_high)) @@ -257,19 +244,6 @@ impl ArchOps for AArch64Ops { } #[inline] - #[cfg(target_feature = "sha3")] - #[target_feature(enable = "sha3")] - unsafe fn xor3_vectors( - &self, - a: Self::Vector, - b: Self::Vector, - c: Self::Vector, - ) -> Self::Vector { - veor3q_u8(a, b, c) - } - - #[inline] - #[cfg(not(target_feature = "sha3"))] #[target_feature(enable = "neon")] unsafe fn xor3_vectors( &self, @@ -277,13 +251,11 @@ impl ArchOps for AArch64Ops { b: Self::Vector, c: Self::Vector, ) -> Self::Vector { - // Fallback for when SHA3 is not available veorq_u8(veorq_u8(a, b), c) } } -impl AArch64Ops { - // Helper methods specific to AArch64 +impl Aarch64AesOps { #[inline] #[target_feature(enable = "neon")] unsafe fn load_key_pair(&self, idx1: u64, idx2: u64) -> uint8x16_t { diff --git a/src/arch/aarch64/aes_sha3.rs b/src/arch/aarch64/aes_sha3.rs new file mode 100644 index 0000000..bc3d365 --- /dev/null +++ b/src/arch/aarch64/aes_sha3.rs @@ -0,0 +1,201 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides AArch64-specific implementations of the ArchOps trait for architectures +//! with AES+SHA3 support. 
+ +#![cfg(target_arch = "aarch64")] + +use crate::arch::aarch64::aes::Aarch64AesOps; +use crate::traits::ArchOps; +use std::arch::aarch64::*; + +/// AArch64 AES+SHA3 tier - delegates to AES tier and overrides XOR3 operations +/// Provides EOR3 instruction for optimal XOR3 performance +#[derive(Debug, Copy, Clone)] +pub struct Aarch64AesSha3Ops(Aarch64AesOps); + +impl Aarch64AesSha3Ops { + #[inline(always)] + pub fn new() -> Self { + Self(Aarch64AesOps) + } +} + +// SHA3 tier implementation - delegates to AES tier and overrides XOR3 +impl ArchOps for Aarch64AesSha3Ops { + type Vector = uint8x16_t; + + // Delegate methods to the base AES implementation + #[inline(always)] + unsafe fn create_vector_from_u64_pair( + &self, + high: u64, + low: u64, + reflected: bool, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair(high, low, reflected) + } + + #[inline(always)] + unsafe fn create_vector_from_u64_pair_non_reflected( + &self, + high: u64, + low: u64, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair_non_reflected(high, low) + } + + #[inline(always)] + unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { + self.0.create_vector_from_u64(value, high) + } + + #[inline(always)] + unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_u64s(vector) + } + + #[inline(always)] + unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_poly64s(vector) + } + + #[inline(always)] + unsafe fn xor_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.xor_vectors(a, b) + } + + #[inline(always)] + unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { + self.0.load_bytes(ptr) + } + + #[inline(always)] + unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { + self.0.load_aligned(ptr) + } + + #[inline(always)] + unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector { + self.0.shuffle_bytes(data, mask) + } + + #[inline(always)] + unsafe fn blend_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + mask: Self::Vector, + ) -> Self::Vector { + self.0.blend_vectors(a, b, mask) + } + + #[inline(always)] + unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_8(vector) + } + + #[inline(always)] + unsafe fn set_all_bytes(&self, value: u8) -> Self::Vector { + self.0.set_all_bytes(value) + } + + #[inline(always)] + unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector { + self.0.create_compare_mask(vector) + } + + #[inline(always)] + unsafe fn and_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.and_vectors(a, b) + } + + #[inline(always)] + unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_32(vector) + } + + #[inline(always)] + unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_32(vector) + } + + #[inline(always)] + unsafe fn create_vector_from_u32(&self, value: u32, high: bool) -> Self::Vector { + self.0.create_vector_from_u32(value, high) + } + + #[inline(always)] + unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_4(vector) + } + + #[inline(always)] + unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_4(vector) + } + + #[inline(always)] + unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_8(vector) + } + + #[inline(always)] + unsafe fn 
shift_right_5(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_5(vector) + } + + #[inline(always)] + unsafe fn shift_right_6(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_6(vector) + } + + #[inline(always)] + unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_7(vector) + } + + #[inline(always)] + unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_12(vector) + } + + #[inline(always)] + unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_12(vector) + } + + #[inline(always)] + unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_00(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_01(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_10(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_11(a, b) + } + + // Override XOR3 to use SHA3 EOR3 instruction when available + #[inline] + #[target_feature(enable = "sha3")] + unsafe fn xor3_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + c: Self::Vector, + ) -> Self::Vector { + // SHA3 tier always uses EOR3 instruction + // Feature detection is handled at the dispatch level + veor3q_u8(a, b, c) + } +} diff --git a/src/arch/aarch64/mod.rs b/src/arch/aarch64/mod.rs new file mode 100644 index 0000000..4f04674 --- /dev/null +++ b/src/arch/aarch64/mod.rs @@ -0,0 +1,8 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides AArch64-specific implementations of the ArchOps trait. + +#![cfg(target_arch = "aarch64")] + +pub mod aes; +pub mod aes_sha3; diff --git a/src/arch/mod.rs b/src/arch/mod.rs index 669ba2c..aef965d 100644 --- a/src/arch/mod.rs +++ b/src/arch/mod.rs @@ -3,167 +3,173 @@ //! This module provides the main entry point for the SIMD CRC calculation. //! //! It dispatches to the appropriate architecture-specific implementation -//! based on the target architecture. 
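A note on the new shape of this module (commentary, not part of the diff): instead of probing CPU features on every call, the dispatch below matches on a cached tier from the new src/feature_detection module, which is not included in this diff. A minimal, self-contained sketch of that detect-once-then-dispatch pattern, with hypothetical names standing in for the crate's ArchOpsInstance/get_arch_ops:

```rust
// Hypothetical sketch only; the real tiers and names live in the crate's
// src/feature_detection module, which this diff does not show.
use std::sync::OnceLock;

#[derive(Debug, Clone, Copy)]
#[allow(dead_code)]
enum Tier {
    Simd,     // stands in for the per-arch accelerated tiers
    Software, // table-based fallback
}

fn detect() -> Tier {
    #[cfg(target_arch = "aarch64")]
    if std::arch::is_aarch64_feature_detected!("aes") {
        return Tier::Simd;
    }
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    if is_x86_feature_detected!("sse4.1") && is_x86_feature_detected!("pclmulqdq") {
        return Tier::Simd;
    }
    Tier::Software
}

fn tier() -> &'static Tier {
    static TIER: OnceLock<Tier> = OnceLock::new();
    TIER.get_or_init(detect) // feature probing happens exactly once
}

fn main() {
    match tier() {
        Tier::Simd => println!("carryless-multiply path"),
        Tier::Software => println!("software fallback"),
    }
}
```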
#[cfg(target_arch = "aarch64")] use std::arch::is_aarch64_feature_detected; -#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] -use crate::algorithm; - use crate::CrcParams; -#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] -use crate::structs::{Width32, Width64}; - #[cfg(target_arch = "aarch64")] -use aarch64::AArch64Ops; +use crate::arch::aarch64::aes::Aarch64AesOps; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use x86::X86Ops; +#[cfg(target_arch = "aarch64")] +use crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops; -#[rustversion::since(1.89)] -#[cfg(target_arch = "x86_64")] -use vpclmulqdq::Vpclmulqdq512Ops; +#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] +use crate::{ + algorithm, + structs::{Width32, Width64}, +}; -mod aarch64; -mod software; -mod vpclmulqdq; -mod x86; +pub mod aarch64; +pub mod software; +pub mod x86; +pub mod x86_64; /// Main entry point that dispatches to the appropriate architecture /// -/// /// # Safety /// May use native CPU features -#[inline] +#[inline(always)] #[cfg(target_arch = "aarch64")] -#[target_feature(enable = "aes")] pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - let ops = AArch64Ops; + use crate::feature_detection::{get_arch_ops, ArchOpsInstance}; + + match get_arch_ops() { + ArchOpsInstance::Aarch64AesSha3(ops) => update_aarch64_aes_sha3(state, bytes, params, *ops), + ArchOpsInstance::Aarch64Aes(ops) => update_aarch64_aes(state, bytes, params, *ops), + ArchOpsInstance::SoftwareFallback => { + if !is_aarch64_feature_detected!("aes") || !is_aarch64_feature_detected!("neon") { + #[cfg(any(not(target_feature = "aes"), not(target_feature = "neon")))] + { + // Use software implementation when no SIMD support is available + return crate::arch::software::update(state, bytes, params); + } + } - match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => algorithm::update::(state as u32, bytes, params, &ops) as u64, - _ => panic!("Unsupported CRC width: {}", params.width), + // This should likely never happen, but just in case + panic!("aarch64 features missing (NEON and/or AES)"); + } } } -#[rustversion::before(1.89)] -#[inline] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] -pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - update_x86_sse(state, bytes, params) -} - -#[rustversion::since(1.89)] #[inline] -#[cfg(target_arch = "x86")] -#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] -pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - update_x86_sse(state, bytes, params) +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "aes")] +unsafe fn update_aarch64_aes( + state: u64, + bytes: &[u8], + params: CrcParams, + ops: Aarch64AesOps, +) -> u64 { + match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, &ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, &ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + } } -#[rustversion::since(1.89)] #[inline] -#[cfg(target_arch = "x86_64")] -#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] -pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - use std::arch::is_x86_feature_detected; - - if bytes.len() >= 256 - && is_x86_feature_detected!("vpclmulqdq") - && is_x86_feature_detected!("avx512f") - && 
is_x86_feature_detected!("avx512vl") - { - let ops = Vpclmulqdq512Ops::new(); - - return match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => algorithm::update::(state as u32, bytes, params, &ops) - as u64, - _ => panic!("Unsupported CRC width: {}", params.width), - }; +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "aes,sha3")] +unsafe fn update_aarch64_aes_sha3( + state: u64, + bytes: &[u8], + params: CrcParams, + ops: Aarch64AesSha3Ops, +) -> u64 { + match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, &ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, &ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), } - - // fallback to the standard x86 SSE implementation - update_x86_sse(state, bytes, params) } -#[inline] -#[cfg(all( - not(target_arch = "x86"), - not(target_arch = "x86_64"), - not(target_arch = "aarch64") -))] +/// Main entry point for x86/x86_64 (Rust 1.89+ which supports AVX-512) +/// +/// # Safety +/// May use native CPU features +#[rustversion::since(1.89)] +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - software::update(state, bytes, params) -} + use crate::feature_detection::{get_arch_ops, ArchOpsInstance}; -#[inline] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] -unsafe fn update_x86_sse(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - let ops = X86Ops; + match get_arch_ops() { + #[cfg(target_arch = "x86_64")] + ArchOpsInstance::X86_64Avx512Vpclmulqdq(ops) => match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + }, + #[cfg(target_arch = "x86_64")] + ArchOpsInstance::X86_64Avx512Pclmulqdq(ops) => match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + }, + ArchOpsInstance::X86SsePclmulqdq(ops) => match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + }, + ArchOpsInstance::SoftwareFallback => { + #[cfg(target_arch = "x86")] + crate::arch::x86_software_update(state, bytes, params); - match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => algorithm::update::(state as u32, bytes, params, &ops) as u64, - _ => panic!("Unsupported CRC width: {}", params.width), + // This should never happen, but just in case + panic!("x86 features missing (SSE4.1 && PCLMULQDQ)"); + } } } +/// Main entry point for x86/x86_64 (Rust < 1.89 with no AVX-512 support) +/// +/// # Safety +/// May use native CPU features #[rustversion::before(1.89)] -pub fn get_target() -> String { - #[cfg(target_arch = "aarch64")] - { - if is_aarch64_feature_detected!("sha3") { - return "aarch64-neon-eor3-pclmulqdq".to_string(); - } +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + use crate::feature_detection::{get_arch_ops, ArchOpsInstance}; - 
"aarch64-neon-pclmulqdq".to_string() + match get_arch_ops() { + ArchOpsInstance::X86SsePclmulqdq(ops) => match params.width { + 64 => algorithm::update::<_, Width64>(state, bytes, params, ops), + 32 => algorithm::update::<_, Width32>(state as u32, bytes, params, ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + }, + ArchOpsInstance::SoftwareFallback => x86_software_update(state, bytes, params), } - - #[allow(unreachable_code)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - return "x86-sse-pclmulqdq".to_string(); - - #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))] - return "software-fallback-tables".to_string(); } -#[rustversion::since(1.89)] -pub fn get_target() -> String { - #[cfg(target_arch = "aarch64")] - { - if is_aarch64_feature_detected!("sha3") { - return "aarch64-neon-eor3-pclmulqdq".to_string(); - } - - "aarch64-neon-pclmulqdq".to_string() - } - - #[cfg(target_arch = "x86_64")] - { - if is_x86_feature_detected!("vpclmulqdq") - && is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("avx512vl") +#[inline(always)] +#[allow(unused)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn x86_software_update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + if !is_x86_feature_detected!("sse4.1") || !is_x86_feature_detected!("pclmulqdq") { + #[cfg(all( + target_arch = "x86", + any(not(target_feature = "sse4.1"), not(target_feature = "pclmulqdq")) + ))] { - return "x86_64-avx512-vpclmulqdq".to_string(); - } - - if is_x86_feature_detected!("avx2") { - return "x86_64-avx2-pclmulqdq".to_string(); + // Use software implementation when no SIMD support is available + crate::arch::software::update(state, bytes, params); } } - #[allow(unreachable_code)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - return "x86-sse-pclmulqdq".to_string(); + // This should never happen, but just in case + panic!("x86 features missing (SSE4.1 && PCLMULQDQ)"); +} - #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))] - return "software-fallback-tables".to_string(); +#[inline] +#[cfg(all( + not(target_arch = "x86"), + not(target_arch = "x86_64"), + not(target_arch = "aarch64") +))] +pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + crate::arch::software::update(state, bytes, params) } #[cfg(test)] diff --git a/src/arch/software.rs b/src/arch/software.rs index 4739a1b..9cac286 100644 --- a/src/arch/software.rs +++ b/src/arch/software.rs @@ -1,66 +1,102 @@ // Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. //! This module contains a software fallback for unsupported architectures. - -#![cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] +//! +//! Software fallback is conditionally compiled based on target architecture: +//! - Always included for non-SIMD architectures (not x86/x86_64/aarch64) +//! - Included for x86 when SSE4.1/PCLMULQDQ may not be available +//! - Included for aarch64 for runtime fallback when AES is not detected +//! 
- Excluded for x86_64, where SSE4.1/PCLMULQDQ are assumed to be available at runtime (but included for testing) + +#![cfg(any( + // Non-aarch64/x86/x86_64 architectures always need software fallback + not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")), + // x86 may not have SSE4.1/PCLMULQDQ support + all(target_arch = "x86", any(not(target_feature = "sse4.1"), not(target_feature = "pclmulqdq"))), + // aarch64 needs software fallback for runtime detection when AES is not available... + // NEON doesn't guarantee AES, so for rare outlier CPUs this might not work 100%... + all(target_arch = "aarch64", not(target_feature = "aes")), + // Include for testing on all architectures + test +))] use crate::consts::CRC_64_NVME; use crate::CrcAlgorithm; use crate::CrcParams; use crc::{Algorithm, Table}; +#[allow(unused)] const RUST_CRC32_AIXM: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_AIXM); +#[allow(unused)] const RUST_CRC32_AUTOSAR: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_AUTOSAR); +#[allow(unused)] const RUST_CRC32_BASE91_D: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_BASE91_D); +#[allow(unused)] const RUST_CRC32_BZIP2: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_BZIP2); +#[allow(unused)] const RUST_CRC32_CD_ROM_EDC: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_CD_ROM_EDC); +#[allow(unused)] const RUST_CRC32_CKSUM: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_CKSUM); +#[allow(unused)] const RUST_CRC32_ISCSI: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_ISCSI); +#[allow(unused)] const RUST_CRC32_ISO_HDLC: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_ISO_HDLC); +#[allow(unused)] const RUST_CRC32_JAMCRC: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_JAMCRC); +#[allow(unused)] const RUST_CRC32_MEF: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_MEF); +#[allow(unused)] const RUST_CRC32_MPEG_2: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_MPEG_2); +#[allow(unused)] const RUST_CRC32_XFER: crc::Crc<u32, Table<16>> = crc::Crc::<u32, Table<16>>::new(&crc::CRC_32_XFER); +#[allow(unused)] const RUST_CRC64_ECMA_182: crc::Crc<u64, Table<16>> = crc::Crc::<u64, Table<16>>::new(&crc::CRC_64_ECMA_182); +#[allow(unused)] const RUST_CRC64_GO_ISO: crc::Crc<u64, Table<16>> = crc::Crc::<u64, Table<16>>::new(&crc::CRC_64_GO_ISO); +#[allow(unused)] const RUST_CRC64_MS: crc::Crc<u64, Table<16>> = crc::Crc::<u64, Table<16>>::new(&crc::CRC_64_MS); +#[allow(unused)] const RUST_CRC64_NVME: crc::Crc<u64, Table<16>> = crc::Crc::<u64, Table<16>>::new(&CRC_64_NVME); +#[allow(unused)] const RUST_CRC64_REDIS: crc::Crc<u64, Table<16>> = crc::Crc::<u64, Table<16>>::new(&crc::CRC_64_REDIS); +#[allow(unused)] const RUST_CRC64_WE: crc::Crc<u64, Table<16>> = crc::Crc::<u64, Table<16>>::new(&crc::CRC_64_WE); +#[allow(unused)] const RUST_CRC64_XZ: crc::Crc<u64, Table<16>> = crc::Crc::<u64, Table<16>>::new(&crc::CRC_64_XZ); +#[allow(unused)] // Dispatch function that handles the generic case pub(crate) fn update(state: u64, data: &[u8], params: CrcParams) -> u64 { match params.width { @@ -80,7 +116,7 @@ pub(crate) fn update(state: u64, data: &[u8], params: CrcParams) -> u64 { CrcAlgorithm::Crc32Xfer => RUST_CRC32_XFER, CrcAlgorithm::Crc32Custom => { let algorithm: Algorithm<u32> = Algorithm { - width: params.width as u8, + width: params.width, poly: params.poly as u32, init: params.init as u32, refin: params.refin, @@ -110,13 +146,13 @@ pub(crate) fn update(state: u64, data: &[u8], params: CrcParams) -> u64 { CrcAlgorithm::Crc64Xz => RUST_CRC64_XZ, CrcAlgorithm::Crc64Custom => { let algorithm: Algorithm<u64> = Algorithm { - width: params.width as u8, - poly: params.poly as u64, - init: params.init as u64, + width: params.width, + poly: params.poly, + init: params.init, refin: params.refin, refout: params.refout, - xorout: params.xorout as u64, - check: params.check as u64, + xorout: params.xorout, + check: 
params.check, residue: 0x0000000000000000, // unused in this context }; diff --git a/src/arch/x86/mod.rs b/src/arch/x86/mod.rs new file mode 100644 index 0000000..bead067 --- /dev/null +++ b/src/arch/x86/mod.rs @@ -0,0 +1,7 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides x86-specific implementations of the ArchOps trait. + +#![cfg(any(target_arch = "x86", target_arch = "x86_64"))] + +pub mod sse; diff --git a/src/arch/x86.rs b/src/arch/x86/sse.rs similarity index 85% rename from src/arch/x86.rs rename to src/arch/x86/sse.rs index 08a8b4a..adaf8b8 100644 --- a/src/arch/x86.rs +++ b/src/arch/x86/sse.rs @@ -4,7 +4,7 @@ //! //! This module is designed to work with both x86 and x86_64 architectures. //! -//! It uses the SSE2 and SSE4.1 instruction sets for SIMD operations. +//! It uses the SSE4.1 instruction set for SIMD operations. #![cfg(any(target_arch = "x86", target_arch = "x86_64"))] @@ -16,10 +16,15 @@ use std::arch::x86_64::*; use crate::traits::ArchOps; +// Tier-specific ArchOps implementations for x86/x86_64 + +/// Base x86/x86_64 SSE+PCLMULQDQ implementation - baseline performance for both architectures +/// Uses SSE4.1 and PCLMULQDQ instructions #[derive(Debug, Copy, Clone)] -pub struct X86Ops; +pub struct X86SsePclmulqdqOps; -impl ArchOps for X86Ops { +// Base implementation for x86/x86_64 SSE+PCLMULQDQ tier +impl ArchOps for X86SsePclmulqdqOps { type Vector = __m128i; #[inline] @@ -52,7 +57,6 @@ impl ArchOps for X86Ops { #[inline] #[target_feature(enable = "sse4.1")] unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { - // x86 uses custom helper self.create_u64_vector(value, high) } @@ -78,21 +82,18 @@ impl ArchOps for X86Ops { #[inline] #[target_feature(enable = "sse2")] unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { - // x86 requires cast to __m128i* _mm_loadu_si128(ptr as *const __m128i) } #[inline] #[target_feature(enable = "sse2")] unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { - // x86 requires cast to __m128i* _mm_loadu_si128(ptr as *const __m128i) } #[inline] #[target_feature(enable = "ssse3")] unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector { - // x86 uses specific SSSE3 instruction _mm_shuffle_epi8(data, mask) } @@ -104,14 +105,12 @@ impl ArchOps for X86Ops { b: Self::Vector, mask: Self::Vector, ) -> Self::Vector { - // x86 has native blend that uses MSB automatically _mm_blendv_epi8(a, b, mask) } #[inline] #[target_feature(enable = "sse2")] unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { - // x86 has a dedicated shift instruction _mm_slli_si128(vector, 8) } @@ -227,23 +226,6 @@ impl ArchOps for X86Ops { _mm_clmulepi64_si128(a, b, 0x11) } - #[rustversion::since(1.89)] - #[inline] - #[target_feature(enable = "sse4.1")] - unsafe fn xor3_vectors( - &self, - a: Self::Vector, - b: Self::Vector, - c: Self::Vector, - ) -> Self::Vector { - if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { - return self.xor3_vectors_avx512(a, b, c); - } - - self.xor3_vectors_sse(a, b, c) - } - - #[rustversion::before(1.89)] #[inline] #[target_feature(enable = "sse4.1")] unsafe fn xor3_vectors( &self, a: Self::Vector, b: Self::Vector, c: Self::Vector, ) -> Self::Vector { - self.xor3_vectors_sse(a, b, c) + // SSE tier always uses two XOR operations + _mm_xor_si128(_mm_xor_si128(a, b), c) } } -impl X86Ops { - // Helper methods specific to x86/x86_64 +impl 
X86SsePclmulqdqOps { #[inline] #[target_feature(enable = "sse2")] unsafe fn set_epi64x(&self, e1: u64, e0: u64) -> __m128i { @@ -318,20 +300,4 @@ impl X86Ops { lo | (hi << 32) } } - - #[rustversion::since(1.89)] - #[inline] - #[target_feature(enable = "avx512f,avx512vl")] - unsafe fn xor3_vectors_avx512(&self, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - _mm_ternarylogic_epi64( - a, b, c, 0x96, // XOR3 - ) - } - - #[inline] - #[target_feature(enable = "sse4.1")] - unsafe fn xor3_vectors_sse(&self, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - // x86 doesn't have native XOR3 in SSE, use two XORs - _mm_xor_si128(_mm_xor_si128(a, b), c) - } } diff --git a/src/arch/x86_64/avx512.rs b/src/arch/x86_64/avx512.rs new file mode 100644 index 0000000..d26ec2f --- /dev/null +++ b/src/arch/x86_64/avx512.rs @@ -0,0 +1,220 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides x86_64-specific implementations of the ArchOps trait. +//! +//! It uses the AVX-512 instruction sets for SIMD operations. + +#![cfg(target_arch = "x86_64")] + +#[rustversion::since(1.89)] +use std::arch::x86_64::*; + +#[rustversion::since(1.89)] +use crate::arch::x86::sse::X86SsePclmulqdqOps; +#[rustversion::since(1.89)] +use crate::traits::ArchOps; + +/// x86_64-only AVX512+PCLMULQDQ tier - delegates to SSE tier and overrides XOR3 operations +/// Uses AVX512 ternary logic for XOR3 operations with PCLMULQDQ +#[rustversion::since(1.89)] +#[derive(Debug, Copy, Clone)] +pub struct X86_64Avx512PclmulqdqOps(X86SsePclmulqdqOps); + +#[rustversion::since(1.89)] +impl X86_64Avx512PclmulqdqOps { + #[inline(always)] + pub fn new() -> Self { + Self(X86SsePclmulqdqOps) + } +} + +#[rustversion::since(1.89)] +impl ArchOps for X86_64Avx512PclmulqdqOps { + type Vector = __m128i; + + // Delegate all methods to the base SSE implementation + #[inline(always)] + unsafe fn create_vector_from_u64_pair( + &self, + high: u64, + low: u64, + reflected: bool, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair(high, low, reflected) + } + + #[inline(always)] + unsafe fn create_vector_from_u64_pair_non_reflected( + &self, + high: u64, + low: u64, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair_non_reflected(high, low) + } + + #[inline(always)] + unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { + self.0.create_vector_from_u64(value, high) + } + + #[inline(always)] + unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_u64s(vector) + } + + #[inline(always)] + unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_poly64s(vector) + } + + #[inline(always)] + unsafe fn xor_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.xor_vectors(a, b) + } + + #[inline(always)] + unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { + self.0.load_bytes(ptr) + } + + #[inline(always)] + unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { + self.0.load_aligned(ptr) + } + + #[inline(always)] + unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector { + self.0.shuffle_bytes(data, mask) + } + + #[inline(always)] + unsafe fn blend_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + mask: Self::Vector, + ) -> Self::Vector { + self.0.blend_vectors(a, b, mask) + } + + #[inline(always)] + unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_8(vector) + } + + #[inline(always)] + unsafe fn 
set_all_bytes(&self, value: u8) -> Self::Vector { + self.0.set_all_bytes(value) + } + + #[inline(always)] + unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector { + self.0.create_compare_mask(vector) + } + + #[inline(always)] + unsafe fn and_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.and_vectors(a, b) + } + + #[inline(always)] + unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_32(vector) + } + + #[inline(always)] + unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_32(vector) + } + + #[inline(always)] + unsafe fn create_vector_from_u32(&self, value: u32, high: bool) -> Self::Vector { + self.0.create_vector_from_u32(value, high) + } + + #[inline(always)] + unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_4(vector) + } + + #[inline(always)] + unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_4(vector) + } + + #[inline(always)] + unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_8(vector) + } + + #[inline(always)] + unsafe fn shift_right_5(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_5(vector) + } + + #[inline(always)] + unsafe fn shift_right_6(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_6(vector) + } + + #[inline(always)] + unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_7(vector) + } + + #[inline(always)] + unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_12(vector) + } + + #[inline(always)] + unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_12(vector) + } + + #[inline(always)] + unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_00(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_01(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_10(a, b) + } + + #[inline(always)] + unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_11(a, b) + } + + #[rustversion::since(1.89)] + #[inline] + #[target_feature(enable = "avx512vl")] + unsafe fn xor3_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + c: Self::Vector, + ) -> Self::Vector { + // AVX512 tier always uses ternary logic + _mm_ternarylogic_epi64(a, b, c, 0x96) // XOR3 operation + } + + // Fallback for older Rust versions + #[rustversion::before(1.89)] + #[inline(always)] + unsafe fn xor3_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + c: Self::Vector, + ) -> Self::Vector { + // Rust < 1.89 doesn't have _mm_ternarylogic_epi64, fall back to SSE + self.0.xor3_vectors(a, b, c) + } +} diff --git a/src/arch/x86_64/avx512_vpclmulqdq.rs b/src/arch/x86_64/avx512_vpclmulqdq.rs new file mode 100644 index 0000000..881ca7e --- /dev/null +++ b/src/arch/x86_64/avx512_vpclmulqdq.rs @@ -0,0 +1,630 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides AVX-512 and VPCLMULQDQ-specific implementations of the ArchOps trait. +//! +//! It performs folding using 4 x ZMM registers of 512-bits each. 
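One piece of background worth noting (an aside, not part of the diff): the 0x96 immediate that accompanies every ternary-logic intrinsic in this file is simply the eight-entry truth table of a three-way XOR, indexed by the bit pattern (a << 2) | (b << 1) | c. A quick self-contained check:

```rust
// Illustrative check (not from this diff): derive the ternary-logic
// immediate for XOR3 from its truth table; it must come out to 0x96.
fn main() {
    let mut imm: u8 = 0;
    for idx in 0..8u8 {
        let (a, b, c) = ((idx >> 2) & 1, (idx >> 1) & 1, idx & 1);
        imm |= (a ^ b ^ c) << idx; // bit `idx` holds the output for inputs a, b, c
    }
    assert_eq!(imm, 0x96);
    println!("XOR3 ternary-logic immediate = {imm:#04x}");
}
```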
+ +#![cfg(target_arch = "x86_64")] + +#[rustversion::since(1.89)] +use crate::arch::x86::sse::X86SsePclmulqdqOps; + +#[rustversion::since(1.89)] +use crate::enums::Reflector; + +#[rustversion::since(1.89)] +use crate::structs::CrcState; + +#[rustversion::since(1.89)] +use crate::traits::{ArchOps, EnhancedCrcWidth}; + +#[rustversion::since(1.89)] +use std::arch::x86_64::*; + +#[rustversion::since(1.89)] +use std::ops::BitXor; + +/// Implements the ArchOps trait using AVX-512 and VPCLMULQDQ instructions on 512-bit ZMM registers. +/// Delegates to X86SsePclmulqdqOps for standard 128-bit operations +#[rustversion::since(1.89)] +#[derive(Debug, Copy, Clone)] +pub struct X86_64Avx512VpclmulqdqOps(X86SsePclmulqdqOps); + +#[rustversion::since(1.89)] +impl X86_64Avx512VpclmulqdqOps { + #[inline(always)] + pub fn new() -> Self { + Self(X86SsePclmulqdqOps) + } +} + +// Wrapper for __m512i to make it easier to work with +#[rustversion::since(1.89)] +#[derive(Debug, Copy, Clone)] +struct Simd512(__m512i); + +#[rustversion::since(1.89)] +impl Simd512 { + #[inline] + #[target_feature(enable = "avx512f")] + #[allow(clippy::too_many_arguments)] + unsafe fn new(x7: u64, x6: u64, x5: u64, x4: u64, x3: u64, x2: u64, x1: u64, x0: u64) -> Self { + Self(_mm512_set_epi64( + x7 as i64, x6 as i64, x5 as i64, x4 as i64, x3 as i64, x2 as i64, x1 as i64, x0 as i64, + )) + } + + #[inline] + #[target_feature(enable = "avx512vl,vpclmulqdq")] + unsafe fn fold_64(&self, coeff: &Self, new_data: &Self) -> Self { + // Use 512-bit ternary logic XOR3 with carryless multiplication + Self(_mm512_ternarylogic_epi64( + _mm512_clmulepi64_epi128(self.0, coeff.0, 0), // Low parts + _mm512_clmulepi64_epi128(self.0, coeff.0, 17), // High parts + new_data.0, + 0x96, // XOR3 operation + )) + } + + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn extract_u64s(&self) -> [u64; 8] { + let mut result = [0u64; 8]; + _mm512_storeu_si512(result.as_mut_ptr().cast(), self.0); + + result + } + + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn load_from_ptr(ptr: *const u8) -> Self { + Self(_mm512_loadu_si512(ptr as *const __m512i)) + } + + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn to_128i_extract<const INDEX: i32>(self) -> __m128i { + _mm512_extracti32x4_epi32(self.0, INDEX) + } + + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn xor(&self, other: &Self) -> Self { + Self(_mm512_xor_si512(self.0, other.0)) + } + + #[inline] + #[target_feature(enable = "avx512f")] + #[allow(unused)] + unsafe fn print_hex(&self, prefix: &str) { + let values = self.extract_u64s(); + println!( + "{}={:#016x}_{:016x}_{:016x}_{:016x}_{:016x}_{:016x}_{:016x}_{:016x}", + prefix, + values[7], + values[6], + values[5], + values[4], + values[3], + values[2], + values[1], + values[0] + ); + } +} + +#[rustversion::since(1.89)] +impl X86_64Avx512VpclmulqdqOps { + /// Process aligned blocks using VPCLMULQDQ with 4 x 512-bit registers + /// + /// Note that #[inline(always)] loses the inlining performance boost here, despite no native + /// target_features being used directly, which is surprising given how Rust's docs describe it. 
+ #[inline] + #[target_feature(enable = "avx512vl,avx512bw,vpclmulqdq")] + unsafe fn process_blocks<W: EnhancedCrcWidth>( + &self, + state: &mut CrcState<<Self as ArchOps>::Vector>, + first: &[__m128i; 8], + rest: &[[__m128i; 8]], + keys: [u64; 23], + reflected: bool, + ) -> W::Value + where + W::Value: Copy + BitXor<Output = W::Value>, + { + let state_u64s = self.extract_u64s(state.value); + + let positioned_state = if reflected { + Simd512::new(0, 0, 0, 0, 0, 0, 0, state_u64s[0]) + } else { + Simd512::new(state_u64s[1], 0, 0, 0, 0, 0, 0, 0) + }; + + let reflector = create_reflector512(reflected); + + // Load first 256 bytes (2nd half is rest[0] since these are 128-byte blocks) + let first_ptr = first.as_ptr() as *const u8; + let first_rest_ptr = rest[0].as_ptr() as *const u8; + + let mut x = [ + reflect_bytes512(&reflector, Simd512::load_from_ptr(first_ptr)), + reflect_bytes512(&reflector, Simd512::load_from_ptr(first_ptr.add(64))), + reflect_bytes512(&reflector, Simd512::load_from_ptr(first_rest_ptr)), + reflect_bytes512(&reflector, Simd512::load_from_ptr(first_rest_ptr.add(64))), + ]; + + x[0] = positioned_state.xor(&x[0]); + + let coeff = self.create_avx512_256byte_coefficient(keys, reflected); + + let remaining_rest = &rest[1..]; + let pair_count = remaining_rest.len() / 2; + + for i in 0..pair_count { + let block1_ptr = remaining_rest[i * 2].as_ptr() as *const u8; + let block2_ptr = remaining_rest[i * 2 + 1].as_ptr() as *const u8; + + x[0] = x[0].fold_64( + &coeff, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(block1_ptr)), + ); + x[1] = x[1].fold_64( + &coeff, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(block1_ptr.add(64))), + ); + x[2] = x[2].fold_64( + &coeff, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(block2_ptr)), + ); + x[3] = x[3].fold_64( + &coeff, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(block2_ptr.add(64))), + ); + } + + let processed_pairs = pair_count * 2; + let remaining_single_count = remaining_rest.len() - processed_pairs; + + if remaining_single_count > 0 { + // We have 1 unprocessed block (128 bytes) + // Fold 4×512 down to 2×512 and process the remaining block with 2-register mode + let folded_2reg = self.fold_from_4x512_to_2x256(x, keys, reflected); + let coeff_2reg = self.create_avx512_128byte_coefficient(keys, reflected); + + let last_block_ptr = remaining_rest[processed_pairs].as_ptr() as *const u8; + + let final_x = [ + folded_2reg[0].fold_64( + &coeff_2reg, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(last_block_ptr)), + ), + folded_2reg[1].fold_64( + &coeff_2reg, + &reflect_bytes512(&reflector, Simd512::load_from_ptr(last_block_ptr.add(64))), + ), + ]; + + let folded = self.fold_from_2x512_to_1x128(final_x, keys, reflected); + + return W::perform_final_reduction(folded, reflected, keys, self); + } + + // All blocks processed in pairs - fold from 4 x 512-bit to 1 x 128-bit + let folded = self.fold_from_4x512_to_1x128(x, keys, reflected); + + W::perform_final_reduction(folded, reflected, keys, self) + } + + /// Create a folding coefficient for AVX-512 for 128-byte folding distances + #[inline(always)] + unsafe fn create_avx512_128byte_coefficient( + &self, + keys: [u64; 23], + reflected: bool, + ) -> Simd512 { + let (k1, k2) = if reflected { + (keys[3], keys[4]) + } else { + (keys[4], keys[3]) + }; + + // Replicate the coefficient pair + Simd512::new(k1, k2, k1, k2, k1, k2, k1, k2) + } + + /// Create a folding coefficient for AVX-512 for 256-byte folding distances + #[inline(always)] + unsafe fn create_avx512_256byte_coefficient( + &self, + keys: [u64; 23], + 
reflected: bool, + ) -> Simd512 { + let (k1, k2) = if reflected { + (keys[21], keys[22]) + } else { + (keys[22], keys[21]) + }; + + // Replicate the coefficient pair + Simd512::new(k1, k2, k1, k2, k1, k2, k1, k2) + } + + /// Fold from 4 x 512-bit to 1 x 128-bit + #[inline(always)] + unsafe fn fold_from_4x512_to_1x128( + &self, + x: [Simd512; 4], + keys: [u64; 23], + reflected: bool, + ) -> __m128i { + // Step 1: Fold 4 x 512-bit to 2 x 512-bit + let x2 = self.fold_from_4x512_to_2x256(x, keys, reflected); + + // Step 2: Fold 2 x 512-bit to 1 x 128-bit + self.fold_from_2x512_to_1x128(x2, keys, reflected) + } + + /// Fold from 4 x 512-bit to 2 x 512-bit + #[inline(always)] + unsafe fn fold_from_4x512_to_2x256( + &self, + x: [Simd512; 4], + keys: [u64; 23], + reflected: bool, + ) -> [Simd512; 2] { + // This folds registers that are 128 bytes apart (x[0] with x[2], x[1] with x[3]) + let coeff = self.create_avx512_128byte_coefficient(keys, reflected); + + // Fold pairs: + // x[0] (bytes 0-63) + x[2] (bytes 128-191) → result[0] + // x[1] (bytes 64-127) + x[3] (bytes 192-255) → result[1] + [x[0].fold_64(&coeff, &x[2]), x[1].fold_64(&coeff, &x[3])] + } + + /// Fold from 2 x 512-bit to 1 x 128-bit + #[inline(always)] + unsafe fn fold_from_2x512_to_1x128( + &self, + x: [Simd512; 2], + keys: [u64; 23], + reflected: bool, + ) -> __m128i { + // Create the fold coefficients for different distances + let fold_coefficients = [ + self.create_vector_from_u64_pair(keys[10], keys[9], reflected), // 112 bytes + self.create_vector_from_u64_pair(keys[12], keys[11], reflected), // 96 bytes + self.create_vector_from_u64_pair(keys[14], keys[13], reflected), // 80 bytes + self.create_vector_from_u64_pair(keys[16], keys[15], reflected), // 64 bytes + self.create_vector_from_u64_pair(keys[18], keys[17], reflected), // 48 bytes + self.create_vector_from_u64_pair(keys[20], keys[19], reflected), // 32 bytes + self.create_vector_from_u64_pair(keys[2], keys[1], reflected), // 16 bytes + ]; + + // Extract the 8 x 128-bit vectors from the 2 x 512-bit vectors (this is faster than + // using 256-bit intrinsics for 1KiB payloads) + let v128 = if reflected { + [ + x[0].to_128i_extract::<0>(), // 256-x0.low + x[0].to_128i_extract::<1>(), // 256-x0.high + x[0].to_128i_extract::<2>(), // 256-x1.low + x[0].to_128i_extract::<3>(), // 256-x1.high + x[1].to_128i_extract::<0>(), // 256-x2.low + x[1].to_128i_extract::<1>(), // 256-x2.high + x[1].to_128i_extract::<2>(), // 256-x3.low + x[1].to_128i_extract::<3>(), // 256-x3.high + ] + } else { + [ + x[0].to_128i_extract::<3>(), // 256-x1.high + x[0].to_128i_extract::<2>(), // 256-x1.low + x[0].to_128i_extract::<1>(), // 256-x0.high + x[0].to_128i_extract::<0>(), // 256-x0.low + x[1].to_128i_extract::<3>(), // 256-x3.high + x[1].to_128i_extract::<2>(), // 256-x3.low + x[1].to_128i_extract::<1>(), // 256-x2.high + x[1].to_128i_extract::<0>(), // 256-x2.low + ] + }; + + // Fold the 8 xmm registers to 1 xmm register + let mut res = v128[7]; + + for (i, &coeff) in fold_coefficients.iter().enumerate() { + let folded_h = self.carryless_mul_00(v128[i], coeff); + let folded_l = self.carryless_mul_11(v128[i], coeff); + res = self.xor3_vectors(folded_h, folded_l, res); + } + + res + } +} + +// 512-bit version of the Reflector +#[rustversion::since(1.89)] +#[derive(Clone, Copy)] +enum Reflector512 { + NoReflector, + ForwardReflector { smask: Simd512 }, +} + +// Function to create the appropriate reflector based on CRC parameters +#[rustversion::since(1.89)] +#[inline(always)] +unsafe fn 
create_reflector512(reflected: bool) -> Reflector512 { + if reflected { + Reflector512::NoReflector + } else { + // Load shuffle mask + let smask = Simd512::new( + 0x08090a0b0c0d0e0f, + 0x0001020304050607, + 0x08090a0b0c0d0e0f, + 0x0001020304050607, + 0x08090a0b0c0d0e0f, + 0x0001020304050607, + 0x08090a0b0c0d0e0f, + 0x0001020304050607, + ); + Reflector512::ForwardReflector { smask } + } +} + +// Function to apply reflection to a 512-bit vector +#[rustversion::since(1.89)] +#[inline(always)] +unsafe fn reflect_bytes512(reflector: &Reflector512, data: Simd512) -> Simd512 { + match reflector { + Reflector512::NoReflector => data, + Reflector512::ForwardReflector { smask } => shuffle_bytes512(data, *smask), + } +} + +// pre-compute the reverse indices for 512-bit shuffling +#[rustversion::since(1.89)] +static REVERSE_INDICES_512: __m512i = + unsafe { std::mem::transmute([7u64, 6u64, 5u64, 4u64, 3u64, 2u64, 1u64, 0u64]) }; + +// Implement a 512-bit byte shuffle function +#[rustversion::since(1.89)] +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +unsafe fn shuffle_bytes512(data: Simd512, mask: Simd512) -> Simd512 { + Simd512(_mm512_permutexvar_epi64( + // Reverse the order using 512-bit permutation + REVERSE_INDICES_512, // reverse indices + _mm512_shuffle_epi8(data.0, mask.0), // shuffled data + )) +} + +// Delegate all ArchOps methods to the inner X86SsePclmulqdqOps instance +#[rustversion::since(1.89)] +impl ArchOps for X86_64Avx512VpclmulqdqOps { + type Vector = __m128i; + + #[inline(always)] + unsafe fn process_enhanced_simd_blocks<W: EnhancedCrcWidth>( + &self, + state: &mut CrcState<Self::Vector>, + first: &[Self::Vector; 8], + rest: &[[Self::Vector; 8]], + _reflector: &Reflector, + keys: [u64; 23], + ) -> bool + where + Self::Vector: Copy, + { + // Update the state with the result + *state = W::create_state( + self.process_blocks::<W>(state, first, rest, keys, state.reflected), + state.reflected, + self, + ); + + // Return true to indicate we handled it + true + } + + // Delegate all other methods to X86SsePclmulqdqOps + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn create_vector_from_u64_pair( + &self, + high: u64, + low: u64, + reflected: bool, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair(high, low, reflected) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn create_vector_from_u64_pair_non_reflected( + &self, + high: u64, + low: u64, + ) -> Self::Vector { + self.0.create_vector_from_u64_pair_non_reflected(high, low) + } + + #[inline] + #[target_feature(enable = "sse4.1")] + unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { + self.0.create_vector_from_u64(value, high) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_u64s(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { + self.0.extract_poly64s(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn xor_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.xor_vectors(a, b) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { + self.0.load_bytes(ptr) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { + self.0.load_aligned(ptr) + } + + #[inline] + #[target_feature(enable = "ssse3")] + unsafe fn shuffle_bytes(&self, data: Self::Vector, 
mask: Self::Vector) -> Self::Vector { + self.0.shuffle_bytes(data, mask) + } + + #[inline] + #[target_feature(enable = "sse4.1")] + unsafe fn blend_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + mask: Self::Vector, + ) -> Self::Vector { + self.0.blend_vectors(a, b, mask) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_8(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn set_all_bytes(&self, value: u8) -> Self::Vector { + self.0.set_all_bytes(value) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector { + self.0.create_compare_mask(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn and_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.and_vectors(a, b) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_32(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_32(vector) + } + + #[inline] + #[target_feature(enable = "sse4.1")] + unsafe fn create_vector_from_u32(&self, value: u32, high: bool) -> Self::Vector { + self.0.create_vector_from_u32(value, high) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_4(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_4(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_8(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_5(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_5(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_6(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_6(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_7(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_right_12(vector) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { + self.0.shift_left_12(vector) + } + + #[inline] + #[target_feature(enable = "pclmulqdq")] + unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_00(a, b) + } + + #[inline] + #[target_feature(enable = "pclmulqdq")] + unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_01(a, b) + } + + #[inline] + #[target_feature(enable = "pclmulqdq")] + unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_10(a, b) + } + + #[inline] + #[target_feature(enable = "pclmulqdq")] + unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { + self.0.carryless_mul_11(a, b) + } + + #[inline] + #[target_feature(enable = "avx512vl")] + unsafe fn xor3_vectors( + &self, + a: Self::Vector, + b: Self::Vector, + 
c: Self::Vector, + ) -> Self::Vector { + // VPCLMULQDQ implementation always uses AVX512 ternary logic + // Feature detection is handled at the dispatch level + _mm_ternarylogic_epi64( + a, b, c, 0x96, // XOR3 operation + ) + } +} diff --git a/src/arch/x86_64/mod.rs b/src/arch/x86_64/mod.rs new file mode 100644 index 0000000..4d1422d --- /dev/null +++ b/src/arch/x86_64/mod.rs @@ -0,0 +1,8 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides x86_64-specific implementations of the ArchOps trait. + +#![cfg(target_arch = "x86_64")] + +pub mod avx512; +pub mod avx512_vpclmulqdq; diff --git a/src/bin/arch-check.rs b/src/bin/arch-check.rs index b1dc3ef..023b2a0 100644 --- a/src/bin/arch-check.rs +++ b/src/bin/arch-check.rs @@ -1,3 +1,5 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + //! This is a simple program that checks if the target architecture supports certain features. #[cfg(target_arch = "aarch64")] diff --git a/src/bin/checksum.rs b/src/bin/checksum.rs index dd02b08..6c017bb 100644 --- a/src/bin/checksum.rs +++ b/src/bin/checksum.rs @@ -1,6 +1,9 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + //! This is a simple program to calculate a checksum from the command line use crc_fast::{checksum, checksum_file, CrcAlgorithm}; +use rand::RngCore; use std::env; use std::process::ExitCode; use std::str::FromStr; @@ -11,6 +14,36 @@ struct Config { file: Option<String>, string: Option<String>, format: OutputFormat, + benchmark: Option<BenchmarkConfig>, } +#[derive(Debug)] +struct BenchmarkConfig { + size: Option<usize>, + duration: f64, +} + +#[derive(Debug)] +enum BenchmarkData { + InMemory(Vec<u8>), + File(String), +} + +#[derive(Debug)] +struct BenchmarkRunner { + algorithm: CrcAlgorithm, + data: BenchmarkData, + duration: f64, +} + +#[derive(Debug)] +struct BenchmarkResult { + iterations: u64, + elapsed_seconds: f64, + throughput_gibs: f64, + time_per_iteration_nanos: f64, + acceleration_target: String, + data_size: u64, } #[derive(Debug, Clone)] @@ -19,20 +52,144 @@ enum OutputFormat { Decimal, } +impl BenchmarkConfig { + fn validate(&self) -> Result<(), String> { + if self.duration <= 0.0 { + return Err("Duration must be greater than 0".to_string()); + } + + if let Some(size) = self.size { + if size == 0 { + return Err("Size must be greater than 0".to_string()); + } + } + + Ok(()) + } +} + +impl BenchmarkRunner { + fn new(algorithm: CrcAlgorithm, data: BenchmarkData, duration: f64) -> Self { + Self { + algorithm, + data, + duration, + } + } + + fn run(&self) -> Result<BenchmarkResult, String> { + use std::time::Instant; + + let start = Instant::now(); + let mut iterations = 0u64; + + while start.elapsed().as_secs_f64() < self.duration { + match &self.data { + BenchmarkData::InMemory(data) => { + std::hint::black_box(checksum(self.algorithm, data)); + } + BenchmarkData::File(filename) => { + match checksum_file(self.algorithm, filename, None) { + Ok(result) => { + std::hint::black_box(result); + } + Err(e) => { + return Err(format!("Failed to read file during benchmark: {}", e)); + } + } + } + } + iterations += 1; + } + + let elapsed = start.elapsed().as_secs_f64(); + let data_size = match &self.data { + BenchmarkData::InMemory(data) => data.len() as u64, + BenchmarkData::File(filename) => std::fs::metadata(filename) + .map(|m| m.len()) + .map_err(|e| format!("Failed to get file size: {}", e))?, + }; + + let acceleration_target = crc_fast::get_calculator_target(self.algorithm); + + Ok(BenchmarkResult::new( + iterations, + elapsed, + acceleration_target, + 
data_size, + )) + } +} + +impl BenchmarkResult { + fn new( + iterations: u64, + elapsed_seconds: f64, + acceleration_target: String, + data_size: u64, + ) -> Self { + let throughput_gibs = if elapsed_seconds > 0.0 { + (data_size as f64 * iterations as f64) / elapsed_seconds / (1024.0 * 1024.0 * 1024.0) + } else { + 0.0 + }; + + let time_per_iteration_nanos = if iterations > 0 { + elapsed_seconds * 1_000_000_000.0 / iterations as f64 + } else { + 0.0 + }; + + Self { + iterations, + elapsed_seconds, + throughput_gibs, + time_per_iteration_nanos, + acceleration_target, + data_size, + } + } +} + +fn generate_random_data(size: usize) -> Result<Vec<u8>, String> { + // Check for reasonable size limits to prevent memory issues + if size > 1_073_741_824 { + // 1 GiB limit + return Err("Data size too large (maximum 1 GiB)".to_string()); + } + + // Use vec! macro to avoid clippy warning about slow initialization + let mut buf = vec![0u8; size]; + let mut rng = rand::rng(); + rng.fill_bytes(&mut buf); + Ok(buf) +} + fn print_usage() { println!("Usage: checksum -a algorithm [-f file] [-s string] [--format hex|decimal]"); + println!( + " checksum -a algorithm -b [--size bytes] [--duration seconds] [-f file] [-s string]" + ); println!(); println!("Example: checksum -a CRC-32/ISCSI -f myfile.txt"); println!("Example: checksum -a CRC-64/NVME -s 'Hello, world!' --format decimal"); + println!("Example: checksum -a CRC-32/ISCSI -b --size 1048576 --duration 5.0"); println!(); println!("Options:"); println!(" -a algorithm Specify the checksum algorithm (required)"); println!(" -f file Calculate checksum for the specified file"); + println!(" -h, --help Show this help message"); println!(" -s string Calculate checksum for the specified string"); println!(" --format hex|decimal Output format (default: hex)"); - println!(" -h, --help Show this help message"); println!(); - println!("Note: Either -f or -s must be provided, but not both."); + println!("Benchmarking:"); + println!(" -b Enable benchmark mode"); + println!(" --duration seconds Benchmark duration in seconds (default: 10.0)"); + println!(" --size bytes Data size for random generation in benchmark mode (default: 1048576 [1MiB])"); + println!(); + println!(); + println!("Note: In normal mode, either -f or -s must be provided, but not both."); + println!(" In benchmark mode (-b), -f or -s are optional for using specific data."); } fn parse_args() -> Result<Config, String> { @@ -51,6 +208,9 @@ fn parse_args() -> Result<Config, String> { let mut file: Option<String> = None; let mut string: Option<String> = None; let mut format = OutputFormat::Hex; // Default to hex + let mut benchmark_mode = false; + let mut benchmark_size: Option<usize> = None; + let mut benchmark_duration = 10.0; // Default duration let mut i = 1; // Skip program name while i < args.len() { @@ -98,6 +258,30 @@ fn parse_args() -> Result<Config, String> { } i += 2; } + "-b" => { + benchmark_mode = true; + i += 1; + } + "--size" => { + if i + 1 >= args.len() { + return Err("Missing size value after --size flag".to_string()); + } + benchmark_size = Some( + args[i + 1] + .parse::<usize>() + .map_err(|_| format!("Invalid size value: {}", args[i + 1]))?, + ); + i += 2; + } + "--duration" => { + if i + 1 >= args.len() { + return Err("Missing duration value after --duration flag".to_string()); + } + benchmark_duration = args[i + 1] + .parse::<f64>() + .map_err(|_| format!("Invalid duration value: {}", args[i + 1]))?; + i += 2; + } arg => { return Err(format!("Unknown argument: {}", arg)); } @@ -107,15 +291,40 @@ fn parse_args() -> Result<Config, String> { // Validate required arguments let algorithm = 
algorithm.ok_or("Algorithm (-a) is required")?; - if file.is_none() && string.is_none() { - return Err("Either -f (file) or -s (string) must be provided".to_string()); + // Validate mutual exclusivity between benchmark and normal modes + if !benchmark_mode && (benchmark_size.is_some() || benchmark_duration != 10.0) { + return Err("--size and --duration can only be used with -b flag".to_string()); + } + + // Create benchmark config if in benchmark mode + let benchmark = if benchmark_mode { + let config = BenchmarkConfig { + size: benchmark_size, + duration: benchmark_duration, + }; + config.validate()?; + Some(config) + } else { + None + }; + + // Validate input requirements based on mode + if benchmark.is_none() { + // Normal mode: require either file or string input + if file.is_none() && string.is_none() { + return Err( + "Either -f (file) or -s (string) must be provided in normal mode".to_string(), + ); + } } + // Benchmark mode: file and string are optional (will use generated data if neither provided) Ok(Config { algorithm, file, string, format, + benchmark, }) } @@ -123,6 +332,11 @@ fn calculate_checksum(config: &Config) -> Result<(), String> { let algorithm = CrcAlgorithm::from_str(&config.algorithm) .map_err(|_| format!("Invalid algorithm: {}", config.algorithm))?; + // Check if benchmark mode is enabled + if let Some(benchmark_config) = &config.benchmark { + return run_benchmark(config, benchmark_config, algorithm); + } + let checksum = if let Some(ref filename) = config.file { checksum_file(algorithm, filename, None).unwrap() } else if let Some(ref text) = config.string { @@ -139,6 +353,91 @@ fn calculate_checksum(config: &Config) -> Result<(), String> { Ok(()) } +fn run_benchmark( + config: &Config, + benchmark_config: &BenchmarkConfig, + algorithm: CrcAlgorithm, +) -> Result<(), String> { + // Determine data source and create BenchmarkData + let data = if let Some(ref filename) = config.file { + // Validate file exists before benchmarking + if !std::path::Path::new(filename).exists() { + return Err(format!("File not found: {}", filename)); + } + BenchmarkData::File(filename.clone()) + } else if let Some(ref text) = config.string { + BenchmarkData::InMemory(text.as_bytes().to_vec()) + } else { + // Generate random data with specified size or default (1 MiB) + let size = benchmark_config.size.unwrap_or(1_048_576); + let random_data = generate_random_data(size)?; + BenchmarkData::InMemory(random_data) + }; + + // Create and run benchmark + let runner = BenchmarkRunner::new(algorithm, data, benchmark_config.duration); + let result = runner.run()?; + + // Display results with algorithm name + display_benchmark_results(&result, &config.algorithm); + + Ok(()) +} + +// Format numbers with comma separators for better readability +fn format_number_with_commas(n: u64) -> String { + let s = n.to_string(); + let mut result = String::new(); + let chars: Vec = s.chars().collect(); + + for (i, ch) in chars.iter().enumerate() { + if i > 0 && (chars.len() - i) % 3 == 0 { + result.push(','); + } + result.push(*ch); + } + + result +} + +fn display_benchmark_results(result: &BenchmarkResult, algorithm_name: &str) { + println!("Algorithm: {}", algorithm_name); + println!("Acceleration Target: {}", result.acceleration_target); + + // Format data size with appropriate units + let (size_value, size_unit) = if result.data_size >= 1_048_576 { + (result.data_size as f64 / 1_048_576.0, "MiB") + } else if result.data_size >= 1024 { + (result.data_size as f64 / 1024.0, "KiB") + } else { + (result.data_size 
as f64, "bytes") + }; + + println!( + "Data Size: {} bytes ({:.1} {})", + format_number_with_commas(result.data_size), + size_value, + size_unit + ); + println!("Duration: {:.2} seconds", result.elapsed_seconds); + println!( + "Iterations: {}", + format_number_with_commas(result.iterations) + ); + println!("Throughput: {:.2} GiB/s", result.throughput_gibs); + + // Format time per iteration with appropriate units + let (time_value, time_unit) = if result.time_per_iteration_nanos >= 1_000_000.0 { + (result.time_per_iteration_nanos / 1_000_000.0, "ms") + } else if result.time_per_iteration_nanos >= 1_000.0 { + (result.time_per_iteration_nanos / 1_000.0, "μs") + } else { + (result.time_per_iteration_nanos, "ns") + }; + + println!("Time per iteration: {:.1} {}", time_value, time_unit); +} + fn main() -> ExitCode { match parse_args() { Ok(config) => { @@ -162,3 +461,261 @@ fn main() -> ExitCode { ExitCode::SUCCESS } + +#[cfg(test)] +mod tests { + use super::*; + use std::str::FromStr; + + #[test] + fn test_benchmark_config_validation_valid() { + let config = BenchmarkConfig { + size: Some(1024), + duration: 5.0, + }; + assert!(config.validate().is_ok()); + } + + #[test] + fn test_benchmark_config_validation_zero_duration() { + let config = BenchmarkConfig { + size: Some(1024), + duration: 0.0, + }; + assert!(config.validate().is_err()); + assert_eq!( + config.validate().unwrap_err(), + "Duration must be greater than 0" + ); + } + + #[test] + fn test_benchmark_config_validation_negative_duration() { + let config = BenchmarkConfig { + size: Some(1024), + duration: -1.0, + }; + assert!(config.validate().is_err()); + assert_eq!( + config.validate().unwrap_err(), + "Duration must be greater than 0" + ); + } + + #[test] + fn test_benchmark_config_validation_zero_size() { + let config = BenchmarkConfig { + size: Some(0), + duration: 5.0, + }; + assert!(config.validate().is_err()); + assert_eq!( + config.validate().unwrap_err(), + "Size must be greater than 0" + ); + } + + #[test] + fn test_benchmark_config_validation_none_size() { + let config = BenchmarkConfig { + size: None, + duration: 5.0, + }; + assert!(config.validate().is_ok()); + } + + #[test] + fn test_generate_random_data_valid_size() { + let data = generate_random_data(1024).unwrap(); + assert_eq!(data.len(), 1024); + } + + #[test] + fn test_generate_random_data_large_size() { + let result = generate_random_data(1_073_741_825); // 1 GiB + 1 + assert!(result.is_err()); + assert_eq!(result.unwrap_err(), "Data size too large (maximum 1 GiB)"); + } + + #[test] + fn test_benchmark_result_calculation() { + let result = BenchmarkResult::new( + 1000, // iterations + 2.0, // elapsed_seconds + "test".to_string(), // acceleration_target + 1024, // data_size + ); + + assert_eq!(result.iterations, 1000); + assert_eq!(result.elapsed_seconds, 2.0); + assert_eq!(result.data_size, 1024); + + // Throughput should be (1024 * 1000) / 2.0 / (1024^3) GiB/s + let expected_throughput = (1024.0 * 1000.0) / 2.0 / (1024.0 * 1024.0 * 1024.0); + assert!((result.throughput_gibs - expected_throughput).abs() < 1e-10); + + // Time per iteration should be 2.0 * 1e9 / 1000 nanoseconds + let expected_time_per_iter = 2.0 * 1_000_000_000.0 / 1000.0; + assert!((result.time_per_iteration_nanos - expected_time_per_iter).abs() < 1e-6); + } + + #[test] + fn test_benchmark_runner_creation() { + let algorithm = CrcAlgorithm::from_str("CRC-32/ISCSI").unwrap(); + let data = BenchmarkData::InMemory(vec![1, 2, 3, 4]); + let runner = BenchmarkRunner::new(algorithm, data, 1.0); + + 
assert_eq!(runner.duration, 1.0); + assert_eq!(runner.algorithm, algorithm); + } + + #[test] + fn test_benchmark_runner_execution_in_memory() { + let algorithm = CrcAlgorithm::from_str("CRC-32/ISCSI").unwrap(); + let data = BenchmarkData::InMemory(vec![1, 2, 3, 4]); + let runner = BenchmarkRunner::new(algorithm, data, 0.1); // Short duration for test + + let result = runner.run().unwrap(); + assert!(result.iterations > 0); + assert!(result.elapsed_seconds > 0.0); + assert_eq!(result.data_size, 4); + assert!(result.throughput_gibs >= 0.0); + } + + #[test] + fn test_parse_args_benchmark_mode() { + // We can't easily mock std::env::args in unit tests, so we'll test the parsing logic + // by creating a Config directly and validating its structure + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: None, + string: None, + format: OutputFormat::Hex, + benchmark: Some(BenchmarkConfig { + size: Some(1024), + duration: 5.0, + }), + }; + + assert!(config.benchmark.is_some()); + let benchmark_config = config.benchmark.unwrap(); + assert_eq!(benchmark_config.size, Some(1024)); + assert_eq!(benchmark_config.duration, 5.0); + } + + #[test] + fn test_parse_args_normal_mode() { + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: Some("test.txt".to_string()), + string: None, + format: OutputFormat::Hex, + benchmark: None, + }; + + assert!(config.benchmark.is_none()); + assert_eq!(config.file, Some("test.txt".to_string())); + } + + #[test] + fn test_data_source_selection_file() { + // Test that file input creates File variant + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: Some("test.txt".to_string()), + string: None, + format: OutputFormat::Hex, + benchmark: Some(BenchmarkConfig { + size: None, + duration: 1.0, + }), + }; + + // This would be tested in the run_benchmark function + // We can verify the config structure is correct for file input + assert!(config.file.is_some()); + assert!(config.string.is_none()); + } + + #[test] + fn test_data_source_selection_string() { + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: None, + string: Some("test data".to_string()), + format: OutputFormat::Hex, + benchmark: Some(BenchmarkConfig { + size: None, + duration: 1.0, + }), + }; + + assert!(config.file.is_none()); + assert!(config.string.is_some()); + } + + #[test] + fn test_data_source_selection_generated() { + let config = Config { + algorithm: "CRC-32/ISCSI".to_string(), + file: None, + string: None, + format: OutputFormat::Hex, + benchmark: Some(BenchmarkConfig { + size: Some(1024), + duration: 1.0, + }), + }; + + // When neither file nor string is provided, generated data should be used + assert!(config.file.is_none()); + assert!(config.string.is_none()); + assert_eq!(config.benchmark.as_ref().unwrap().size, Some(1024)); + } + + #[test] + fn test_throughput_calculation_accuracy() { + // Test with known values to verify calculation accuracy + let data_size = 1_048_576u64; // 1 MiB + let iterations = 1000u64; + let elapsed_seconds = 1.0; + + let result = + BenchmarkResult::new(iterations, elapsed_seconds, "test".to_string(), data_size); + + // Expected throughput: (1 MiB * 1000) / 1 second = 1000 MiB/s = ~0.9537 GiB/s + let expected_gibs = + (data_size as f64 * iterations as f64) / elapsed_seconds / (1024.0 * 1024.0 * 1024.0); + assert!((result.throughput_gibs - expected_gibs).abs() < 1e-10); + } + + #[test] + fn test_output_format_variants() { + let hex_format = OutputFormat::Hex; + let decimal_format = 
OutputFormat::Decimal;
+
+        // Test that both variants can be created and are different
+        assert!(
+            matches!(hex_format, OutputFormat::Hex),
+            "hex_format should be OutputFormat::Hex"
+        );
+
+        assert!(
+            matches!(decimal_format, OutputFormat::Decimal),
+            "decimal_format should be OutputFormat::Decimal"
+        );
+    }
+
+    #[test]
+    fn test_format_number_with_commas() {
+        assert_eq!(format_number_with_commas(0), "0");
+        assert_eq!(format_number_with_commas(123), "123");
+        assert_eq!(format_number_with_commas(1234), "1,234");
+        assert_eq!(format_number_with_commas(12345), "12,345");
+        assert_eq!(format_number_with_commas(123456), "123,456");
+        assert_eq!(format_number_with_commas(1234567), "1,234,567");
+        assert_eq!(format_number_with_commas(12345678), "12,345,678");
+        assert_eq!(format_number_with_commas(123456789), "123,456,789");
+        assert_eq!(format_number_with_commas(1000000000), "1,000,000,000");
+    }
+}
diff --git a/src/bin/get-custom-params.rs b/src/bin/get-custom-params.rs
index ac5df47..d730788 100644
--- a/src/bin/get-custom-params.rs
+++ b/src/bin/get-custom-params.rs
@@ -1,3 +1,5 @@
+// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0.
+
 //! This is a simple program to get custom CRC parameters from the command line.
 
 use std::env;
diff --git a/src/crc32/fusion/aarch64.rs b/src/crc32/fusion/aarch64.rs
deleted file mode 100644
index c9f0207..0000000
--- a/src/crc32/fusion/aarch64.rs
+++ /dev/null
@@ -1,1073 +0,0 @@
-//! Provides CRC-32/ISCSI and CRC-32/ISO-HDLC calculations using a fusion of native CLMUL
-//! instructions and native CRC calculation instructions on aarch64.
-//!
-//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/
-//!
-//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/
-//! with the help of Claude.ai using:
-//!
-//! ./generate -i neon -p crc32c -a v12e_v1
-//! ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3
-//! ./generate -i neon -p crc32 -a v12e_v1
-//!
-//! Modified as necessary for this Rust implementation.
-//!
-//! MIT licensed.
- -#![cfg(target_arch = "aarch64")] - -use std::arch::aarch64::*; - -/// Safe wrapper for CRC32 iSCSI calculation -#[inline] -#[cfg(target_feature = "sha3")] -pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { - unsafe { - const LARGE_BUFFER_THRESHOLD: usize = 1024; - - // Select implementation based on buffer size - if data.len() <= LARGE_BUFFER_THRESHOLD { - crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) - } else { - crc32_iscsi_eor3_v9s3x2e_s3(crc, data.as_ptr(), data.len()) - } - } -} - -#[inline] -#[cfg(not(target_feature = "sha3"))] -pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { - unsafe { crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) } -} - -/// Safe wrapper for CRC32 ISO-HDLC calculation -#[inline] -#[cfg(target_feature = "sha3")] -pub fn crc32_iso_hdlc(crc: u32, data: &[u8]) -> u32 { - unsafe { - const LARGE_BUFFER_THRESHOLD: usize = 1024; - - // Select implementation based on buffer size - if data.len() <= LARGE_BUFFER_THRESHOLD { - crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) - } else { - crc32_iso_hdlc_eor3_v9s3x2e_s3(crc, data.as_ptr(), data.len()) - } - } -} - -#[inline] -#[cfg(not(target_feature = "sha3"))] -pub fn crc32_iso_hdlc(crc: u32, data: &[u8]) -> u32 { - unsafe { crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) } -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn clmul_lo_eor3(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { - // Polynomial multiply low parts - convert u128 result to uint64x2_t - let result = vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)); - vreinterpretq_u64_p128(result) -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn clmul_hi_eor3(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { - // Polynomial multiply high parts - convert u128 result to uint64x2_t - let result = vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)); - vreinterpretq_u64_p128(result) -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn clmul_scalar(a: u32, b: u32) -> uint64x2_t { - // Polynomial multiply scalars - convert u128 result to uint64x2_t - let result = vmull_p64(a as u64, b as u64); - vreinterpretq_u64_p128(result) -} - -// x^n mod P, in log(n) time -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn xnmodp_crc32_iscsi(mut n: u64) -> u32 { - let mut stack = !1u64; - let mut acc: u32; - let mut low: u32; - - while n > 191 { - stack = (stack << 1) + (n & 1); - n = (n >> 1) - 16; - } - stack = !stack; - acc = 0x80000000u32 >> (n & 31); - n >>= 5; - - while n > 0 { - // ARM CRC32 instruction - acc = unsafe { __crc32cw(acc, 0) }; - n -= 1; - } - - while { - low = (stack & 1) as u32; - stack >>= 1; - stack != 0 - } { - unsafe { - // Convert to polynomial type and square it - let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); - let squared = vmull_p8(x, x); - let y = vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); - acc = __crc32cd(0, y << low); - } - } - acc -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn crc_shift_iscsi(crc: u32, nbytes: usize) -> uint64x2_t { - clmul_scalar(crc, xnmodp_crc32_iscsi((nbytes * 8 - 33) as u64)) -} - -// x^n mod P, in log(n) time -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn xnmodp_iso_hdlc(mut n: u64) -> u32 { - let mut stack = !1u64; - let mut acc: u32; - let mut low: u32; - - while n > 191 { - stack = (stack << 1) + (n & 1); - n = (n >> 1) - 16; - } 
- stack = !stack; - acc = 0x80000000u32 >> (n & 31); - n >>= 5; - - while n > 0 { - // ARM CRC32 instruction (ISO-HDLC uses standard CRC32, not CRC32C) - acc = unsafe { __crc32w(acc, 0) }; - n -= 1; - } - - while { - low = (stack & 1) as u32; - stack >>= 1; - stack != 0 - } { - unsafe { - // Convert to polynomial type and square it - let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); - let squared = vmull_p8(x, x); - let y = vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); - acc = __crc32d(0, y << low); - } - } - acc -} - -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes")] -unsafe fn crc_shift_iso_hdlc(crc: u32, nbytes: usize) -> uint64x2_t { - clmul_scalar(crc, xnmodp_iso_hdlc((nbytes * 8 - 33) as u64)) -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i neon_eor3 -p crc32c -a v9s3x2e_s3 -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes,sha3")] -unsafe fn crc32_iscsi_eor3_v9s3x2e_s3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = __crc32cb(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 192 { - let end = buf.add(len); - let blk = len / 192; - let klen = blk * 16; - let buf2 = buf.add(klen * 3); - let limit = buf.add(klen).sub(32); - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // First vector chunk - let mut x0 = vld1q_u64(buf2 as *const u64); - let mut x1 = vld1q_u64(buf2.add(16) as *const u64); - let mut x2 = vld1q_u64(buf2.add(32) as *const u64); - let mut x3 = vld1q_u64(buf2.add(48) as *const u64); - let mut x4 = vld1q_u64(buf2.add(64) as *const u64); - let mut x5 = vld1q_u64(buf2.add(80) as *const u64); - let mut x6 = vld1q_u64(buf2.add(96) as *const u64); - let mut x7 = vld1q_u64(buf2.add(112) as *const u64); - let mut x8 = vld1q_u64(buf2.add(128) as *const u64); - - let k_vals: [u64; 2] = [0x7e908048, 0xc96cfdc0]; - let mut k = vld1q_u64(k_vals.as_ptr()); - let mut buf2 = buf2.add(144); - - // Main loop - while buf <= limit { - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y1 = clmul_lo_eor3(x1, k); - x1 = clmul_hi_eor3(x1, k); - let y2 = clmul_lo_eor3(x2, k); - x2 = clmul_hi_eor3(x2, k); - let y3 = clmul_lo_eor3(x3, k); - x3 = clmul_hi_eor3(x3, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - let y5 = clmul_lo_eor3(x5, k); - x5 = clmul_hi_eor3(x5, k); - let y6 = clmul_lo_eor3(x6, k); - x6 = clmul_hi_eor3(x6, k); - let y7 = clmul_lo_eor3(x7, k); - x7 = clmul_hi_eor3(x7, k); - let y8 = clmul_lo_eor3(x8, k); - x8 = clmul_hi_eor3(x8, k); - - x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); - x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); - x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); - x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); - x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); - x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); - x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); - x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); - x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); - - crc0 = __crc32cd(crc0, *(buf as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); - crc2 = 
__crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - - buf = buf.add(16); - buf2 = buf2.add(144); - } - - // Reduce x0 ... x8 to just x0 - let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x1); - x1 = x2; - x2 = x3; - x3 = x4; - x4 = x5; - x5 = x6; - x6 = x7; - x7 = x8; - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y2 = clmul_lo_eor3(x2, k); - x2 = clmul_hi_eor3(x2, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - let y6 = clmul_lo_eor3(x6, k); - x6 = clmul_hi_eor3(x6, k); - - x0 = veor3q_u64(x0, y0, x1); - x2 = veor3q_u64(x2, y2, x3); - x4 = veor3q_u64(x4, y4, x5); - x6 = veor3q_u64(x6, y6, x7); - - let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - - x0 = veor3q_u64(x0, y0, x2); - x4 = veor3q_u64(x4, y4, x6); - - let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x4); - - // Final scalar chunk - crc0 = __crc32cd(crc0, *(buf as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - - let vc0 = crc_shift_iscsi(crc0, klen * 2 + blk * 144); - let vc1 = crc_shift_iscsi(crc1, klen + blk * 144); - let vc2 = crc_shift_iscsi(crc2, blk * 144); - let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1)); - - buf = buf2; - len = end.offset_from(buf) as usize; - } - - if len >= 32 { - let klen = ((len - 8) / 24) * 8; - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // Main loop - loop { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); - buf = buf.add(8); - len -= 24; - if len < 32 { - break; - } - } - - let vc0 = crc_shift_iscsi(crc0, klen * 2 + 8); - let vc1 = crc_shift_iscsi(crc1, klen + 8); - let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); - - // Final 8 bytes - buf = buf.add(klen * 2); - crc0 = crc2; - crc0 = __crc32cd(crc0, *(buf as *const u64) ^ vc); - buf = buf.add(8); - len -= 8; - } - - while len >= 8 { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - while len > 0 { - crc0 = __crc32cb(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -#[inline] -#[target_feature(enable = "aes")] -unsafe fn clmul_lo_e(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { - // Polynomial multiply low parts and XOR with c - let mul_result = vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)); - let mul_vec = vreinterpretq_u64_p128(mul_result); - veorq_u64(mul_vec, c) -} - -#[inline] -#[target_feature(enable = "aes")] -unsafe fn clmul_hi_e(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { - // 
Polynomial multiply high parts and XOR with c - let mul_result = vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)); - let mul_vec = vreinterpretq_u64_p128(mul_result); - veorq_u64(mul_vec, c) -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i neon -p crc32c -a v12e_v1 -#[inline] -#[target_feature(enable = "aes")] -unsafe fn crc32_iscsi_v12e_v1(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = __crc32cb(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 192 { - let end = buf.add(len); - let limit = buf.add(len - 192); - - // First vector chunk - let mut x0 = vld1q_u64(buf as *const u64); - let mut x1 = vld1q_u64(buf.add(16) as *const u64); - let mut x2 = vld1q_u64(buf.add(32) as *const u64); - let mut x3 = vld1q_u64(buf.add(48) as *const u64); - let mut x4 = vld1q_u64(buf.add(64) as *const u64); - let mut x5 = vld1q_u64(buf.add(80) as *const u64); - let mut x6 = vld1q_u64(buf.add(96) as *const u64); - let mut x7 = vld1q_u64(buf.add(112) as *const u64); - let mut x8 = vld1q_u64(buf.add(128) as *const u64); - let mut x9 = vld1q_u64(buf.add(144) as *const u64); - let mut x10 = vld1q_u64(buf.add(160) as *const u64); - let mut x11 = vld1q_u64(buf.add(176) as *const u64); - - let k_vals: [u64; 2] = [0xa87ab8a8, 0xab7aff2a]; - let mut k = vld1q_u64(k_vals.as_ptr()); - - // Create CRC vector and XOR with first vector - let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); - x0 = veorq_u64(crc_vec, x0); - buf = buf.add(192); - - // Main loop - while buf <= limit { - let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); - x0 = clmul_hi_e(x0, k, y0); - let y1 = clmul_lo_e(x1, k, vld1q_u64(buf.add(16) as *const u64)); - x1 = clmul_hi_e(x1, k, y1); - let y2 = clmul_lo_e(x2, k, vld1q_u64(buf.add(32) as *const u64)); - x2 = clmul_hi_e(x2, k, y2); - let y3 = clmul_lo_e(x3, k, vld1q_u64(buf.add(48) as *const u64)); - x3 = clmul_hi_e(x3, k, y3); - let y4 = clmul_lo_e(x4, k, vld1q_u64(buf.add(64) as *const u64)); - x4 = clmul_hi_e(x4, k, y4); - let y5 = clmul_lo_e(x5, k, vld1q_u64(buf.add(80) as *const u64)); - x5 = clmul_hi_e(x5, k, y5); - let y6 = clmul_lo_e(x6, k, vld1q_u64(buf.add(96) as *const u64)); - x6 = clmul_hi_e(x6, k, y6); - let y7 = clmul_lo_e(x7, k, vld1q_u64(buf.add(112) as *const u64)); - x7 = clmul_hi_e(x7, k, y7); - let y8 = clmul_lo_e(x8, k, vld1q_u64(buf.add(128) as *const u64)); - x8 = clmul_hi_e(x8, k, y8); - let y9 = clmul_lo_e(x9, k, vld1q_u64(buf.add(144) as *const u64)); - x9 = clmul_hi_e(x9, k, y9); - let y10 = clmul_lo_e(x10, k, vld1q_u64(buf.add(160) as *const u64)); - x10 = clmul_hi_e(x10, k, y10); - let y11 = clmul_lo_e(x11, k, vld1q_u64(buf.add(176) as *const u64)); - x11 = clmul_hi_e(x11, k, y11); - buf = buf.add(192); - } - - // Reduce x0 ... 
x11 to just x0 - let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x1); - x0 = clmul_hi_e(x0, k, y0); - let y2 = clmul_lo_e(x2, k, x3); - x2 = clmul_hi_e(x2, k, y2); - let y4 = clmul_lo_e(x4, k, x5); - x4 = clmul_hi_e(x4, k, y4); - let y6 = clmul_lo_e(x6, k, x7); - x6 = clmul_hi_e(x6, k, y6); - let y8 = clmul_lo_e(x8, k, x9); - x8 = clmul_hi_e(x8, k, y8); - let y10 = clmul_lo_e(x10, k, x11); - x10 = clmul_hi_e(x10, k, y10); - - let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x2); - x0 = clmul_hi_e(x0, k, y0); - let y4 = clmul_lo_e(x4, k, x6); - x4 = clmul_hi_e(x4, k, y4); - let y8 = clmul_lo_e(x8, k, x10); - x8 = clmul_hi_e(x8, k, y8); - - let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x4); - x0 = clmul_hi_e(x0, k, y0); - x4 = x8; - let y0 = clmul_lo_e(x0, k, x4); - x0 = clmul_hi_e(x0, k, y0); - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - len = end.offset_from(buf) as usize; - } - - if len >= 16 { - // First vector chunk - let mut x0 = vld1q_u64(buf as *const u64); - - let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; - let k = vld1q_u64(k_vals.as_ptr()); - - // Create CRC vector and XOR with first vector - let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); - x0 = veorq_u64(crc_vec, x0); - buf = buf.add(16); - len -= 16; - - // Main loop - while len >= 16 { - let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); - x0 = clmul_hi_e(x0, k, y0); - buf = buf.add(16); - len -= 16; - } - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - } - - while len >= 8 { - crc0 = __crc32cd(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - while len > 0 { - crc0 = __crc32cb(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 -#[inline] -#[cfg(target_feature = "sha3")] -#[target_feature(enable = "aes,sha3")] -unsafe fn crc32_iso_hdlc_eor3_v9s3x2e_s3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = __crc32b(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = __crc32d(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 192 { - let end = buf.add(len); - let blk = len / 192; - let klen = blk * 16; - let buf2 = buf.add(klen * 3); - let limit = buf.add(klen).sub(32); - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // First vector chunk - let mut x0 = vld1q_u64(buf2 as *const u64); - let mut x1 = vld1q_u64(buf2.add(16) as *const u64); - let mut x2 = vld1q_u64(buf2.add(32) as *const u64); - let mut x3 = vld1q_u64(buf2.add(48) as *const u64); - let mut x4 = vld1q_u64(buf2.add(64) as *const u64); - let mut x5 = vld1q_u64(buf2.add(80) as *const u64); - let mut x6 = vld1q_u64(buf2.add(96) as *const u64); - let mut x7 = vld1q_u64(buf2.add(112) as *const u64); - let mut x8 = vld1q_u64(buf2.add(128) as *const u64); - - // ISO-HDLC specific constants - let k_vals: [u64; 2] = [0x26b70c3d, 0x3f41287a]; - let mut k = 
vld1q_u64(k_vals.as_ptr()); - let mut buf2 = buf2.add(144); - - // Main loop - while buf <= limit { - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y1 = clmul_lo_eor3(x1, k); - x1 = clmul_hi_eor3(x1, k); - let y2 = clmul_lo_eor3(x2, k); - x2 = clmul_hi_eor3(x2, k); - let y3 = clmul_lo_eor3(x3, k); - x3 = clmul_hi_eor3(x3, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - let y5 = clmul_lo_eor3(x5, k); - x5 = clmul_hi_eor3(x5, k); - let y6 = clmul_lo_eor3(x6, k); - x6 = clmul_hi_eor3(x6, k); - let y7 = clmul_lo_eor3(x7, k); - x7 = clmul_hi_eor3(x7, k); - let y8 = clmul_lo_eor3(x8, k); - x8 = clmul_hi_eor3(x8, k); - - x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); - x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); - x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); - x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); - x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); - x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); - x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); - x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); - x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); - - crc0 = __crc32d(crc0, *(buf as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - - buf = buf.add(16); - buf2 = buf2.add(144); - } - - // Reduce x0 ... x8 to just x0 - let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x1); - x1 = x2; - x2 = x3; - x3 = x4; - x4 = x5; - x5 = x6; - x6 = x7; - x7 = x8; - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y2 = clmul_lo_eor3(x2, k); - x2 = clmul_hi_eor3(x2, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - let y6 = clmul_lo_eor3(x6, k); - x6 = clmul_hi_eor3(x6, k); - - x0 = veor3q_u64(x0, y0, x1); - x2 = veor3q_u64(x2, y2, x3); - x4 = veor3q_u64(x4, y4, x5); - x6 = veor3q_u64(x6, y6, x7); - - let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - let y4 = clmul_lo_eor3(x4, k); - x4 = clmul_hi_eor3(x4, k); - - x0 = veor3q_u64(x0, y0, x2); - x4 = veor3q_u64(x4, y4, x6); - - let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_eor3(x0, k); - x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x4); - - // Final scalar chunk - crc0 = __crc32d(crc0, *(buf as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - - let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + blk * 144); - let vc1 = crc_shift_iso_hdlc(crc1, klen + blk * 144); - let vc2 = crc_shift_iso_hdlc(crc2, blk * 144); - let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); - - buf = buf2; - len = 
end.offset_from(buf) as usize; - } - - if len >= 32 { - let klen = ((len - 8) / 24) * 8; - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // Main loop - loop { - crc0 = __crc32d(crc0, *(buf as *const u64)); - crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); - crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); - buf = buf.add(8); - len -= 24; - if len < 32 { - break; - } - } - - let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + 8); - let vc1 = crc_shift_iso_hdlc(crc1, klen + 8); - let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); - - // Final 8 bytes - buf = buf.add(klen * 2); - crc0 = crc2; - crc0 = __crc32d(crc0, *(buf as *const u64) ^ vc); - buf = buf.add(8); - len -= 8; - } - - while len >= 8 { - crc0 = __crc32d(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - while len > 0 { - crc0 = __crc32b(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i neon -p crc32 -a v12e_v1 -#[inline] -#[target_feature(enable = "aes")] -unsafe fn crc32_iso_hdlc_v12e_v1(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = __crc32b(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = __crc32d(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 192 { - let end = buf.add(len); - let limit = buf.add(len - 192); - - // First vector chunk - let mut x0 = vld1q_u64(buf as *const u64); - let mut x1 = vld1q_u64(buf.add(16) as *const u64); - let mut x2 = vld1q_u64(buf.add(32) as *const u64); - let mut x3 = vld1q_u64(buf.add(48) as *const u64); - let mut x4 = vld1q_u64(buf.add(64) as *const u64); - let mut x5 = vld1q_u64(buf.add(80) as *const u64); - let mut x6 = vld1q_u64(buf.add(96) as *const u64); - let mut x7 = vld1q_u64(buf.add(112) as *const u64); - let mut x8 = vld1q_u64(buf.add(128) as *const u64); - let mut x9 = vld1q_u64(buf.add(144) as *const u64); - let mut x10 = vld1q_u64(buf.add(160) as *const u64); - let mut x11 = vld1q_u64(buf.add(176) as *const u64); - - // ISO-HDLC specific constants for small implementation - let k_vals: [u64; 2] = [0x596c8d81, 0xf5e48c85]; - let mut k = vld1q_u64(k_vals.as_ptr()); - - // Create CRC vector and XOR with first vector - let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); - x0 = veorq_u64(crc_vec, x0); - buf = buf.add(192); - - // Main loop - while buf <= limit { - let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); - x0 = clmul_hi_e(x0, k, y0); - let y1 = clmul_lo_e(x1, k, vld1q_u64(buf.add(16) as *const u64)); - x1 = clmul_hi_e(x1, k, y1); - let y2 = clmul_lo_e(x2, k, vld1q_u64(buf.add(32) as *const u64)); - x2 = clmul_hi_e(x2, k, y2); - let y3 = clmul_lo_e(x3, k, vld1q_u64(buf.add(48) as *const u64)); - x3 = clmul_hi_e(x3, k, y3); - let y4 = clmul_lo_e(x4, k, vld1q_u64(buf.add(64) as *const u64)); - x4 = clmul_hi_e(x4, k, y4); - let y5 = clmul_lo_e(x5, k, vld1q_u64(buf.add(80) as *const u64)); - x5 = clmul_hi_e(x5, k, y5); - let y6 = clmul_lo_e(x6, k, vld1q_u64(buf.add(96) as *const u64)); - x6 = clmul_hi_e(x6, k, y6); - let y7 = clmul_lo_e(x7, k, vld1q_u64(buf.add(112) as *const u64)); - x7 = clmul_hi_e(x7, k, y7); - let y8 = clmul_lo_e(x8, k, vld1q_u64(buf.add(128) as *const u64)); - x8 = clmul_hi_e(x8, k, y8); - let y9 = clmul_lo_e(x9, k, vld1q_u64(buf.add(144) as *const u64)); - x9 = 
clmul_hi_e(x9, k, y9); - let y10 = clmul_lo_e(x10, k, vld1q_u64(buf.add(160) as *const u64)); - x10 = clmul_hi_e(x10, k, y10); - let y11 = clmul_lo_e(x11, k, vld1q_u64(buf.add(176) as *const u64)); - x11 = clmul_hi_e(x11, k, y11); - buf = buf.add(192); - } - - // Reduce x0 ... x11 to just x0 - let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x1); - x0 = clmul_hi_e(x0, k, y0); - let y2 = clmul_lo_e(x2, k, x3); - x2 = clmul_hi_e(x2, k, y2); - let y4 = clmul_lo_e(x4, k, x5); - x4 = clmul_hi_e(x4, k, y4); - let y6 = clmul_lo_e(x6, k, x7); - x6 = clmul_hi_e(x6, k, y6); - let y8 = clmul_lo_e(x8, k, x9); - x8 = clmul_hi_e(x8, k, y8); - let y10 = clmul_lo_e(x10, k, x11); - x10 = clmul_hi_e(x10, k, y10); - - let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x2); - x0 = clmul_hi_e(x0, k, y0); - let y4 = clmul_lo_e(x4, k, x6); - x4 = clmul_hi_e(x4, k, y4); - let y8 = clmul_lo_e(x8, k, x10); - x8 = clmul_hi_e(x8, k, y8); - - let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; - k = vld1q_u64(k_vals.as_ptr()); - - let y0 = clmul_lo_e(x0, k, x4); - x0 = clmul_hi_e(x0, k, y0); - x4 = x8; - let y0 = clmul_lo_e(x0, k, x4); - x0 = clmul_hi_e(x0, k, y0); - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - len = end.offset_from(buf) as usize; - } - - if len >= 16 { - // First vector chunk - let mut x0 = vld1q_u64(buf as *const u64); - - let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; - let k = vld1q_u64(k_vals.as_ptr()); - - // Create CRC vector and XOR with first vector - let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); - x0 = veorq_u64(crc_vec, x0); - buf = buf.add(16); - len -= 16; - - // Main loop - while len >= 16 { - let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); - x0 = clmul_hi_e(x0, k, y0); - buf = buf.add(16); - len -= 16; - } - - // Reduce 128 bits to 32 bits, and multiply by x^32 - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - } - - while len >= 8 { - crc0 = __crc32d(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - while len > 0 { - crc0 = __crc32b(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::test::consts::TEST_CHECK_STRING; - use crc::{Crc, Table}; - use rand::{rng, Rng}; - - const RUST_CRC32_ISO_HDLC: Crc> = - Crc::>::new(&crc::CRC_32_ISO_HDLC); - - const RUST_CRC32_ISCSI: Crc> = Crc::>::new(&crc::CRC_32_ISCSI); - - #[test] - fn test_crc32_iso_hdlc_check() { - assert_eq!( - crc32_iso_hdlc(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, - 0xcbf43926 - ); - } - - #[test] - fn test_crc32_iso_hdlc_small_all_lengths() { - for len in 1..=255 { - crc32_iso_hdlc_random(len) - } - } - - #[test] - fn test_crc32_iso_hdlc_medium_lengths() { - // Test each length from 256 to 1024, which should fold and include handling remainders - for len in 256..=1024 { - crc32_iso_hdlc_random(len) - } - } - - #[test] - fn test_crc32_iso_hdlc_large_lengths() { - // Test 1 MiB just before, at, and just after the folding boundaries - for len in 1048575..1048577 { - crc32_iso_hdlc_random(len) - } - } - - #[test] - fn test_crc32_iscsi_check() { - assert_eq!( - crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, - 0xe3069283 - ); - } - - #[test] - fn test_crc32_iscsi_small_all_lengths() { - for len in 1..=255 { - crc32_iscsi_random(len); - } - 
} - - #[test] - fn test_crc32_iscsi_medium_lengths() { - // Test each length from 256 to 1024, which should fold and include handling remainders - for len in 256..=1024 { - crc32_iscsi_random(len); - } - } - - #[test] - fn test_crc32_iscsi_large_lengths() { - // Test 1 MiB just before, at, and just after the folding boundaries - for len in 1048575..1048577 { - crc32_iscsi_random(len); - } - } - - #[cfg(target_feature = "sha3")] - fn crc32_iso_hdlc_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISO_HDLC.checksum(&data); - - assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iso_hdlc_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - - assert_eq!( - crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } - - #[cfg(not(target_feature = "sha3"))] - fn crc32_iso_hdlc_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISO_HDLC.checksum(&data); - - assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } - - #[cfg(target_feature = "sha3")] - fn crc32_iscsi_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISCSI.checksum(&data); - - assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iscsi_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - - assert_eq!( - crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } - - #[cfg(not(target_feature = "sha3"))] - fn crc32_iscsi_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISCSI.checksum(&data); - - assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } -} diff --git a/src/crc32/fusion/aarch64/iscsi/crc_pmull.rs b/src/crc32/fusion/aarch64/iscsi/crc_pmull.rs new file mode 100644 index 0000000..1bb2473 --- /dev/null +++ b/src/crc32/fusion/aarch64/iscsi/crc_pmull.rs @@ -0,0 +1,179 @@ +//! Provides CRC-32/ISCSI calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "aarch64")] + +use crate::crc32::fusion::aarch64::{clmul_hi_and_xor, clmul_lo_and_xor}; +use std::arch::aarch64::{ + __crc32cb, __crc32cd, veorq_u64, vgetq_lane_u64, vld1q_u64, vmovq_n_u64, vsetq_lane_u64, +}; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i neon -p crc32c -a v12e_v1 +/// +/// Modified as necessary for this Rust implementation. 
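+///
+/// A minimal usage sketch: the caller must first confirm that the `crc` and
+/// `aes` target features are available (see the `#[target_feature]` attribute
+/// below), otherwise calling this function is undefined behavior. The expected
+/// result is the standard CRC-32/ISCSI check value for "123456789":
+///
+/// ```ignore
+/// let data = b"123456789";
+/// // CRC-32/ISCSI uses 0xffffffff as both the initial value and the final XOR.
+/// let crc = unsafe { crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) } ^ 0xffffffff;
+/// assert_eq!(crc, 0xe3069283);
+/// ```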
+#[inline] +#[target_feature(enable = "crc,aes")] +pub(crate) unsafe fn crc32_iscsi_v12e_v1(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let limit = buf.add(len - 192); + + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + let mut x1 = vld1q_u64(buf.add(16) as *const u64); + let mut x2 = vld1q_u64(buf.add(32) as *const u64); + let mut x3 = vld1q_u64(buf.add(48) as *const u64); + let mut x4 = vld1q_u64(buf.add(64) as *const u64); + let mut x5 = vld1q_u64(buf.add(80) as *const u64); + let mut x6 = vld1q_u64(buf.add(96) as *const u64); + let mut x7 = vld1q_u64(buf.add(112) as *const u64); + let mut x8 = vld1q_u64(buf.add(128) as *const u64); + let mut x9 = vld1q_u64(buf.add(144) as *const u64); + let mut x10 = vld1q_u64(buf.add(160) as *const u64); + let mut x11 = vld1q_u64(buf.add(176) as *const u64); + + let k_vals: [u64; 2] = [0xa87ab8a8, 0xab7aff2a]; + let mut k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(192); + + // Main loop + while buf <= limit { + let y0 = clmul_lo_and_xor(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_and_xor(x0, k, y0); + let y1 = clmul_lo_and_xor(x1, k, vld1q_u64(buf.add(16) as *const u64)); + x1 = clmul_hi_and_xor(x1, k, y1); + let y2 = clmul_lo_and_xor(x2, k, vld1q_u64(buf.add(32) as *const u64)); + x2 = clmul_hi_and_xor(x2, k, y2); + let y3 = clmul_lo_and_xor(x3, k, vld1q_u64(buf.add(48) as *const u64)); + x3 = clmul_hi_and_xor(x3, k, y3); + let y4 = clmul_lo_and_xor(x4, k, vld1q_u64(buf.add(64) as *const u64)); + x4 = clmul_hi_and_xor(x4, k, y4); + let y5 = clmul_lo_and_xor(x5, k, vld1q_u64(buf.add(80) as *const u64)); + x5 = clmul_hi_and_xor(x5, k, y5); + let y6 = clmul_lo_and_xor(x6, k, vld1q_u64(buf.add(96) as *const u64)); + x6 = clmul_hi_and_xor(x6, k, y6); + let y7 = clmul_lo_and_xor(x7, k, vld1q_u64(buf.add(112) as *const u64)); + x7 = clmul_hi_and_xor(x7, k, y7); + let y8 = clmul_lo_and_xor(x8, k, vld1q_u64(buf.add(128) as *const u64)); + x8 = clmul_hi_and_xor(x8, k, y8); + let y9 = clmul_lo_and_xor(x9, k, vld1q_u64(buf.add(144) as *const u64)); + x9 = clmul_hi_and_xor(x9, k, y9); + let y10 = clmul_lo_and_xor(x10, k, vld1q_u64(buf.add(160) as *const u64)); + x10 = clmul_hi_and_xor(x10, k, y10); + let y11 = clmul_lo_and_xor(x11, k, vld1q_u64(buf.add(176) as *const u64)); + x11 = clmul_hi_and_xor(x11, k, y11); + buf = buf.add(192); + } + + // Reduce x0 ... 
x11 to just x0 + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x1); + x0 = clmul_hi_and_xor(x0, k, y0); + let y2 = clmul_lo_and_xor(x2, k, x3); + x2 = clmul_hi_and_xor(x2, k, y2); + let y4 = clmul_lo_and_xor(x4, k, x5); + x4 = clmul_hi_and_xor(x4, k, y4); + let y6 = clmul_lo_and_xor(x6, k, x7); + x6 = clmul_hi_and_xor(x6, k, y6); + let y8 = clmul_lo_and_xor(x8, k, x9); + x8 = clmul_hi_and_xor(x8, k, y8); + let y10 = clmul_lo_and_xor(x10, k, x11); + x10 = clmul_hi_and_xor(x10, k, y10); + + let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x2); + x0 = clmul_hi_and_xor(x0, k, y0); + let y4 = clmul_lo_and_xor(x4, k, x6); + x4 = clmul_hi_and_xor(x4, k, y4); + let y8 = clmul_lo_and_xor(x8, k, x10); + x8 = clmul_hi_and_xor(x8, k, y8); + + let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x4); + x0 = clmul_hi_and_xor(x0, k, y0); + x4 = x8; + let y0 = clmul_lo_and_xor(x0, k, x4); + x0 = clmul_hi_and_xor(x0, k, y0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); + len = end.offset_from(buf) as usize; + } + + if len >= 16 { + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + let k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(16); + len -= 16; + + // Main loop + while len >= 16 { + let y0 = clmul_lo_and_xor(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_and_xor(x0, k, y0); + buf = buf.add(16); + len -= 16; + } + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); + } + + while len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/aarch64/iscsi/crc_pmull_sha3.rs b/src/crc32/fusion/aarch64/iscsi/crc_pmull_sha3.rs new file mode 100644 index 0000000..137cb5c --- /dev/null +++ b/src/crc32/fusion/aarch64/iscsi/crc_pmull_sha3.rs @@ -0,0 +1,266 @@ +//! Provides CRC-32/ISCSI calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "aarch64")] + +use crate::crc32::fusion::aarch64::{clmul_hi, clmul_lo, clmul_scalar}; +use std::arch::aarch64::{ + __crc32cb, __crc32cd, __crc32cw, uint64x2_t, veor3q_u64, veorq_u64, vgetq_lane_u64, vld1q_u64, + vmov_n_u64, vmull_p8, vreinterpret_p8_u64, vreinterpretq_u64_p16, +}; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i neon_eor3 -p crc32c -a v9s3x2e_s3 +/// +/// Modified as necessary for this Rust implementation. 
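+///
+/// This variant interleaves three scalar CRC streams (`crc0`..`crc2`) with nine
+/// 16-byte PMULL folding accumulators combined via the SHA3 EOR3 instruction,
+/// then merges the streams with `crc_shift_iscsi`, a carry-less multiply by
+/// x^n mod P. A minimal usage sketch: the caller must first confirm the `crc`,
+/// `aes`, and `sha3` target features, otherwise the call is undefined behavior:
+///
+/// ```ignore
+/// let data = b"123456789";
+/// // Same convention as the PMULL-only variant: init and final XOR are 0xffffffff.
+/// let crc =
+///     unsafe { crc32_iscsi_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) } ^ 0xffffffff;
+/// assert_eq!(crc, 0xe3069283);
+/// ```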
+#[inline] +#[target_feature(enable = "crc,aes,sha3")] +pub(crate) unsafe fn crc32_iscsi_eor3_v9s3x2e_s3( + mut crc0: u32, + mut buf: *const u8, + mut len: usize, +) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let blk = len / 192; + let klen = blk * 16; + let buf2 = buf.add(klen * 3); + let limit = buf.add(klen).sub(32); + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk + let mut x0 = vld1q_u64(buf2 as *const u64); + let mut x1 = vld1q_u64(buf2.add(16) as *const u64); + let mut x2 = vld1q_u64(buf2.add(32) as *const u64); + let mut x3 = vld1q_u64(buf2.add(48) as *const u64); + let mut x4 = vld1q_u64(buf2.add(64) as *const u64); + let mut x5 = vld1q_u64(buf2.add(80) as *const u64); + let mut x6 = vld1q_u64(buf2.add(96) as *const u64); + let mut x7 = vld1q_u64(buf2.add(112) as *const u64); + let mut x8 = vld1q_u64(buf2.add(128) as *const u64); + + let k_vals: [u64; 2] = [0x7e908048, 0xc96cfdc0]; + let mut k = vld1q_u64(k_vals.as_ptr()); + let mut buf2 = buf2.add(144); + + // Main loop + while buf <= limit { + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y1 = clmul_lo(x1, k); + x1 = clmul_hi(x1, k); + let y2 = clmul_lo(x2, k); + x2 = clmul_hi(x2, k); + let y3 = clmul_lo(x3, k); + x3 = clmul_hi(x3, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + let y5 = clmul_lo(x5, k); + x5 = clmul_hi(x5, k); + let y6 = clmul_lo(x6, k); + x6 = clmul_hi(x6, k); + let y7 = clmul_lo(x7, k); + x7 = clmul_hi(x7, k); + let y8 = clmul_lo(x8, k); + x8 = clmul_hi(x8, k); + + x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); + x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); + x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); + x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); + x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); + x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); + x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); + x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); + x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); + + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + buf = buf.add(16); + buf2 = buf2.add(144); + } + + // Reduce x0 ... 
x8 to just x0 + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, x1); + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + x7 = x8; + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y2 = clmul_lo(x2, k); + x2 = clmul_hi(x2, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + let y6 = clmul_lo(x6, k); + x6 = clmul_hi(x6, k); + + x0 = veor3q_u64(x0, y0, x1); + x2 = veor3q_u64(x2, y2, x3); + x4 = veor3q_u64(x4, y4, x5); + x6 = veor3q_u64(x6, y6, x7); + + let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + + x0 = veor3q_u64(x0, y0, x2); + x4 = veor3q_u64(x4, y4, x6); + + let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, x4); + + // Final scalar chunk + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + let vc0 = crc_shift_iscsi(crc0, klen * 2 + blk * 144); + let vc1 = crc_shift_iscsi(crc1, klen + blk * 144); + let vc2 = crc_shift_iscsi(crc2, blk * 144); + let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1)); + + buf = buf2; + len = end.offset_from(buf) as usize; + } + + if len >= 32 { + let klen = ((len - 8) / 24) * 8; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // Main loop + loop { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + buf = buf.add(8); + len -= 24; + if len < 32 { + break; + } + } + + let vc0 = crc_shift_iscsi(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi(crc1, klen + 8); + let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = __crc32cd(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + while len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn crc_shift_iscsi(crc: u32, nbytes: usize) -> uint64x2_t { + clmul_scalar(crc, xnmodp_crc32_iscsi((nbytes * 8 - 33) as u64)) +} + +// x^n mod P, in log(n) time +#[inline] +#[target_feature(enable = "crc,aes")] +unsafe fn xnmodp_crc32_iscsi(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) - 16; + } + stack = !stack; + acc = 0x80000000u32 >> (n & 31); + n >>= 5; + + while n > 0 { + // ARM CRC32 instruction + acc = __crc32cw(acc, 0); + n -= 1; + } + + while { + low = (stack & 1) as u32; + stack >>= 1; + stack != 0 + } { + // Convert to polynomial type and square it + let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); + let squared = vmull_p8(x, x); + let y = 
vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); + acc = __crc32cd(0, y << low); + } + acc +} diff --git a/src/crc32/fusion/aarch64/iscsi/mod.rs b/src/crc32/fusion/aarch64/iscsi/mod.rs new file mode 100644 index 0000000..653ea8a --- /dev/null +++ b/src/crc32/fusion/aarch64/iscsi/mod.rs @@ -0,0 +1,9 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides aarch64-specific implementations of CRC-32/ISCSI calculations using +//! fusion techniques. + +#![cfg(target_arch = "aarch64")] + +pub(crate) mod crc_pmull; +pub(crate) mod crc_pmull_sha3; diff --git a/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull.rs b/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull.rs new file mode 100644 index 0000000..b57a959 --- /dev/null +++ b/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull.rs @@ -0,0 +1,184 @@ +//! Provides CRC-32/ISO-HDLC calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "aarch64")] + +use crate::crc32::fusion::aarch64::{clmul_hi_and_xor, clmul_lo_and_xor}; +use std::arch::aarch64::{ + __crc32b, __crc32d, veorq_u64, vgetq_lane_u64, vld1q_u64, vmovq_n_u64, vsetq_lane_u64, +}; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i neon -p crc32 -a v12e_v1 +/// +/// Modified as necessary for this Rust implementation. +#[inline] +#[target_feature(enable = "crc,aes")] +pub(crate) unsafe fn crc32_iso_hdlc_v12e_v1( + mut crc0: u32, + mut buf: *const u8, + mut len: usize, +) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let limit = buf.add(len - 192); + + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + let mut x1 = vld1q_u64(buf.add(16) as *const u64); + let mut x2 = vld1q_u64(buf.add(32) as *const u64); + let mut x3 = vld1q_u64(buf.add(48) as *const u64); + let mut x4 = vld1q_u64(buf.add(64) as *const u64); + let mut x5 = vld1q_u64(buf.add(80) as *const u64); + let mut x6 = vld1q_u64(buf.add(96) as *const u64); + let mut x7 = vld1q_u64(buf.add(112) as *const u64); + let mut x8 = vld1q_u64(buf.add(128) as *const u64); + let mut x9 = vld1q_u64(buf.add(144) as *const u64); + let mut x10 = vld1q_u64(buf.add(160) as *const u64); + let mut x11 = vld1q_u64(buf.add(176) as *const u64); + + // ISO-HDLC specific constants for small implementation + let k_vals: [u64; 2] = [0x596c8d81, 0xf5e48c85]; + let mut k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(192); + + // Main loop + while buf <= limit { + let y0 = clmul_lo_and_xor(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_and_xor(x0, k, y0); + let y1 = clmul_lo_and_xor(x1, k, vld1q_u64(buf.add(16) as *const u64)); + x1 = clmul_hi_and_xor(x1, k, y1); + let y2 = 
clmul_lo_and_xor(x2, k, vld1q_u64(buf.add(32) as *const u64)); + x2 = clmul_hi_and_xor(x2, k, y2); + let y3 = clmul_lo_and_xor(x3, k, vld1q_u64(buf.add(48) as *const u64)); + x3 = clmul_hi_and_xor(x3, k, y3); + let y4 = clmul_lo_and_xor(x4, k, vld1q_u64(buf.add(64) as *const u64)); + x4 = clmul_hi_and_xor(x4, k, y4); + let y5 = clmul_lo_and_xor(x5, k, vld1q_u64(buf.add(80) as *const u64)); + x5 = clmul_hi_and_xor(x5, k, y5); + let y6 = clmul_lo_and_xor(x6, k, vld1q_u64(buf.add(96) as *const u64)); + x6 = clmul_hi_and_xor(x6, k, y6); + let y7 = clmul_lo_and_xor(x7, k, vld1q_u64(buf.add(112) as *const u64)); + x7 = clmul_hi_and_xor(x7, k, y7); + let y8 = clmul_lo_and_xor(x8, k, vld1q_u64(buf.add(128) as *const u64)); + x8 = clmul_hi_and_xor(x8, k, y8); + let y9 = clmul_lo_and_xor(x9, k, vld1q_u64(buf.add(144) as *const u64)); + x9 = clmul_hi_and_xor(x9, k, y9); + let y10 = clmul_lo_and_xor(x10, k, vld1q_u64(buf.add(160) as *const u64)); + x10 = clmul_hi_and_xor(x10, k, y10); + let y11 = clmul_lo_and_xor(x11, k, vld1q_u64(buf.add(176) as *const u64)); + x11 = clmul_hi_and_xor(x11, k, y11); + buf = buf.add(192); + } + + // Reduce x0 ... x11 to just x0 + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x1); + x0 = clmul_hi_and_xor(x0, k, y0); + let y2 = clmul_lo_and_xor(x2, k, x3); + x2 = clmul_hi_and_xor(x2, k, y2); + let y4 = clmul_lo_and_xor(x4, k, x5); + x4 = clmul_hi_and_xor(x4, k, y4); + let y6 = clmul_lo_and_xor(x6, k, x7); + x6 = clmul_hi_and_xor(x6, k, y6); + let y8 = clmul_lo_and_xor(x8, k, x9); + x8 = clmul_hi_and_xor(x8, k, y8); + let y10 = clmul_lo_and_xor(x10, k, x11); + x10 = clmul_hi_and_xor(x10, k, y10); + + let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x2); + x0 = clmul_hi_and_xor(x0, k, y0); + let y4 = clmul_lo_and_xor(x4, k, x6); + x4 = clmul_hi_and_xor(x4, k, y4); + let y8 = clmul_lo_and_xor(x8, k, x10); + x8 = clmul_hi_and_xor(x8, k, y8); + + let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_and_xor(x0, k, x4); + x0 = clmul_hi_and_xor(x0, k, y0); + x4 = x8; + let y0 = clmul_lo_and_xor(x0, k, x4); + x0 = clmul_hi_and_xor(x0, k, y0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); + len = end.offset_from(buf) as usize; + } + + if len >= 16 { + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + let k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(16); + len -= 16; + + // Main loop + while len >= 16 { + let y0 = clmul_lo_and_xor(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_and_xor(x0, k, y0); + buf = buf.add(16); + len -= 16; + } + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); + } + + while len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull_sha3.rs b/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull_sha3.rs new file mode 100644 index 0000000..3483774 --- /dev/null +++ 
b/src/crc32/fusion/aarch64/iso_hdlc/crc_pmull_sha3.rs @@ -0,0 +1,267 @@ +//! Provides CRC-32/ISO-HDLC calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "aarch64")] + +use crate::crc32::fusion::aarch64::{clmul_hi, clmul_lo, clmul_scalar}; +use std::arch::aarch64::{ + __crc32b, __crc32d, __crc32w, uint64x2_t, veor3q_u64, veorq_u64, vgetq_lane_u64, vld1q_u64, + vmov_n_u64, vmull_p8, vreinterpret_p8_u64, vreinterpretq_u64_p16, +}; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 +/// +/// Modified as necessary for this Rust implementation. +#[inline] +#[target_feature(enable = "crc,aes,sha3")] +pub(crate) unsafe fn crc32_iso_hdlc_eor3_v9s3x2e_s3( + mut crc0: u32, + mut buf: *const u8, + mut len: usize, +) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let blk = len / 192; + let klen = blk * 16; + let buf2 = buf.add(klen * 3); + let limit = buf.add(klen).sub(32); + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk + let mut x0 = vld1q_u64(buf2 as *const u64); + let mut x1 = vld1q_u64(buf2.add(16) as *const u64); + let mut x2 = vld1q_u64(buf2.add(32) as *const u64); + let mut x3 = vld1q_u64(buf2.add(48) as *const u64); + let mut x4 = vld1q_u64(buf2.add(64) as *const u64); + let mut x5 = vld1q_u64(buf2.add(80) as *const u64); + let mut x6 = vld1q_u64(buf2.add(96) as *const u64); + let mut x7 = vld1q_u64(buf2.add(112) as *const u64); + let mut x8 = vld1q_u64(buf2.add(128) as *const u64); + + // ISO-HDLC specific constants + let k_vals: [u64; 2] = [0x26b70c3d, 0x3f41287a]; + let mut k = vld1q_u64(k_vals.as_ptr()); + let mut buf2 = buf2.add(144); + + // Main loop + while buf <= limit { + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y1 = clmul_lo(x1, k); + x1 = clmul_hi(x1, k); + let y2 = clmul_lo(x2, k); + x2 = clmul_hi(x2, k); + let y3 = clmul_lo(x3, k); + x3 = clmul_hi(x3, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + let y5 = clmul_lo(x5, k); + x5 = clmul_hi(x5, k); + let y6 = clmul_lo(x6, k); + x6 = clmul_hi(x6, k); + let y7 = clmul_lo(x7, k); + x7 = clmul_hi(x7, k); + let y8 = clmul_lo(x8, k); + x8 = clmul_hi(x8, k); + + x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); + x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); + x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); + x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); + x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); + x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); + x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); + x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); + x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); + + crc0 = 
__crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + buf = buf.add(16); + buf2 = buf2.add(144); + } + + // Reduce x0 ... x8 to just x0 + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, x1); + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + x7 = x8; + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y2 = clmul_lo(x2, k); + x2 = clmul_hi(x2, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + let y6 = clmul_lo(x6, k); + x6 = clmul_hi(x6, k); + + x0 = veor3q_u64(x0, y0, x1); + x2 = veor3q_u64(x2, y2, x3); + x4 = veor3q_u64(x4, y4, x5); + x6 = veor3q_u64(x6, y6, x7); + + let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + let y4 = clmul_lo(x4, k); + x4 = clmul_hi(x4, k); + + x0 = veor3q_u64(x0, y0, x2); + x4 = veor3q_u64(x4, y4, x6); + + let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo(x0, k); + x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, x4); + + // Final scalar chunk + crc0 = __crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + blk * 144); + let vc1 = crc_shift_iso_hdlc(crc1, klen + blk * 144); + let vc2 = crc_shift_iso_hdlc(crc2, blk * 144); + let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); + + buf = buf2; + len = end.offset_from(buf) as usize; + } + + if len >= 32 { + let klen = ((len - 8) / 24) * 8; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // Main loop + loop { + crc0 = __crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + buf = buf.add(8); + len -= 24; + if len < 32 { + break; + } + } + + let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + 8); + let vc1 = crc_shift_iso_hdlc(crc1, klen + 8); + let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = __crc32d(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + while len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn crc_shift_iso_hdlc(crc: u32, nbytes: usize) -> uint64x2_t { + clmul_scalar(crc, xnmodp_iso_hdlc((nbytes * 8 - 33) as u64)) +} + +// x^n mod P, in log(n) time +#[inline] +#[target_feature(enable = "crc,aes")] +unsafe fn xnmodp_iso_hdlc(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) 
- 16; + } + stack = !stack; + acc = 0x80000000u32 >> (n & 31); + n >>= 5; + + while n > 0 { + // ARM CRC32 instruction (ISO-HDLC uses standard CRC32, not CRC32C) + acc = __crc32w(acc, 0); + n -= 1; + } + + while { + low = (stack & 1) as u32; + stack >>= 1; + stack != 0 + } { + // Convert to polynomial type and square it + let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); + let squared = vmull_p8(x, x); + let y = vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); + acc = __crc32d(0, y << low); + } + acc +} diff --git a/src/crc32/fusion/aarch64/iso_hdlc/mod.rs b/src/crc32/fusion/aarch64/iso_hdlc/mod.rs new file mode 100644 index 0000000..2f3db58 --- /dev/null +++ b/src/crc32/fusion/aarch64/iso_hdlc/mod.rs @@ -0,0 +1,9 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides aarch64-specific implementations of CRC-32/ISO-HDLC calculations using +//! fusion techniques. + +#![cfg(target_arch = "aarch64")] + +pub(crate) mod crc_pmull; +pub(crate) mod crc_pmull_sha3; diff --git a/src/crc32/fusion/aarch64/mod.rs b/src/crc32/fusion/aarch64/mod.rs new file mode 100644 index 0000000..de15ab4 --- /dev/null +++ b/src/crc32/fusion/aarch64/mod.rs @@ -0,0 +1,275 @@ +//! Provides CRC-32/ISCSI and CRC-32/ISO-HDLC calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "aarch64")] + +mod iscsi; +mod iso_hdlc; + +use std::arch::aarch64::*; +use std::arch::is_aarch64_feature_detected; + +use iscsi::crc_pmull::crc32_iscsi_v12e_v1; +use iscsi::crc_pmull_sha3::crc32_iscsi_eor3_v9s3x2e_s3; +use iso_hdlc::crc_pmull::crc32_iso_hdlc_v12e_v1; +use iso_hdlc::crc_pmull_sha3::crc32_iso_hdlc_eor3_v9s3x2e_s3; + +#[inline(always)] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + if is_aarch64_feature_detected!("sha3") { + unsafe { crc32_iscsi_aes_sha3(crc, data) } + } else { + unsafe { crc32_iscsi_aes(crc, data) } + } +} + +#[inline(always)] +pub fn crc32_iso_hdlc(crc: u32, data: &[u8]) -> u32 { + if is_aarch64_feature_detected!("sha3") { + unsafe { crc32_iso_hdlc_aes_sha3(crc, data) } + } else { + unsafe { crc32_iso_hdlc_aes(crc, data) } + } +} + +/// Safe wrapper for CRC32 iSCSI calculation +#[inline] +#[target_feature(enable = "crc,aes,sha3")] +unsafe fn crc32_iscsi_aes_sha3(crc: u32, data: &[u8]) -> u32 { + unsafe { + const LARGE_BUFFER_THRESHOLD: usize = 1024; + + // Select implementation based on buffer size + if data.len() <= LARGE_BUFFER_THRESHOLD { + crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) + } else { + crc32_iscsi_eor3_v9s3x2e_s3(crc, data.as_ptr(), data.len()) + } + } +} + +#[inline] +#[target_feature(enable = "crc,aes")] +unsafe fn crc32_iscsi_aes(crc: u32, data: &[u8]) -> u32 { + unsafe { crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) } +} + +/// Safe wrapper for CRC32 ISO-HDLC calculation +#[inline] +#[target_feature(enable = "crc,aes,sha3")] +unsafe fn crc32_iso_hdlc_aes_sha3(crc: u32, data: &[u8]) -> u32 { + unsafe { + const LARGE_BUFFER_THRESHOLD: usize = 1024; + + // Select implementation based on buffer size + if data.len() <= LARGE_BUFFER_THRESHOLD { + crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) + } else { + crc32_iso_hdlc_eor3_v9s3x2e_s3(crc, 
data.as_ptr(), data.len())
+        }
+    }
+}
+
+#[inline]
+#[target_feature(enable = "crc,aes")]
+unsafe fn crc32_iso_hdlc_aes(crc: u32, data: &[u8]) -> u32 {
+    unsafe { crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) }
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+unsafe fn clmul_lo(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+    // Polynomial multiply low parts - convert u128 result to uint64x2_t
+    let result = vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0));
+    vreinterpretq_u64_p128(result)
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+unsafe fn clmul_hi(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+    // Polynomial multiply high parts - convert u128 result to uint64x2_t
+    let result = vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1));
+    vreinterpretq_u64_p128(result)
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+unsafe fn clmul_scalar(a: u32, b: u32) -> uint64x2_t {
+    // Polynomial multiply scalars - convert u128 result to uint64x2_t
+    let result = vmull_p64(a as u64, b as u64);
+    vreinterpretq_u64_p128(result)
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+unsafe fn clmul_lo_and_xor(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
+    veorq_u64(clmul_lo(a, b), c)
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+unsafe fn clmul_hi_and_xor(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
+    veorq_u64(clmul_hi(a, b), c)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::test::consts::TEST_CHECK_STRING;
+    use crc::{Crc, Table};
+    use rand::{rng, Rng};
+
+    const RUST_CRC32_ISO_HDLC: Crc<u32, Table<16>> =
+        Crc::<u32, Table<16>>::new(&crc::CRC_32_ISO_HDLC);
+
+    const RUST_CRC32_ISCSI: Crc<u32, Table<16>> = Crc::<u32, Table<16>>::new(&crc::CRC_32_ISCSI);
+
+    #[test]
+    fn test_crc32_iso_hdlc_check() {
+        assert_eq!(
+            crc32_iso_hdlc(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff,
+            0xcbf43926
+        );
+    }
+
+    #[test]
+    fn test_crc32_iso_hdlc_small_all_lengths() {
+        for len in 1..=255 {
+            crc32_iso_hdlc_random(len)
+        }
+    }
+
+    #[test]
+    fn test_crc32_iso_hdlc_medium_lengths() {
+        // Test each length from 256 to 1024, which should exercise folding and remainder handling
+        for len in 256..=1024 {
+            crc32_iso_hdlc_random(len)
+        }
+    }
+
+    #[test]
+    fn test_crc32_iso_hdlc_large_lengths() {
+        // Test 1 MiB just before, at, and just after the folding boundaries
+        for len in 1048575..=1048577 {
+            crc32_iso_hdlc_random(len)
+        }
+    }
+
+    #[test]
+    fn test_crc32_iscsi_check() {
+        assert_eq!(
+            crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff,
+            0xe3069283
+        );
+    }
+
+    #[test]
+    fn test_crc32_iscsi_small_all_lengths() {
+        for len in 1..=255 {
+            crc32_iscsi_random(len);
+        }
+    }
+
+    #[test]
+    fn test_crc32_iscsi_medium_lengths() {
+        // Test each length from 256 to 1024, which should exercise folding and remainder handling
+        for len in 256..=1024 {
+            crc32_iscsi_random(len);
+        }
+    }
+
+    #[test]
+    fn test_crc32_iscsi_large_lengths() {
+        // Test 1 MiB just before, at, and just after the folding boundaries
+        for len in 1048575..=1048577 {
+            crc32_iscsi_random(len);
+        }
+    }
+
+    #[cfg(target_feature = "sha3")]
+    fn crc32_iso_hdlc_random(len: usize) {
+        let mut data = vec![0u8; len];
+        rng().fill(&mut data[..]);
+
+        let checksum = RUST_CRC32_ISO_HDLC.checksum(&data);
+
+        assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum);
+
+        unsafe {
+            assert_eq!(
+                crc32_iso_hdlc_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff,
+                checksum
+            );
+
+            assert_eq!(
+                crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff,
+                checksum
+            );
+        }
+    }
+
+    #[cfg(not(target_feature =
"sha3"))] + fn crc32_iso_hdlc_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISO_HDLC.checksum(&data); + + assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + #[cfg(target_feature = "sha3")] + fn crc32_iscsi_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISCSI.checksum(&data); + + assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iscsi_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + + assert_eq!( + crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + #[cfg(not(target_feature = "sha3"))] + fn crc32_iscsi_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISCSI.checksum(&data); + + assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } +} diff --git a/src/crc32/fusion/mod.rs b/src/crc32/fusion/mod.rs index d75a64b..774ce3b 100644 --- a/src/crc32/fusion/mod.rs +++ b/src/crc32/fusion/mod.rs @@ -11,24 +11,23 @@ mod aarch64; mod x86; +/// Only AArch64 has native CRC-32/ISO-HDLC instructions #[inline(always)] -#[allow(unused)] +#[cfg(target_arch = "aarch64")] pub(crate) fn crc32_iso_hdlc(state: u32, data: &[u8]) -> u32 { - #[cfg(target_arch = "aarch64")] - return aarch64::crc32_iso_hdlc(state, data); - - #[cfg(not(target_arch = "aarch64"))] - panic!("CRC-32/ISO-HDLC with fusion is only supported on AArch64 architecture"); + aarch64::crc32_iso_hdlc(state, data) } +/// Both AArch64 and x86 have native CRC-32/ISCSI instructions #[inline(always)] pub(crate) fn crc32_iscsi(state: u32, data: &[u8]) -> u32 { #[cfg(target_arch = "aarch64")] - return aarch64::crc32_iscsi(state, data); - - #[cfg(target_arch = "x86_64")] - return x86::crc32_iscsi(state, data); + { + aarch64::crc32_iscsi(state, data) + } - #[cfg(all(not(target_arch = "aarch64"), not(target_arch = "x86_64")))] - panic!("CRC-32/ISCSI with fusion is only supported on AArch64 and X86_64 architectures"); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + x86::crc32_iscsi(state, data) + } } diff --git a/src/crc32/fusion/x86.rs b/src/crc32/fusion/x86.rs deleted file mode 100644 index c288072..0000000 --- a/src/crc32/fusion/x86.rs +++ /dev/null @@ -1,740 +0,0 @@ -//! Provides CRC-32/ISCSI and CRC-32/ISO-HDLC calculations using a fusion of native CLMUL -//! instructions and native CRC calculation instructions on x86_64. -//! -//! https://www.corsix.org/content/fast-crc32c-4k -//! https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq -//! -//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -//! with the help of Claude.ai using: -//! -//! ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 -//! ./generate -i avx512 -p crc32c -a v4s3x3 -//! ./generate -i sse -p crc32c -a v4s3x3 -//! -//! Modified as necessary for this Rust implementation. -//! -//! MIT licensed. 
- -#![cfg(target_arch = "x86_64")] - -use std::arch::x86_64::*; - -/// Safe wrapper for CRC32 iSCSI calculation using AVX-512 -#[rustversion::before(1.89)] -#[inline(always)] -pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { - unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } -} - -#[rustversion::since(1.89)] -#[inline(always)] -pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { - if is_x86_feature_detected!("vpclmulqdq") - && is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("avx512vl") - { - unsafe { - return crc32_iscsi_avx512_vpclmulqdq_v3x2(crc, data.as_ptr(), data.len()); - } - } - - if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { - unsafe { - return crc32_iscsi_avx512_v4s3x3(crc, data.as_ptr(), data.len()); - } - } - - unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } -} - -#[rustversion::since(1.89)] -#[inline] -#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq")] -unsafe fn clmul_lo_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { - _mm512_clmulepi64_epi128(a, b, 0) -} - -#[rustversion::since(1.89)] -#[inline] -#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq")] -unsafe fn clmul_hi_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { - _mm512_clmulepi64_epi128(a, b, 17) -} - -#[inline] -#[target_feature(enable = "pclmulqdq")] -unsafe fn clmul_lo_sse(a: __m128i, b: __m128i) -> __m128i { - _mm_clmulepi64_si128(a, b, 0) -} - -#[inline] -#[target_feature(enable = "pclmulqdq")] -unsafe fn clmul_hi_sse(a: __m128i, b: __m128i) -> __m128i { - _mm_clmulepi64_si128(a, b, 17) -} - -#[inline] -#[target_feature(enable = "pclmulqdq")] -unsafe fn clmul_scalar_sse(a: u32, b: u32) -> __m128i { - _mm_clmulepi64_si128(_mm_cvtsi32_si128(a as i32), _mm_cvtsi32_si128(b as i32), 0) -} - -// x^n mod P, in log(n) time -#[target_feature(enable = "sse4.2,pclmulqdq")] -unsafe fn xnmodp_iscsi_sse(mut n: u64) -> u32 { - let mut stack = !1u64; - let mut acc: u32; - let mut low: u32; - - while n > 191 { - stack = (stack << 1) + (n & 1); - n = (n >> 1) - 16; - } - stack = !stack; - acc = 0x80000000u32 >> (n & 31); - n >>= 5; - - while n > 0 { - // Use hardware CRC32C instruction - acc = _mm_crc32_u32(acc, 0); - n -= 1; - } - - while { - low = (stack & 1) as u32; - stack >>= 1; - stack != 0 - } { - let x = _mm_cvtsi32_si128(acc as i32); - let y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0)) as u64; - acc = _mm_crc32_u64(0, y << low) as u32; - } - acc -} - -#[inline] -#[target_feature(enable = "pclmulqdq")] -unsafe fn crc_shift_iscsi_sse(crc: u32, nbytes: usize) -> __m128i { - clmul_scalar_sse(crc, xnmodp_iscsi_sse((nbytes * 8 - 33) as u64)) -} - -#[inline] -#[target_feature(enable = "sse4.1")] -unsafe fn mm_extract_epi64(val: __m128i, idx: i32) -> u64 { - if idx == 0 { - _mm_cvtsi128_si64(val) as u64 - } else { - _mm_cvtsi128_si64(_mm_srli_si128(val, 8)) as u64 - } -} - -#[inline] -#[target_feature(enable = "sse4.2")] -unsafe fn mm_crc32_u64(crc: u32, val: u64) -> u32 { - _mm_crc32_u64(crc.into(), val) as u32 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 -#[rustversion::since(1.89)] -#[inline] -#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq,sse4.2")] -pub unsafe fn crc32_iscsi_avx512_vpclmulqdq_v3x2( - mut crc0: u32, - mut buf: *const u8, - mut len: usize, -) -> u32 { - // Align to 8-byte boundary - while len > 0 && (buf as usize & 7) != 0 { - crc0 = _mm_crc32_u8(crc0, 
*buf); - buf = buf.add(1); - len -= 1; - } - - // Align to 64-byte boundary (cache line) - while (buf as usize & 56) != 0 && len >= 8 { - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - buf = buf.add(8); - len -= 8; - } - - if len >= 384 { - // First vector chunk - load three 512-bit vectors (192 bytes total) - let mut x0 = _mm512_loadu_si512(buf as *const __m512i); - let mut x1 = _mm512_loadu_si512(buf.add(64) as *const __m512i); - let mut x2 = _mm512_loadu_si512(buf.add(128) as *const __m512i); - - // Create the multiplication constant vector - // Pattern: [0xa87ab8a8, 0, 0xab7aff2a, 0] repeated across all 128-bit lanes - let k_128 = _mm_setr_epi32(0xa87ab8a8u32 as i32, 0, 0xab7aff2au32 as i32, 0); - let mut k = _mm512_broadcast_i32x4(k_128); - - // XOR the CRC into the first vector's low 32 bits - let crc_vec = _mm512_castsi128_si512(_mm_cvtsi32_si128(crc0 as i32)); - x0 = _mm512_xor_si512(crc_vec, x0); - - // First round of polynomial multiplication - let mut y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - let mut y1 = clmul_lo_avx512_vpclmulqdq(x1, k); - x1 = clmul_hi_avx512_vpclmulqdq(x1, k); - let mut y2 = clmul_lo_avx512_vpclmulqdq(x2, k); - x2 = clmul_hi_avx512_vpclmulqdq(x2, k); - - // XOR with next chunk of data using ternary logic (A XOR B XOR C) - // 0x96 = A XOR B XOR C in ternary logic notation - x0 = _mm512_ternarylogic_epi64( - x0, - y0, - _mm512_loadu_si512(buf.add(192) as *const __m512i), - 0x96, - ); - x1 = _mm512_ternarylogic_epi64( - x1, - y1, - _mm512_loadu_si512(buf.add(256) as *const __m512i), - 0x96, - ); - x2 = _mm512_ternarylogic_epi64( - x2, - y2, - _mm512_loadu_si512(buf.add(320) as *const __m512i), - 0x96, - ); - - buf = buf.add(384); - len -= 384; - - // Main loop - process 384 bytes at a time - while len >= 384 { - // First folding step - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - y1 = clmul_lo_avx512_vpclmulqdq(x1, k); - x1 = clmul_hi_avx512_vpclmulqdq(x1, k); - y2 = clmul_lo_avx512_vpclmulqdq(x2, k); - x2 = clmul_hi_avx512_vpclmulqdq(x2, k); - - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512(buf as *const __m512i), 0x96); - x1 = _mm512_ternarylogic_epi64( - x1, - y1, - _mm512_loadu_si512(buf.add(64) as *const __m512i), - 0x96, - ); - x2 = _mm512_ternarylogic_epi64( - x2, - y2, - _mm512_loadu_si512(buf.add(128) as *const __m512i), - 0x96, - ); - - // Second folding step - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - y1 = clmul_lo_avx512_vpclmulqdq(x1, k); - x1 = clmul_hi_avx512_vpclmulqdq(x1, k); - y2 = clmul_lo_avx512_vpclmulqdq(x2, k); - x2 = clmul_hi_avx512_vpclmulqdq(x2, k); - - x0 = _mm512_ternarylogic_epi64( - x0, - y0, - _mm512_loadu_si512(buf.add(192) as *const __m512i), - 0x96, - ); - x1 = _mm512_ternarylogic_epi64( - x1, - y1, - _mm512_loadu_si512(buf.add(256) as *const __m512i), - 0x96, - ); - x2 = _mm512_ternarylogic_epi64( - x2, - y2, - _mm512_loadu_si512(buf.add(320) as *const __m512i), - 0x96, - ); - - buf = buf.add(384); - len -= 384; - } - - // Reduce x0, x1, x2 to just x0 - let k_128 = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); - k = _mm512_broadcast_i32x4(k_128); - - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); - x1 = x2; - - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - x0 = clmul_hi_avx512_vpclmulqdq(x0, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); - - // Reduce 512 bits to 
128 bits - // Multiple reduction constants for different parts of the 512-bit vector - k = _mm512_setr_epi32( - 0x1c291d04u32 as i32, - 0, - 0xddc0152bu32 as i32, - 0, // Lane 0 - 0x3da6d0cbu32 as i32, - 0, - 0xba4fc28eu32 as i32, - 0, // Lane 1 - 0xf20c0dfeu32 as i32, - 0, - 0x493c7d27u32 as i32, - 0, // Lane 2 - 0, - 0, - 0, - 0, // Lane 3 (unused) - ); - - y0 = clmul_lo_avx512_vpclmulqdq(x0, k); - k = clmul_hi_avx512_vpclmulqdq(x0, k); - y0 = _mm512_xor_si512(y0, k); - - // Extract 128-bit lanes and combine them - let lane0 = _mm512_castsi512_si128(y0); - let lane1 = _mm512_extracti32x4_epi32(y0, 1); - let lane2 = _mm512_extracti32x4_epi32(y0, 2); - let lane3 = _mm512_extracti32x4_epi32(x0, 3); - - // Combine all lanes using ternary logic - let mut z0 = _mm_ternarylogic_epi64(lane0, lane1, lane2, 0x96); - z0 = _mm_xor_si128(z0, lane3); - - // Reduce 128 bits to 32 bits using CRC32 instructions - crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0) as u64) as u32; - crc0 = _mm_crc32_u64(crc0.into(), _mm_extract_epi64(z0, 1) as u64) as u32; - } - - // Process remaining 8-byte chunks - while len >= 8 { - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - buf = buf.add(8); - len -= 8; - } - - // Process remaining bytes - while len > 0 { - crc0 = _mm_crc32_u8(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i avx512 -p crc32c -a v4s3x3 -#[rustversion::since(1.89)] -#[inline] -#[target_feature(enable = "avx2,avx512f,avx512vl,pclmulqdq")] -pub unsafe fn crc32_iscsi_avx512_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary using hardware CRC32C instructions - while len > 0 && (buf as usize & 7) != 0 { - crc0 = _mm_crc32_u8(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - buf = buf.add(8); - len -= 8; - } - - if len >= 144 { - let blk = (len - 8) / 136; - let klen = blk * 24; - let buf2 = buf; - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // First vector chunk - load four 128-bit vectors (64 bytes total) - let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); - let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); - let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); - let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); - - // iSCSI-specific folding constant (different from ISO-HDLC) - let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); - - // XOR the CRC into the first vector's low 32 bits - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); - crc0 = 0; - - let mut buf2 = buf2.add(64); - len -= 136; - buf = buf.add(blk * 64); - - // Main loop - process 144 bytes at a time - while len >= 144 { - let y0 = clmul_lo_sse(x0, k); - x0 = clmul_hi_sse(x0, k); - let y1 = clmul_lo_sse(x1, k); - x1 = clmul_hi_sse(x1, k); - let y2 = clmul_lo_sse(x2, k); - x2 = clmul_hi_sse(x2, k); - let y3 = clmul_lo_sse(x3, k); - x3 = clmul_hi_sse(x3, k); - - // XOR with next chunk of data using ternary logic (A XOR B XOR C) - x0 = _mm_ternarylogic_epi64(x0, y0, _mm_loadu_si128(buf2 as *const __m128i), 0x96); - x1 = _mm_ternarylogic_epi64( - x1, - y1, - _mm_loadu_si128(buf2.add(16) as *const __m128i), - 0x96, - ); - x2 = _mm_ternarylogic_epi64( - x2, - y2, - _mm_loadu_si128(buf2.add(32) as *const __m128i), - 0x96, - ); - 
x3 = _mm_ternarylogic_epi64( - x3, - y3, - _mm_loadu_si128(buf2.add(48) as *const __m128i), - 0x96, - ); - - // Process scalar data in parallel using hardware CRC32C - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; - crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; - crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; - - buf = buf.add(24); - buf2 = buf2.add(64); - len -= 136; - } - - // Reduce x0 ... x3 to just x0 using iSCSI-specific constants - k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0); - - let y0 = clmul_lo_sse(x0, k); - x0 = clmul_hi_sse(x0, k); - let y2 = clmul_lo_sse(x2, k); - x2 = clmul_hi_sse(x2, k); - - x0 = _mm_ternarylogic_epi64(x0, y0, x1, 0x96); - x2 = _mm_ternarylogic_epi64(x2, y2, x3, 0x96); - - k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0); - - let y0 = clmul_lo_sse(x0, k); - x0 = clmul_hi_sse(x0, k); - x0 = _mm_ternarylogic_epi64(x0, y0, x2, 0x96); - - // Final scalar chunk - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; - crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; - crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; - crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; - crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; - buf = buf.add(24); - - let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8); - let vc1 = crc_shift_iscsi_sse(crc1, klen + 8); - let mut vc = _mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0) as u64; - - // Reduce 128 bits to 32 bits, and multiply by x^32 - let x0_low = _mm_extract_epi64(x0, 0) as u64; - let x0_high = _mm_extract_epi64(x0, 1) as u64; - let vec_crc = _mm_crc32_u64(_mm_crc32_u64(0, x0_low), x0_high); - vc ^= _mm_extract_epi64(crc_shift_iscsi_sse(vec_crc as u32, klen * 3 + 8), 0) as u64; - - // Final 8 bytes - buf = buf.add(klen * 2); - crc0 = crc2; - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64) ^ vc) as u32; - buf = buf.add(8); - len -= 8; - } - - // Process remaining 8-byte chunks using hardware CRC32C - while len >= 8 { - crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; - buf = buf.add(8); - len -= 8; - } - - // Process remaining bytes using hardware CRC32C - while len > 0 { - crc0 = _mm_crc32_u8(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - crc0 -} - -/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ -/// using: -/// -/// ./generate -i sse -p crc32c -a v4s3x3 -#[inline] -#[target_feature(enable = "sse4.2,pclmulqdq")] -pub unsafe fn crc32_iscsi_sse_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { - // Align to 8-byte boundary 
using hardware CRC32C instructions - while len > 0 && (buf as usize & 7) != 0 { - crc0 = _mm_crc32_u8(crc0, *buf); - buf = buf.add(1); - len -= 1; - } - - // Handle 8-byte alignment - if (buf as usize & 8) != 0 && len >= 8 { - crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); - buf = buf.add(8); - len -= 8; - } - - if len >= 144 { - let blk = (len - 8) / 136; - let klen = blk * 24; - let buf2 = buf; - let mut crc1 = 0u32; - let mut crc2 = 0u32; - - // First vector chunk - load four 128-bit vectors (64 bytes total) - let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); - let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); - let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); - let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); - - // iSCSI-specific folding constant (same as AVX-512 version) - let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); - - // XOR the CRC into the first vector's low 32 bits - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); - crc0 = 0; - - let mut buf2 = buf2.add(64); - len -= 136; - buf = buf.add(blk * 64); - - // Main loop - process 144 bytes at a time - while len >= 144 { - let mut y0 = clmul_lo_sse(x0, k); - x0 = clmul_hi_sse(x0, k); - let mut y1 = clmul_lo_sse(x1, k); - x1 = clmul_hi_sse(x1, k); - let mut y2 = clmul_lo_sse(x2, k); - x2 = clmul_hi_sse(x2, k); - let mut y3 = clmul_lo_sse(x3, k); - x3 = clmul_hi_sse(x3, k); - - // XOR operations using separate XOR instructions (no ternary logic in SSE) - y0 = _mm_xor_si128(y0, _mm_loadu_si128(buf2 as *const __m128i)); - x0 = _mm_xor_si128(x0, y0); - y1 = _mm_xor_si128(y1, _mm_loadu_si128(buf2.add(16) as *const __m128i)); - x1 = _mm_xor_si128(x1, y1); - y2 = _mm_xor_si128(y2, _mm_loadu_si128(buf2.add(32) as *const __m128i)); - x2 = _mm_xor_si128(x2, y2); - y3 = _mm_xor_si128(y3, _mm_loadu_si128(buf2.add(48) as *const __m128i)); - x3 = _mm_xor_si128(x3, y3); - - // Process scalar data in parallel using hardware CRC32C - crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); - crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64)); - crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64)); - crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64)); - crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64)); - crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64)); - crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64)); - crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64)); - crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64)); - - buf = buf.add(24); - buf2 = buf2.add(64); - len -= 136; - } - - // Reduce x0 ... 
x3 to just x0 using iSCSI-specific constants
-        k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0);
-
-        let mut y0 = clmul_lo_sse(x0, k);
-        x0 = clmul_hi_sse(x0, k);
-        let mut y2 = clmul_lo_sse(x2, k);
-        x2 = clmul_hi_sse(x2, k);
-
-        y0 = _mm_xor_si128(y0, x1);
-        x0 = _mm_xor_si128(x0, y0);
-        y2 = _mm_xor_si128(y2, x3);
-        x2 = _mm_xor_si128(x2, y2);
-
-        k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0);
-
-        y0 = clmul_lo_sse(x0, k);
-        x0 = clmul_hi_sse(x0, k);
-        y0 = _mm_xor_si128(y0, x2);
-        x0 = _mm_xor_si128(x0, y0);
-
-        // Final scalar chunk
-        crc0 = mm_crc32_u64(crc0, *(buf as *const u64));
-        crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64));
-        crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64));
-        crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64));
-        crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64));
-        crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64));
-        crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64));
-        crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64));
-        crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64));
-        buf = buf.add(24);
-
-        let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8);
-        let vc1 = crc_shift_iscsi_sse(crc1, klen + 8);
-        let mut vc = mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0);
-
-        // Reduce 128 bits to 32 bits, and multiply by x^32
-        // Extract the two 64-bit parts of x0 and combine them
-        let x0_low = mm_extract_epi64(x0, 0);
-        let x0_high = mm_extract_epi64(x0, 1);
-        let x0_combined = mm_extract_epi64(
-            crc_shift_iscsi_sse(mm_crc32_u64(mm_crc32_u64(0, x0_low), x0_high), klen * 3 + 8),
-            0,
-        );
-        vc ^= x0_combined;
-
-        // Final 8 bytes
-        buf = buf.add(klen * 2);
-        crc0 = crc2;
-        crc0 = mm_crc32_u64(crc0, *(buf as *const u64) ^ vc);
-        buf = buf.add(8);
-        len -= 8;
-    }
-
-    // Process remaining 8-byte chunks using hardware CRC32C
-    while len >= 8 {
-        crc0 = mm_crc32_u64(crc0, *(buf as *const u64));
-        buf = buf.add(8);
-        len -= 8;
-    }
-
-    // Process remaining bytes using hardware CRC32C
-    while len > 0 {
-        crc0 = _mm_crc32_u8(crc0, *buf);
-        buf = buf.add(1);
-        len -= 1;
-    }
-
-    crc0
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::test::consts::TEST_CHECK_STRING;
-    use crc::{Crc, Table};
-    use rand::{rng, Rng};
-
-    const RUST_CRC32_ISCSI: Crc<u32, Table<16>> = Crc::<u32, Table<16>>::new(&crc::CRC_32_ISCSI);
-
-    #[test]
-    fn test_crc32_iscsi_check() {
-        assert_eq!(
-            crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff,
-            0xe3069283
-        );
-    }
-
-    #[test]
-    fn test_crc32_iscsi_small_all_lengths() {
-        for len in 1..=255 {
-            test_crc32_iscsi_random(len);
-        }
-    }
-
-    #[test]
-    fn test_crc32_iscsi_medium_lengths() {
-        // Test each length from 256 to 1024, which should fold and include handling remainders
-        for len in 256..=1024 {
-            test_crc32_iscsi_random(len);
-        }
-    }
-
-    #[test]
-    fn test_crc32_iscsi_large_lengths() {
-        // Test 1 MiB just before, at, and just after the folding boundaries
-        for len in 1048575..1048577 {
-            test_crc32_iscsi_random(len);
-        }
-    }
-
-    #[rustversion::since(1.89)]
-    fn test_crc32_iscsi_random(len: usize) {
-        let mut data = vec![0u8; len];
-        rng().fill(&mut data[..]);
-
-        let checksum = RUST_CRC32_ISCSI.checksum(&data);
-
-        assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum);
-
-        unsafe {
-            if is_x86_feature_detected!("vpclmulqdq")
-                && is_x86_feature_detected!("avx512vl")
-                && is_x86_feature_detected!("avx512f")
-            {
-                assert_eq!(
-                    crc32_iscsi_avx512_vpclmulqdq_v3x2(0xffffffff, data.as_ptr(), data.len())
-                        ^ 0xffffffff,
-                    checksum
-
); - } - - if is_x86_feature_detected!("avx512vl") - && is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("pclmulqdq") - { - assert_eq!( - crc32_iscsi_avx512_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - - assert_eq!( - crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } - - #[rustversion::before(1.89)] - fn test_crc32_iscsi_random(len: usize) { - let mut data = vec![0u8; len]; - rng().fill(&mut data[..]); - - let checksum = RUST_CRC32_ISCSI.checksum(&data); - - assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); - - unsafe { - assert_eq!( - crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, - checksum - ); - } - } -} diff --git a/src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs b/src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs new file mode 100644 index 0000000..b3bc0ba --- /dev/null +++ b/src/crc32/fusion/x86/iscsi/avx512_pclmulqdq.rs @@ -0,0 +1,170 @@ +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! MIT licensed. + +#![cfg(target_arch = "x86_64")] + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i avx512 -p crc32c -a v4s3x3 +/// +/// Modified as necessary for this Rust implementation. +/// +/// Uses AVX-512 instructions so only available after Rust 1.89 (when AVX-512 stabilized) +#[rustversion::since(1.89)] +#[inline] +#[target_feature(enable = "avx512vl,pclmulqdq")] +pub unsafe fn crc32_iscsi_avx512_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + use crate::fusion::x86::*; + + // Align to 8-byte boundary using hardware CRC32C instructions + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + if len >= 144 { + let blk = (len - 8) / 136; + let klen = blk * 24; + let buf2 = buf; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk - load four 128-bit vectors (64 bytes total) + let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); + let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); + let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); + let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); + + // iSCSI-specific folding constant (different from ISO-HDLC) + let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + + // XOR the CRC into the first vector's low 32 bits + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); + crc0 = 0; + + let mut buf2 = buf2.add(64); + len -= 136; + buf = buf.add(blk * 64); + + // Main loop - process 144 bytes at a time + while len >= 144 { + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let y1 = clmul_lo_sse(x1, k); + x1 = clmul_hi_sse(x1, k); + let y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + let y3 = clmul_lo_sse(x3, k); + x3 = clmul_hi_sse(x3, k); + + // XOR with next chunk of data using ternary logic (A XOR B XOR C) + x0 = _mm_ternarylogic_epi64(x0, y0, _mm_loadu_si128(buf2 as *const __m128i), 0x96); + x1 = _mm_ternarylogic_epi64( + x1, + y1, + _mm_loadu_si128(buf2.add(16) as *const __m128i), + 0x96, + ); + x2 = _mm_ternarylogic_epi64( + x2, + y2, + 
_mm_loadu_si128(buf2.add(32) as *const __m128i), + 0x96, + ); + x3 = _mm_ternarylogic_epi64( + x3, + y3, + _mm_loadu_si128(buf2.add(48) as *const __m128i), + 0x96, + ); + + // Process scalar data in parallel using hardware CRC32C + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; + + buf = buf.add(24); + buf2 = buf2.add(64); + len -= 136; + } + + // Reduce x0 ... x3 to just x0 using iSCSI-specific constants + k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0); + + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + + x0 = _mm_ternarylogic_epi64(x0, y0, x1, 0x96); + x2 = _mm_ternarylogic_epi64(x2, y2, x3, 0x96); + + k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0); + + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + x0 = _mm_ternarylogic_epi64(x0, y0, x2, 0x96); + + // Final scalar chunk + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; + buf = buf.add(24); + + let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi_sse(crc1, klen + 8); + let mut vc = _mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0) as u64; + + // Reduce 128 bits to 32 bits, and multiply by x^32 + let x0_low = _mm_extract_epi64(x0, 0) as u64; + let x0_high = _mm_extract_epi64(x0, 1) as u64; + let vec_crc = _mm_crc32_u64(_mm_crc32_u64(0, x0_low), x0_high); + vc ^= _mm_extract_epi64(crc_shift_iscsi_sse(vec_crc as u32, klen * 3 + 8), 0) as u64; + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64) ^ vc) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining 8-byte chunks using hardware CRC32C + while len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes using hardware CRC32C + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs b/src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs new file mode 100644 index 0000000..56a7cb0 --- /dev/null +++ b/src/crc32/fusion/x86/iscsi/avx512_vpclmulqdq.rs @@ -0,0 +1,211 @@ +//! 
Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! MIT licensed. + +#![cfg(target_arch = "x86_64")] + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 +/// +/// Modified as necessary for this Rust implementation. +/// +/// Uses AVX-512 VPCLMULQDQ instructions, so only available after Rust 1.89 (when AVX-512 +/// stabilized) +#[rustversion::since(1.89)] +#[inline] +#[target_feature(enable = "avx512vl,vpclmulqdq")] +pub unsafe fn crc32_iscsi_avx512_vpclmulqdq_v3x2( + mut crc0: u32, + mut buf: *const u8, + mut len: usize, +) -> u32 { + use crate::fusion::x86::*; + + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Align to 64-byte boundary (cache line) + while (buf as usize & 56) != 0 && len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + if len >= 384 { + // First vector chunk - load three 512-bit vectors (192 bytes total) + let mut x0 = _mm512_loadu_si512(buf as *const __m512i); + let mut x1 = _mm512_loadu_si512(buf.add(64) as *const __m512i); + let mut x2 = _mm512_loadu_si512(buf.add(128) as *const __m512i); + + // Create the multiplication constant vector + // Pattern: [0xa87ab8a8, 0, 0xab7aff2a, 0] repeated across all 128-bit lanes + let k_128 = _mm_setr_epi32(0xa87ab8a8u32 as i32, 0, 0xab7aff2au32 as i32, 0); + let mut k = _mm512_broadcast_i32x4(k_128); + + // XOR the CRC into the first vector's low 32 bits + let crc_vec = _mm512_castsi128_si512(_mm_cvtsi32_si128(crc0 as i32)); + x0 = _mm512_xor_si512(crc_vec, x0); + + // First round of polynomial multiplication + let mut y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + let mut y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + let mut y2 = clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + // XOR with next chunk of data using ternary logic (A XOR B XOR C) + // 0x96 = A XOR B XOR C in ternary logic notation + x0 = _mm512_ternarylogic_epi64( + x0, + y0, + _mm512_loadu_si512(buf.add(192) as *const __m512i), + 0x96, + ); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(256) as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(320) as *const __m512i), + 0x96, + ); + + buf = buf.add(384); + len -= 384; + + // Main loop - process 384 bytes at a time + while len >= 384 { + // First folding step + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + y2 = clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512(buf as *const __m512i), 0x96); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(64) as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(128) as *const __m512i), + 0x96, + ); + + // Second folding step + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + y2 = 
clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + x0 = _mm512_ternarylogic_epi64( + x0, + y0, + _mm512_loadu_si512(buf.add(192) as *const __m512i), + 0x96, + ); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(256) as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(320) as *const __m512i), + 0x96, + ); + + buf = buf.add(384); + len -= 384; + } + + // Reduce x0, x1, x2 to just x0 + let k_128 = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + k = _mm512_broadcast_i32x4(k_128); + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); + x1 = x2; + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); + + // Reduce 512 bits to 128 bits + // Multiple reduction constants for different parts of the 512-bit vector + k = _mm512_setr_epi32( + 0x1c291d04u32 as i32, + 0, + 0xddc0152bu32 as i32, + 0, // Lane 0 + 0x3da6d0cbu32 as i32, + 0, + 0xba4fc28eu32 as i32, + 0, // Lane 1 + 0xf20c0dfeu32 as i32, + 0, + 0x493c7d27u32 as i32, + 0, // Lane 2 + 0, + 0, + 0, + 0, // Lane 3 (unused) + ); + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + k = clmul_hi_avx512_vpclmulqdq(x0, k); + y0 = _mm512_xor_si512(y0, k); + + // Extract 128-bit lanes and combine them + let lane0 = _mm512_castsi512_si128(y0); + let lane1 = _mm512_extracti32x4_epi32(y0, 1); + let lane2 = _mm512_extracti32x4_epi32(y0, 2); + let lane3 = _mm512_extracti32x4_epi32(x0, 3); + + // Combine all lanes using ternary logic + let mut z0 = _mm_ternarylogic_epi64(lane0, lane1, lane2, 0x96); + z0 = _mm_xor_si128(z0, lane3); + + // Reduce 128 bits to 32 bits using CRC32 instructions + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0) as u64) as u32; + crc0 = _mm_crc32_u64(crc0.into(), _mm_extract_epi64(z0, 1) as u64) as u32; + } + + // Process remaining 8-byte chunks + while len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/x86/iscsi/mod.rs b/src/crc32/fusion/x86/iscsi/mod.rs new file mode 100644 index 0000000..eaffcde --- /dev/null +++ b/src/crc32/fusion/x86/iscsi/mod.rs @@ -0,0 +1,10 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides x86-specific implementations of CRC-32/ISCSI calculations using +//! fusion techniques. + +#![cfg(any(target_arch = "x86", target_arch = "x86_64"))] + +pub(crate) mod avx512_pclmulqdq; +pub(crate) mod avx512_vpclmulqdq; +pub(crate) mod sse_pclmulqdq; diff --git a/src/crc32/fusion/x86/iscsi/sse_pclmulqdq.rs b/src/crc32/fusion/x86/iscsi/sse_pclmulqdq.rs new file mode 100644 index 0000000..bda9ae8 --- /dev/null +++ b/src/crc32/fusion/x86/iscsi/sse_pclmulqdq.rs @@ -0,0 +1,169 @@ +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! MIT licensed. 
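+//!
+//! Naming note (inferred from the generated code below, not an official description):
+//! "v4s3x3" keeps 4 x 128-bit vector accumulators folded with PCLMULQDQ, plus 3
+//! independent scalar CRC32C streams that each consume 3 x 8-byte words per iteration;
+//! the scalar streams are merged at the end using carry-less CRC shifts.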
+ +#![cfg(any(target_arch = "x86_64", target_arch = "x86"))] + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; + +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +use crate::fusion::x86::*; + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// with the help of Claude.ai using: +/// +/// ./generate -i sse -p crc32c -a v4s3x3 +/// +/// Modified as necessary for this Rust implementation. +#[inline] +#[target_feature(enable = "sse4.2,pclmulqdq")] +pub unsafe fn crc32_iscsi_sse_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary using hardware CRC32C instructions + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 144 { + let blk = (len - 8) / 136; + let klen = blk * 24; + let buf2 = buf; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk - load four 128-bit vectors (64 bytes total) + let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); + let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); + let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); + let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); + + // iSCSI-specific folding constant (same as AVX-512 version) + let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + + // XOR the CRC into the first vector's low 32 bits + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); + crc0 = 0; + + let mut buf2 = buf2.add(64); + len -= 136; + buf = buf.add(blk * 64); + + // Main loop - process 144 bytes at a time + while len >= 144 { + let mut y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let mut y1 = clmul_lo_sse(x1, k); + x1 = clmul_hi_sse(x1, k); + let mut y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + let mut y3 = clmul_lo_sse(x3, k); + x3 = clmul_hi_sse(x3, k); + + // XOR operations using separate XOR instructions (no ternary logic in SSE) + y0 = _mm_xor_si128(y0, _mm_loadu_si128(buf2 as *const __m128i)); + x0 = _mm_xor_si128(x0, y0); + y1 = _mm_xor_si128(y1, _mm_loadu_si128(buf2.add(16) as *const __m128i)); + x1 = _mm_xor_si128(x1, y1); + y2 = _mm_xor_si128(y2, _mm_loadu_si128(buf2.add(32) as *const __m128i)); + x2 = _mm_xor_si128(x2, y2); + y3 = _mm_xor_si128(y3, _mm_loadu_si128(buf2.add(48) as *const __m128i)); + x3 = _mm_xor_si128(x3, y3); + + // Process scalar data in parallel using hardware CRC32C + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64)); + + buf = buf.add(24); + buf2 = buf2.add(64); + len -= 136; + } + + // Reduce x0 ... 
x3 to just x0 using iSCSI-specific constants + k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0); + + let mut y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let mut y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + + y0 = _mm_xor_si128(y0, x1); + x0 = _mm_xor_si128(x0, y0); + y2 = _mm_xor_si128(y2, x3); + x2 = _mm_xor_si128(x2, y2); + + k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0); + + y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + y0 = _mm_xor_si128(y0, x2); + x0 = _mm_xor_si128(x0, y0); + + // Final scalar chunk + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64)); + buf = buf.add(24); + + let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi_sse(crc1, klen + 8); + let mut vc = mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + // Extract the two 64-bit parts of x0 and combine them + let x0_low = mm_extract_epi64(x0, 0); + let x0_high = mm_extract_epi64(x0, 1); + let x0_combined = mm_extract_epi64( + crc_shift_iscsi_sse(mm_crc32_u64(mm_crc32_u64(0, x0_low), x0_high), klen * 3 + 8), + 0, + ); + vc ^= x0_combined; + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = mm_crc32_u64(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + // Process remaining 8-byte chunks using hardware CRC32C + while len >= 8 { + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes using hardware CRC32C + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} diff --git a/src/crc32/fusion/x86/mod.rs b/src/crc32/fusion/x86/mod.rs new file mode 100644 index 0000000..a8043b3 --- /dev/null +++ b/src/crc32/fusion/x86/mod.rs @@ -0,0 +1,289 @@ +//! Provides CRC-32/ISCSI calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on x86 / x86_64. +//! +//! https://www.corsix.org/content/fast-crc32c-4k +//! https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai. +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(any(target_arch = "x86", target_arch = "x86_64"))] + +mod iscsi; + +use iscsi::sse_pclmulqdq::crc32_iscsi_sse_v4s3x3; + +#[cfg(target_arch = "x86_64")] +#[rustversion::since(1.89)] +use iscsi::avx512_pclmulqdq::crc32_iscsi_avx512_v4s3x3; +#[cfg(target_arch = "x86_64")] +#[rustversion::since(1.89)] +use iscsi::avx512_vpclmulqdq::crc32_iscsi_avx512_vpclmulqdq_v3x2; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +/// CRC32 iSCSI calculation for Rust versions before 1.89 (pre-AVX-512 support) +/// +/// +/// This function is called by the wrapper layer after feature detection has been performed. 
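+///
+/// A minimal usage sketch with the standard CRC check string (the same value the
+/// tests below exercise via TEST_CHECK_STRING):
+///
+/// ```ignore
+/// let crc = crc32_iscsi(0xffffffff, b"123456789") ^ 0xffffffff;
+/// assert_eq!(crc, 0xe3069283);
+/// ```
+///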
+/// For older Rust versions, only SSE implementation is available. +#[rustversion::before(1.89)] +#[inline(always)] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + // Only SSE implementation is available for Rust versions before 1.89 + // Runtime feature detection is handled by the wrapper layer + unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } +} + +/// CRC32 iSCSI calculation using the highest available instruction set after Rust 1.89 +/// (post-AVX-512 support) +/// +/// This function is called by the wrapper layer after feature detection has been performed. +/// The wrapper layer ensures that only the appropriate implementation is called based on +/// cached feature detection results, removing runtime checks from the hot path. +#[rustversion::since(1.89)] +#[inline(always)] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + #[cfg(target_arch = "x86_64")] + { + // AVX512 + VPCLMULQDQ + + if is_x86_feature_detected!("avx512vl") && is_x86_feature_detected!("vpclmulqdq") { + unsafe { + return crc32_iscsi_avx512_vpclmulqdq_v3x2(crc, data.as_ptr(), data.len()); + } + } + + // AVX512 + if is_x86_feature_detected!("avx512vl") { + unsafe { + return crc32_iscsi_avx512_v4s3x3(crc, data.as_ptr(), data.len()); + } + } + } + + // Fallback to SSE implementation + unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } +} + +#[rustversion::since(1.89)] +#[cfg(target_arch = "x86_64")] +#[inline] +#[target_feature(enable = "avx512vl,vpclmulqdq")] +unsafe fn clmul_lo_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { + _mm512_clmulepi64_epi128(a, b, 0) +} + +#[rustversion::since(1.89)] +#[cfg(target_arch = "x86_64")] +#[inline] +#[target_feature(enable = "avx512vl,vpclmulqdq")] +unsafe fn clmul_hi_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { + _mm512_clmulepi64_epi128(a, b, 17) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_lo_sse(a: __m128i, b: __m128i) -> __m128i { + _mm_clmulepi64_si128(a, b, 0) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_hi_sse(a: __m128i, b: __m128i) -> __m128i { + _mm_clmulepi64_si128(a, b, 17) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_scalar_sse(a: u32, b: u32) -> __m128i { + _mm_clmulepi64_si128(_mm_cvtsi32_si128(a as i32), _mm_cvtsi32_si128(b as i32), 0) +} + +// x^n mod P, in log(n) time +#[target_feature(enable = "sse4.2,pclmulqdq")] +unsafe fn xnmodp_iscsi_sse(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) - 16; + } + stack = !stack; + acc = 0x80000000u32 >> (n & 31); + n >>= 5; + + while n > 0 { + // Use hardware CRC32C instruction + acc = _mm_crc32_u32(acc, 0); + n -= 1; + } + + while { + low = (stack & 1) as u32; + stack >>= 1; + stack != 0 + } { + let x = _mm_cvtsi32_si128(acc as i32); + let clmul_result = _mm_clmulepi64_si128(x, x, 0); + let y = mm_extract_epi64(clmul_result, 0); + acc = mm_crc32_u64(0, y << low); + } + acc +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn crc_shift_iscsi_sse(crc: u32, nbytes: usize) -> __m128i { + clmul_scalar_sse(crc, xnmodp_iscsi_sse((nbytes * 8 - 33) as u64)) +} + +#[inline] +#[target_feature(enable = "sse4.1")] +unsafe fn mm_extract_epi64(val: __m128i, idx: i32) -> u64 { + #[cfg(target_arch = "x86_64")] + { + if idx == 0 { + _mm_cvtsi128_si64(val) as u64 + } else { + _mm_cvtsi128_si64(_mm_srli_si128(val, 8)) as u64 + } + } + #[cfg(target_arch = "x86")] + { + // On 32-bit 
+#[inline]
+#[target_feature(enable = "sse4.1")]
+unsafe fn mm_extract_epi64(val: __m128i, idx: i32) -> u64 {
+    #[cfg(target_arch = "x86_64")]
+    {
+        if idx == 0 {
+            _mm_cvtsi128_si64(val) as u64
+        } else {
+            _mm_cvtsi128_si64(_mm_srli_si128(val, 8)) as u64
+        }
+    }
+    #[cfg(target_arch = "x86")]
+    {
+        // On 32-bit x86, extract two 32-bit values and combine them
+        let shifted = if idx == 0 {
+            val
+        } else {
+            _mm_srli_si128(val, 8)
+        };
+        let low = _mm_cvtsi128_si32(shifted) as u32;
+        let high = _mm_cvtsi128_si32(_mm_srli_si128(shifted, 4)) as u32;
+        (low as u64) | ((high as u64) << 32)
+    }
+}
+
+#[inline]
+#[target_feature(enable = "sse4.2")]
+unsafe fn mm_crc32_u64(crc: u32, val: u64) -> u32 {
+    #[cfg(target_arch = "x86_64")]
+    {
+        _mm_crc32_u64(crc.into(), val) as u32
+    }
+    #[cfg(target_arch = "x86")]
+    {
+        // On 32-bit x86, process the 64-bit value as two 32-bit CRC operations
+        let low = val as u32;
+        let high = (val >> 32) as u32;
+        let crc = _mm_crc32_u32(crc, low);
+        _mm_crc32_u32(crc, high)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::test::consts::TEST_CHECK_STRING;
+    use crc::{Crc, Table};
+    use rand::{rng, Rng};
+
+    const RUST_CRC32_ISCSI: Crc<u32, Table<16>> =
+        Crc::<u32, Table<16>>::new(&crc::CRC_32_ISCSI);
+
+    #[test]
+    fn test_crc32_iscsi_check() {
+        assert_eq!(
+            crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff,
+            0xe3069283
+        );
+    }
+
+    #[test]
+    fn test_crc32_iscsi_small_all_lengths() {
+        for len in 1..=255 {
+            test_crc32_iscsi_random(len);
+        }
+    }
+
+    #[test]
+    fn test_crc32_iscsi_medium_lengths() {
+        // Test each length from 256 to 1024, which exercises folding plus remainder handling
+        for len in 256..=1024 {
+            test_crc32_iscsi_random(len);
+        }
+    }
+
+    #[test]
+    fn test_crc32_iscsi_large_lengths() {
+        // Test 1 MiB just before, at, and just after the folding boundaries
+        for len in 1048575..=1048577 {
+            test_crc32_iscsi_random(len);
+        }
+    }
+
+    #[rustversion::since(1.89)]
+    fn test_crc32_iscsi_random(len: usize) {
+        let mut data = vec![0u8; len];
+        rng().fill(&mut data[..]);
+
+        let checksum = RUST_CRC32_ISCSI.checksum(&data);
+
+        assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum);
+
+        unsafe {
+            #[cfg(target_arch = "x86_64")]
+            {
+                if is_x86_feature_detected!("vpclmulqdq")
+                    && is_x86_feature_detected!("avx512vl")
+                    && is_x86_feature_detected!("avx512f")
+                {
+                    assert_eq!(
+                        crc32_iscsi_avx512_vpclmulqdq_v3x2(0xffffffff, data.as_ptr(), data.len())
+                            ^ 0xffffffff,
+                        checksum
+                    );
+                }
+
+                if is_x86_feature_detected!("avx512vl")
+                    && is_x86_feature_detected!("avx512f")
+                    && is_x86_feature_detected!("pclmulqdq")
+                {
+                    assert_eq!(
+                        crc32_iscsi_avx512_v4s3x3(0xffffffff, data.as_ptr(), data.len())
+                            ^ 0xffffffff,
+                        checksum
+                    );
+                }
+            }
+
+            assert_eq!(
+                crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff,
+                checksum
+            );
+        }
+    }
+
+    #[rustversion::before(1.89)]
+    fn test_crc32_iscsi_random(len: usize) {
+        let mut data = vec![0u8; len];
+        rng().fill(&mut data[..]);
+
+        let checksum = RUST_CRC32_ISCSI.checksum(&data);
+
+        assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum);
+
+        unsafe {
+            assert_eq!(
+                crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff,
+                checksum
+            );
+        }
+    }
+}
diff --git a/src/crc32/mod.rs b/src/crc32/mod.rs
index 518f5f2..78b2c45 100644
--- a/src/crc32/mod.rs
+++ b/src/crc32/mod.rs
@@ -5,5 +5,5 @@
 pub mod algorithm;
 pub mod consts;
 
-#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
 pub(crate) mod fusion;
diff --git a/src/feature_detection.rs b/src/feature_detection.rs
new file mode 100644
index 0000000..b170a74
--- /dev/null
+++ b/src/feature_detection.rs
@@ -0,0 +1,1238 @@
+// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0.
+
+//! Feature detection system for safe and efficient hardware acceleration across different
+//! platforms.
+
+use std::sync::OnceLock;
+
+/// Global ArchOps instance cache - initialized once based on feature detection results
+static ARCH_OPS_INSTANCE: OnceLock<ArchOpsInstance> = OnceLock::new();
+
+/// Performance tiers representing different hardware capability levels
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[allow(dead_code)] // Some variants may not be constructed on all target architectures
+pub enum PerformanceTier {
+    // AArch64 tiers
+    AArch64AesSha3,
+    AArch64Aes,
+
+    // x86_64 tiers
+    X86_64Avx512Vpclmulqdq,
+    X86_64Avx512Pclmulqdq,
+    X86_64SsePclmulqdq,
+
+    // x86 tiers
+    X86SsePclmulqdq,
+
+    // Fallback
+    SoftwareTable,
+}
+
+/// Architecture-specific capabilities
+#[derive(Debug, Clone, Copy)]
+#[allow(dead_code)] // Some fields may not be read on all target architectures
+pub struct ArchCapabilities {
+    // AArch64 features
+    pub has_aes: bool,  // provides PMULL support for CRC calculations (NEON is implicit)
+    pub has_sha3: bool, // requires 'aes', provides EOR3 for XOR3 operations
+
+    // x86/x86_64 features
+    pub has_sse41: bool,
+    pub has_pclmulqdq: bool,
+    pub has_avx512vl: bool, // implicitly enables avx512f, has XOR3 operations
+    pub has_vpclmulqdq: bool,
+
+    // Rust version gates
+    pub rust_version_supports_avx512: bool,
+}
+
+/// Helper function to convert a performance tier to a human-readable target string
+/// Format: {architecture}-{intrinsics-family}-{intrinsics-features}
+#[inline(always)]
+fn tier_to_target_string(tier: PerformanceTier) -> String {
+    match tier {
+        PerformanceTier::AArch64AesSha3 => "aarch64-neon-pmull-sha3".to_string(),
+        PerformanceTier::AArch64Aes => "aarch64-neon-pmull".to_string(),
+        PerformanceTier::X86_64Avx512Vpclmulqdq => "x86_64-avx512-vpclmulqdq".to_string(),
+        PerformanceTier::X86_64Avx512Pclmulqdq => "x86_64-avx512-pclmulqdq".to_string(),
+        PerformanceTier::X86_64SsePclmulqdq => "x86_64-sse-pclmulqdq".to_string(),
+        PerformanceTier::X86SsePclmulqdq => "x86-sse-pclmulqdq".to_string(),
+        PerformanceTier::SoftwareTable => "software-fallback-tables".to_string(),
+    }
+}
+
+/// Detect architecture-specific capabilities combining compile-time and runtime checks
+///
+/// # Safety
+/// Uses runtime feature detection which may access CPU-specific registers
+unsafe fn detect_arch_capabilities() -> ArchCapabilities {
+    #[cfg(target_arch = "aarch64")]
+    {
+        detect_aarch64_features()
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    {
+        detect_x86_features()
+    }
+
+    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))]
+    {
+        // Other architectures use software fallback
+        ArchCapabilities {
+            has_aes: false,
+            has_sha3: false,
+            has_sse41: false,
+            has_pclmulqdq: false,
+            has_avx512vl: false,
+            has_vpclmulqdq: false,
+            rust_version_supports_avx512: false,
+        }
+    }
+}
+
+/// AArch64-specific feature detection
+///
+/// Note: NEON is always available on AArch64 and is implicitly enabled by AES support.
+/// AES support provides the PMULL instructions needed for CRC calculations.
+#[inline(always)]
+#[cfg(target_arch = "aarch64")]
+unsafe fn detect_aarch64_features() -> ArchCapabilities {
+    use std::arch::is_aarch64_feature_detected;
+
+    // AES is available on essentially all AArch64 CPUs and provides the PMULL instructions
+    let has_aes = is_aarch64_feature_detected!("aes");
+
+    // SHA3 is available on modern AArch64 CPUs, and provides the EOR3 instruction for efficient
+    // XOR3 operations.
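+    // (EOR3 computes a ^ b ^ c in a single instruction, which is what backs the
+    // "XOR3 operations" referenced above.)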
+ let has_sha3 = is_aarch64_feature_detected!("sha3"); + + ArchCapabilities { + has_aes, + has_sha3, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + } +} + +/// x86/x86_64-specific feature detection +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +unsafe fn detect_x86_features() -> ArchCapabilities { + use std::arch::is_x86_feature_detected; + + // Check Rust version support for VPCLMULQDQ (requires 1.89+) + let rust_version_supports_avx512 = check_rust_version_supports_avx512(); + + // SSE 4.1 and PCLMULQDQ support are the baseline for hardware acceleration + let has_sse41 = is_x86_feature_detected!("sse4.1"); + let has_pclmulqdq = has_sse41 && is_x86_feature_detected!("pclmulqdq"); + + // After Rust 1.89, AVX-512VL and VPCLMULQDQ can be used if available + let has_avx512vl = + has_pclmulqdq && rust_version_supports_avx512 && is_x86_feature_detected!("avx512vl"); + let has_vpclmulqdq = + has_avx512vl && rust_version_supports_avx512 && is_x86_feature_detected!("vpclmulqdq"); + + ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41, + has_pclmulqdq, + has_avx512vl, + has_vpclmulqdq, + rust_version_supports_avx512, + } +} + +/// Check if the current Rust version supports VPCLMULQDQ intrinsics +/// VPCLMULQDQ intrinsics were stabilized in Rust 1.89 +#[rustversion::since(1.89)] +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(crate) fn check_rust_version_supports_avx512() -> bool { + true +} + +/// Check if the current Rust version supports VPCLMULQDQ intrinsics +/// VPCLMULQDQ intrinsics were stabilized in Rust 1.89 +#[rustversion::before(1.89)] +#[inline(always)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(crate) fn check_rust_version_supports_avx512() -> bool { + false +} + +/// Select the appropriate performance tier based on detected capabilities +#[inline(always)] +#[allow(unused)] +pub(crate) fn select_performance_tier(capabilities: &ArchCapabilities) -> PerformanceTier { + #[cfg(target_arch = "aarch64")] + { + if capabilities.has_sha3 && capabilities.has_aes { + return PerformanceTier::AArch64AesSha3; + } + + if capabilities.has_aes { + return PerformanceTier::AArch64Aes; + } + } + + #[cfg(target_arch = "x86_64")] + { + if capabilities.has_vpclmulqdq { + return PerformanceTier::X86_64Avx512Vpclmulqdq; + } + if capabilities.has_avx512vl { + return PerformanceTier::X86_64Avx512Pclmulqdq; + } + if capabilities.has_pclmulqdq { + return PerformanceTier::X86_64SsePclmulqdq; + } + } + + #[cfg(target_arch = "x86")] + { + if capabilities.has_pclmulqdq { + return PerformanceTier::X86SsePclmulqdq; + } + } + + // Fallback to software implementation + PerformanceTier::SoftwareTable +} + +/// Enum that holds the different ArchOps implementations for compile-time dispatch +/// This avoids the need for trait objects while still providing factory-based selection +#[rustversion::since(1.89)] +#[derive(Debug, Clone, Copy)] +pub enum ArchOpsInstance { + #[cfg(target_arch = "aarch64")] + Aarch64Aes(crate::arch::aarch64::aes::Aarch64AesOps), + #[cfg(target_arch = "aarch64")] + Aarch64AesSha3(crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops), + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + X86SsePclmulqdq(crate::arch::x86::sse::X86SsePclmulqdqOps), + #[cfg(target_arch = "x86_64")] + X86_64Avx512Pclmulqdq(crate::arch::x86_64::avx512::X86_64Avx512PclmulqdqOps), + #[cfg(target_arch = "x86_64")] + 
X86_64Avx512Vpclmulqdq(crate::arch::x86_64::avx512_vpclmulqdq::X86_64Avx512VpclmulqdqOps), + /// Software fallback - no ArchOps struct needed + SoftwareFallback, +} + +#[rustversion::before(1.89)] +#[derive(Debug, Clone, Copy)] +pub enum ArchOpsInstance { + #[cfg(target_arch = "aarch64")] + Aarch64Aes(crate::arch::aarch64::aes::Aarch64AesOps), + #[cfg(target_arch = "aarch64")] + Aarch64AesSha3(crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops), + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + X86SsePclmulqdq(crate::arch::x86::sse::X86SsePclmulqdqOps), + /// Software fallback - no ArchOps struct needed + SoftwareFallback, +} + +impl ArchOpsInstance { + #[inline(always)] + #[rustversion::since(1.89)] + pub fn get_tier(&self) -> PerformanceTier { + match self { + #[cfg(target_arch = "aarch64")] + ArchOpsInstance::Aarch64Aes(_) => PerformanceTier::AArch64Aes, + #[cfg(target_arch = "aarch64")] + ArchOpsInstance::Aarch64AesSha3(_) => PerformanceTier::AArch64AesSha3, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + ArchOpsInstance::X86SsePclmulqdq(_) => PerformanceTier::X86SsePclmulqdq, + #[cfg(target_arch = "x86_64")] + ArchOpsInstance::X86_64Avx512Pclmulqdq(_) => PerformanceTier::X86_64Avx512Pclmulqdq, + #[cfg(target_arch = "x86_64")] + ArchOpsInstance::X86_64Avx512Vpclmulqdq(_) => PerformanceTier::X86_64Avx512Vpclmulqdq, + ArchOpsInstance::SoftwareFallback => PerformanceTier::SoftwareTable, + } + } + + #[inline(always)] + #[rustversion::before(1.89)] + pub fn get_tier(&self) -> PerformanceTier { + match self { + #[cfg(target_arch = "aarch64")] + ArchOpsInstance::Aarch64Aes(_) => PerformanceTier::AArch64Aes, + #[cfg(target_arch = "aarch64")] + ArchOpsInstance::Aarch64AesSha3(_) => PerformanceTier::AArch64AesSha3, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + ArchOpsInstance::X86SsePclmulqdq(_) => PerformanceTier::X86SsePclmulqdq, + ArchOpsInstance::SoftwareFallback => PerformanceTier::SoftwareTable, + } + } + + /// Get a human-readable target string describing the active configuration + #[inline(always)] + pub fn get_target_string(&self) -> String { + tier_to_target_string(self.get_tier()) + } +} + +/// Get the global ArchOps instance (thread-safe, initialized once based on feature detection) +/// +/// This function provides access to the cached ArchOps instance that was selected based on +/// feature detection results at library initialization time, eliminating runtime feature +/// detection overhead from hot paths. +pub fn get_arch_ops() -> &'static ArchOpsInstance { + ARCH_OPS_INSTANCE.get_or_init(create_arch_ops) +} + +/// Factory function that creates the appropriate ArchOps struct based on cached feature detection +/// +/// This function uses the cached feature detection results to select the optimal +/// architecture-specific implementation at library initialization time, eliminating +/// runtime feature detection overhead from hot paths. 
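+// Call-site sketch (illustrative only, not new API): hot paths fetch the cached
+// instance once and dispatch on the enum, e.g.
+//
+//     match crate::feature_detection::get_arch_ops() {
+//         ArchOpsInstance::SoftwareFallback => { /* table-based fallback */ }
+//         _ => { /* SIMD path via the wrapped ops struct */ }
+//     }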
+fn create_arch_ops() -> ArchOpsInstance { + let capabilities = unsafe { detect_arch_capabilities() }; + let tier = select_performance_tier(&capabilities); + + create_arch_ops_from_tier(tier) +} + +/// Helper function to create ArchOpsInstance from a performance tier for Rust 1.89+ (when AVX512 +/// stabilized) +#[rustversion::since(1.89)] +fn create_arch_ops_from_tier(tier: PerformanceTier) -> ArchOpsInstance { + match tier { + #[cfg(target_arch = "aarch64")] + PerformanceTier::AArch64AesSha3 => { + use crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops; + ArchOpsInstance::Aarch64AesSha3(Aarch64AesSha3Ops::new()) + } + #[cfg(target_arch = "aarch64")] + PerformanceTier::AArch64Aes => { + use crate::arch::aarch64::aes::Aarch64AesOps; + ArchOpsInstance::Aarch64Aes(Aarch64AesOps) + } + #[cfg(target_arch = "x86_64")] + PerformanceTier::X86_64Avx512Vpclmulqdq => { + use crate::arch::x86_64::avx512_vpclmulqdq::X86_64Avx512VpclmulqdqOps; + ArchOpsInstance::X86_64Avx512Vpclmulqdq(X86_64Avx512VpclmulqdqOps::new()) + } + #[cfg(target_arch = "x86_64")] + PerformanceTier::X86_64Avx512Pclmulqdq => { + use crate::arch::x86_64::avx512::X86_64Avx512PclmulqdqOps; + ArchOpsInstance::X86_64Avx512Pclmulqdq(X86_64Avx512PclmulqdqOps::new()) + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + PerformanceTier::X86_64SsePclmulqdq | PerformanceTier::X86SsePclmulqdq => { + create_x86_sse_pclmulqdq_ops() + } + PerformanceTier::SoftwareTable => { + // Use software fallback + ArchOpsInstance::SoftwareFallback + } + // Handle cases where the performance tier doesn't match the current architecture + _ => { + // This can happen when a tier is selected for a different architecture + // Fall back to software implementation + ArchOpsInstance::SoftwareFallback + } + } +} + +/// Helper function to create ArchOpsInstance from a performance tier for Rust <1.89 (before AVX512 +/// stabilized) +#[rustversion::before(1.89)] +fn create_arch_ops_from_tier(tier: PerformanceTier) -> ArchOpsInstance { + match tier { + #[cfg(target_arch = "aarch64")] + PerformanceTier::AArch64AesSha3 => { + use crate::arch::aarch64::aes_sha3::Aarch64AesSha3Ops; + ArchOpsInstance::Aarch64AesSha3(Aarch64AesSha3Ops::new()) + } + #[cfg(target_arch = "aarch64")] + PerformanceTier::AArch64Aes => { + use crate::arch::aarch64::aes::Aarch64AesOps; + ArchOpsInstance::Aarch64Aes(Aarch64AesOps) + } + #[cfg(target_arch = "x86_64")] + PerformanceTier::X86_64Avx512Vpclmulqdq => { + // VPCLMULQDQ and AVX512 not available in older Rust versions, fall back to SSE + create_x86_sse_pclmulqdq_ops() + } + #[cfg(target_arch = "x86_64")] + PerformanceTier::X86_64Avx512Pclmulqdq => { + // AVX512 not available in older Rust versions, fall back to SSE + create_x86_sse_pclmulqdq_ops() + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + PerformanceTier::X86_64SsePclmulqdq | PerformanceTier::X86SsePclmulqdq => { + create_x86_sse_pclmulqdq_ops() + } + PerformanceTier::SoftwareTable => { + // Use software fallback + ArchOpsInstance::SoftwareFallback + } + // Handle cases where the performance tier doesn't match the current architecture + _ => { + // This can happen when a tier is selected for a different architecture + // Fall back to software implementation + ArchOpsInstance::SoftwareFallback + } + } +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn create_x86_sse_pclmulqdq_ops() -> ArchOpsInstance { + use crate::arch::x86::sse::X86SsePclmulqdqOps; + ArchOpsInstance::X86SsePclmulqdq(X86SsePclmulqdqOps) +} +/// Test-specific tier selection that 
works across all architectures for comprehensive testing +#[cfg(test)] +pub fn select_performance_tier_for_test(capabilities: &ArchCapabilities) -> PerformanceTier { + // AArch64 tier selection - SHA3 requires AES to be available + if capabilities.has_sha3 && capabilities.has_aes { + return PerformanceTier::AArch64AesSha3; + } + + if capabilities.has_aes { + return PerformanceTier::AArch64Aes; + } + + // x86_64 tier selection - VPCLMULQDQ requires AVX512VL + if capabilities.has_vpclmulqdq + && capabilities.has_avx512vl + && capabilities.rust_version_supports_avx512 + { + return PerformanceTier::X86_64Avx512Vpclmulqdq; + } + + // AVX512VL requires PCLMULQDQ and SSE4.1 + if capabilities.has_avx512vl + && capabilities.has_pclmulqdq + && capabilities.rust_version_supports_avx512 + { + return PerformanceTier::X86_64Avx512Pclmulqdq; + } + + // PCLMULQDQ requires SSE4.1 + if capabilities.has_pclmulqdq && capabilities.has_sse41 { + return PerformanceTier::X86_64SsePclmulqdq; + } + + // Fallback to software implementation + PerformanceTier::SoftwareTable +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn test_rust_version_check() { + let supports_vpclmulqdq = check_rust_version_supports_avx512(); + + // Should return a boolean without panicking + let _ = supports_vpclmulqdq; + } + + #[test] + fn test_aarch64_tier_selection() { + // Test that aarch64 tier selection follows the expected hierarchy + + // Test SHA3 + AES (highest tier) - NEON is implicit with AES + let capabilities_sha3 = ArchCapabilities { + has_aes: true, + has_sha3: true, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_sha3), + PerformanceTier::AArch64AesSha3 + ); + + // Test AES only (baseline tier) - NEON is implicit with AES + let capabilities_aes = ArchCapabilities { + has_aes: true, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_aes), + PerformanceTier::AArch64Aes + ); + + // Test missing AES (should fall back to software) + let capabilities_no_aes = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_no_aes), + PerformanceTier::SoftwareTable + ); + } + + #[test] + fn test_aarch64_feature_hierarchy() { + // Test that AArch64 feature hierarchy is properly maintained + // NEON is always available on AArch64 and implicit with AES + // AES provides PMULL instructions, SHA3 provides EOR3 + + // Create test capabilities with AES support (NEON is implicit) + let capabilities_with_aes = ArchCapabilities { + has_aes: true, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + + // AES support means we have PMULL instructions available for CRC calculations + assert!(capabilities_with_aes.has_aes); + + // SHA3 requires AES to be available first + let capabilities_with_sha3 = ArchCapabilities { + has_aes: true, + has_sha3: true, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: 
false, + }; + + assert!(capabilities_with_sha3.has_aes); + assert!(capabilities_with_sha3.has_sha3); + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_x86_64_tier_selection() { + // Test that x86_64 tier selection follows the expected hierarchy + + // Test VPCLMULQDQ + AVX512 (highest tier) on Rust 1.89+ + let capabilities_vpclmulqdq = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: true, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_vpclmulqdq), + PerformanceTier::X86_64Avx512Vpclmulqdq + ); + + // Test AVX512 + PCLMULQDQ (mid-tier) on Rust 1.89+ + let capabilities_avx512 = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: false, + rust_version_supports_avx512: true, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_avx512), + PerformanceTier::X86_64Avx512Pclmulqdq + ); + + // Test VPCLMULQDQ + AVX512 (highest tier) on Rust < 1.89 + let capabilities_vpclmulqdq = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_vpclmulqdq), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Test AVX512 + PCLMULQDQ (mid-tier) on Rust < 1.89 + let capabilities_avx512 = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_avx512), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Test SSE + PCLMULQDQ (baseline tier) + let capabilities_sse = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_sse), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Test missing PCLMULQDQ (should fall back to software) + let capabilities_no_pclmul = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_no_pclmul), + PerformanceTier::SoftwareTable + ); + } + + #[test] + #[cfg(target_arch = "x86")] + fn test_x86_tier_selection() { + // Test that x86 (32-bit) tier selection works correctly + + // Test SSE + PCLMULQDQ (only available tier for x86) + let capabilities_sse = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_sse), + PerformanceTier::X86_64SsePclmulqdq + ); + + // For x86 32-bit testing, we need a special case since AVX512VL indicates x86_64 + // Create capabilities without AVX512VL to simulate x86 32-bit + let capabilities_x86_sse = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, // No AVX512 on 32-bit x86 + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + // This should select x86_64 
tier since we're testing the general case + assert_eq!( + select_performance_tier_for_test(&capabilities_x86_sse), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Test missing PCLMULQDQ (should fall back to software) + let capabilities_no_pclmul = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&capabilities_no_pclmul), + PerformanceTier::SoftwareTable + ); + } + + #[test] + fn test_x86_feature_hierarchy() { + // Test that x86 feature hierarchy is properly maintained + // SSE4.1 is required for PCLMULQDQ + // AVX512VL requires PCLMULQDQ + // VPCLMULQDQ requires AVX512VL and Rust 1.89+ + + // Test feature dependencies are enforced + let capabilities_full = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: true, + }; + + // All x86 features should be available when hierarchy is satisfied + assert!(capabilities_full.has_sse41); + assert!(capabilities_full.has_pclmulqdq); + assert!(capabilities_full.has_avx512vl); + assert!(capabilities_full.has_vpclmulqdq); + assert!(capabilities_full.rust_version_supports_avx512); + } + + #[test] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn test_rust_version_gating() { + // Test that VPCLMULQDQ is properly gated by Rust version + let rust_support = check_rust_version_supports_avx512(); + + // Should return a boolean based on Rust version + // This will be true for Rust 1.89+ and false for earlier versions + assert!(rust_support == true || rust_support == false); + } + + // Mock tests for compile-time and runtime feature agreement scenarios + mod mock_feature_agreement_tests { + use super::*; + + #[test] + fn test_rust_version_gating_scenarios() { + // Test VPCLMULQDQ with different Rust version scenarios + + // All features available but Rust version too old + let capabilities_old_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, // Hardware supports it + rust_version_supports_avx512: false, // But Rust version is too old + }; + + // Should not select VPCLMULQDQ or AVX512 tiers due to Rust version constraint + let tier = select_performance_tier_for_test(&capabilities_old_rust); + assert_ne!(tier, PerformanceTier::X86_64Avx512Vpclmulqdq); + assert_ne!(tier, PerformanceTier::X86_64Avx512Pclmulqdq); + assert_eq!(tier, PerformanceTier::X86_64SsePclmulqdq); + } + + #[test] + fn test_feature_dependency_validation() { + // Test that feature dependencies are properly validated + + // SHA3 without AES should not be possible + let invalid_sha3_caps = ArchCapabilities { + has_aes: false, // Missing required dependency + has_sha3: true, // This should be impossible in real detection + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + + // Should fall back to software since AES is required for SHA3 + assert_eq!( + select_performance_tier_for_test(&invalid_sha3_caps), + PerformanceTier::SoftwareTable + ); + + // VPCLMULQDQ without AVX512VL should not be possible + let invalid_vpclmul_caps = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, // Missing required dependency + 
has_vpclmulqdq: true, // This should be impossible in real detection + rust_version_supports_avx512: true, + }; + + // Should fall back to SSE tier since AVX512VL is required for VPCLMULQDQ + assert_eq!( + select_performance_tier_for_test(&invalid_vpclmul_caps), + PerformanceTier::X86_64SsePclmulqdq + ); + } + } + + // Comprehensive tier selection tests across different hardware configurations + mod tier_selection_comprehensive_tests { + use super::*; + + #[test] + fn test_all_aarch64_tier_combinations() { + // Test all possible AArch64 capability combinations + + // No features - software fallback + let no_features = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&no_features), + PerformanceTier::SoftwareTable + ); + + // AES only - baseline AArch64 tier + let aes_only = ArchCapabilities { + has_aes: true, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&aes_only), + PerformanceTier::AArch64Aes + ); + + // AES + SHA3 - highest AArch64 tier + let aes_sha3 = ArchCapabilities { + has_aes: true, + has_sha3: true, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&aes_sha3), + PerformanceTier::AArch64AesSha3 + ); + } + + #[test] + fn test_all_x86_64_tier_combinations() { + // Test all possible x86_64 capability combinations + + // No features - software fallback + let no_features = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&no_features), + PerformanceTier::SoftwareTable + ); + + // SSE4.1 only - software fallback (PCLMULQDQ required) + let sse_only = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&sse_only), + PerformanceTier::SoftwareTable + ); + + // SSE4.1 + PCLMULQDQ - baseline x86_64 tier + let sse_pclmul = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&sse_pclmul), + PerformanceTier::X86_64SsePclmulqdq + ); + + // SSE4.1 + PCLMULQDQ + AVX512VL but old Rust - should fall back to SSE tier + let avx512_pclmul_old_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, // Old Rust version + }; + assert_eq!( + select_performance_tier_for_test(&avx512_pclmul_old_rust), + PerformanceTier::X86_64SsePclmulqdq + ); + + // SSE4.1 + PCLMULQDQ + AVX512VL with new Rust - mid-tier + let avx512_pclmul_new_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: false, + rust_version_supports_avx512: true, // New Rust version + }; + 
assert_eq!( + select_performance_tier_for_test(&avx512_pclmul_new_rust), + PerformanceTier::X86_64Avx512Pclmulqdq + ); + + // All features + old Rust - should fall back to SSE tier + let all_features_old_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: false, // Old Rust version + }; + assert_eq!( + select_performance_tier_for_test(&all_features_old_rust), + PerformanceTier::X86_64SsePclmulqdq + ); + + // All features + new Rust - highest tier + let all_features_new_rust = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: true, // New Rust version + }; + assert_eq!( + select_performance_tier_for_test(&all_features_new_rust), + PerformanceTier::X86_64Avx512Vpclmulqdq + ); + } + + #[test] + fn test_x86_32bit_tier_combinations() { + // Test x86 (32-bit) capability combinations + + // No features - software fallback + let no_features = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + assert_eq!( + select_performance_tier_for_test(&no_features), + PerformanceTier::SoftwareTable + ); + + // SSE4.1 + PCLMULQDQ - for x86 32-bit testing, we expect x86_64 tier in our test function + // since the test function doesn't distinguish between x86 and x86_64 architectures + let sse_pclmul = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, // AVX512 not available on 32-bit x86 + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + // The test function will return x86_64 tier since it doesn't distinguish architectures + assert_eq!( + select_performance_tier_for_test(&sse_pclmul), + PerformanceTier::X86_64SsePclmulqdq + ); + } + + #[test] + fn test_target_string_consistency() { + // Test that target strings are consistent with selected tiers + + let test_cases = [ + (PerformanceTier::AArch64AesSha3, "aarch64-neon-pmull-sha3"), + (PerformanceTier::AArch64Aes, "aarch64-neon-pmull"), + ( + PerformanceTier::X86_64Avx512Vpclmulqdq, + "x86_64-avx512-vpclmulqdq", + ), + ( + PerformanceTier::X86_64Avx512Pclmulqdq, + "x86_64-avx512-pclmulqdq", + ), + (PerformanceTier::X86_64SsePclmulqdq, "x86_64-sse-pclmulqdq"), + (PerformanceTier::X86SsePclmulqdq, "x86-sse-pclmulqdq"), + (PerformanceTier::SoftwareTable, "software-fallback-tables"), + ]; + + for (tier, expected_string) in test_cases { + assert_eq!(tier_to_target_string(tier), expected_string); + } + } + } + + // Tests for graceful degradation between performance tiers + mod graceful_degradation_tests { + use super::*; + + #[test] + fn test_aarch64_degradation_path() { + // Test the degradation path for AArch64: SHA3+AES -> AES -> Software + + // Start with highest tier capabilities + let mut capabilities = ArchCapabilities { + has_aes: true, + has_sha3: true, + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + + // Should select highest tier + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::AArch64AesSha3 + ); + + // Remove SHA3 - should degrade to AES tier + capabilities.has_sha3 = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + 
PerformanceTier::AArch64Aes + ); + + // Remove AES - should degrade to software + capabilities.has_aes = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::SoftwareTable + ); + } + + #[test] + fn test_x86_64_degradation_path() { + // Test the degradation path for x86_64: VPCLMULQDQ -> AVX512 -> SSE -> Software + + // Start with highest tier capabilities + let mut capabilities = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: true, + }; + + // Should select highest tier + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::X86_64Avx512Vpclmulqdq + ); + + // Remove VPCLMULQDQ - should degrade to AVX512 tier + capabilities.has_vpclmulqdq = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::X86_64Avx512Pclmulqdq + ); + + // Remove AVX512VL - should degrade to SSE tier + capabilities.has_avx512vl = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::X86_64SsePclmulqdq + ); + + // Remove PCLMULQDQ - should degrade to software + capabilities.has_pclmulqdq = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::SoftwareTable + ); + + // Remove SSE4.1 - should still be software (already at lowest tier) + capabilities.has_sse41 = false; + assert_eq!( + select_performance_tier_for_test(&capabilities), + PerformanceTier::SoftwareTable + ); + } + + #[test] + fn test_rust_version_degradation() { + // Test degradation when Rust version doesn't support VPCLMULQDQ + + let capabilities_with_vpclmulqdq = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: true, + has_vpclmulqdq: true, + rust_version_supports_avx512: false, // Old Rust version + }; + + // Should degrade from VPCLMULQDQ tier to SSE tier due to Rust version + assert_eq!( + select_performance_tier_for_test(&capabilities_with_vpclmulqdq), + PerformanceTier::X86_64SsePclmulqdq + ); + } + + #[test] + fn test_partial_feature_availability() { + // Test scenarios where only some features in a tier are available + + // AArch64: SHA3 available but AES not (impossible in real hardware, but test safety) + let aarch64_partial = ArchCapabilities { + has_aes: false, + has_sha3: true, // This would be impossible in real detection + has_sse41: false, + has_pclmulqdq: false, + has_avx512vl: false, + has_vpclmulqdq: false, + rust_version_supports_avx512: false, + }; + // Should fall back to software since AES is required for SHA3 + assert_eq!( + select_performance_tier_for_test(&aarch64_partial), + PerformanceTier::SoftwareTable + ); + + // x86_64: VPCLMULQDQ available but AVX512VL not (impossible in real hardware) + let x86_64_partial = ArchCapabilities { + has_aes: false, + has_sha3: false, + has_sse41: true, + has_pclmulqdq: true, + has_avx512vl: false, + has_vpclmulqdq: true, // This would be impossible in real detection + rust_version_supports_avx512: true, + }; + // Should fall back to SSE tier since AVX512VL is required for VPCLMULQDQ + assert_eq!( + select_performance_tier_for_test(&x86_64_partial), + PerformanceTier::X86_64SsePclmulqdq + ); + } + } +} + +#[cfg(test)] +mod software_fallback_tests { + use super::*; + #[test] + fn test_aarch64_without_aes_falls_back_to_software() { + // Test that AArch64 without AES support falls back to software implementation + let capabilities_no_aes 
= ArchCapabilities {
+            has_aes: false,  // No AES support
+            has_sha3: false, // SHA3 requires AES, so also false
+            has_sse41: false,
+            has_pclmulqdq: false,
+            has_avx512vl: false,
+            has_vpclmulqdq: false,
+            rust_version_supports_avx512: false,
+        };
+
+        let tier = select_performance_tier_for_test(&capabilities_no_aes);
+        assert_eq!(
+            tier,
+            PerformanceTier::SoftwareTable,
+            "AArch64 without AES should fall back to software implementation"
+        );
+    }
+
+    #[test]
+    fn test_x86_without_pclmulqdq_falls_back_to_software() {
+        // Test that x86 without SSE4.1/PCLMULQDQ falls back to software implementation
+        let capabilities_no_pclmul = ArchCapabilities {
+            has_aes: false,
+            has_sha3: false,
+            has_sse41: true,      // SSE4.1 available
+            has_pclmulqdq: false, // But PCLMULQDQ not available
+            has_avx512vl: false,
+            has_vpclmulqdq: false,
+            rust_version_supports_avx512: false,
+        };
+
+        let tier = select_performance_tier_for_test(&capabilities_no_pclmul);
+        assert_eq!(
+            tier,
+            PerformanceTier::SoftwareTable,
+            "x86 without PCLMULQDQ should fall back to software implementation"
+        );
+
+        // Test x86 without SSE4.1
+        let capabilities_no_sse = ArchCapabilities {
+            has_aes: false,
+            has_sha3: false,
+            has_sse41: false,     // No SSE4.1 support
+            has_pclmulqdq: false, // PCLMULQDQ requires SSE4.1
+            has_avx512vl: false,
+            has_vpclmulqdq: false,
+            rust_version_supports_avx512: false,
+        };
+
+        let tier = select_performance_tier_for_test(&capabilities_no_sse);
+        assert_eq!(
+            tier,
+            PerformanceTier::SoftwareTable,
+            "x86 without SSE4.1 should fall back to software implementation"
+        );
+    }
+
+    #[test]
+    fn test_conditional_compilation_coverage() {
+        // Test that software fallback is properly conditionally compiled
+        // This test ensures the conditional compilation logic is working correctly
+
+        // Software fallback should be available when needed
+        #[cfg(any(
+            not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")),
+            target_arch = "x86",
+            all(target_arch = "aarch64", not(target_feature = "aes"))
+        ))]
+        {
+            // Software fallback should be compiled in these cases
+            use crate::arch::software;
+            let _test_fn = software::update;
+            // This test passes if it compiles successfully
+        }
+
+        // For x86_64, software fallback should rarely be needed since SSE4.1/PCLMULQDQ are
+        // available on essentially all x86_64 CPUs
+        // But it may still be compiled for testing purposes
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 21c4935..393edb0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -135,7 +135,7 @@ use crate::crc32::consts::{
     CRC32_ISCSI, CRC32_ISO_HDLC, CRC32_JAMCRC, CRC32_MEF, CRC32_MPEG_2, CRC32_XFER,
 };
 
-#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
 use crate::crc32::fusion;
 
 use crate::crc64::consts::{
@@ -155,6 +155,7 @@ mod consts;
 mod crc32;
 mod crc64;
 mod enums;
+mod feature_detection;
 mod ffi;
 mod generate;
 mod structs;
@@ -782,6 +783,19 @@ pub fn checksum_combine_with_params(
 /// Returns the target used to calculate the CRC checksum for the specified algorithm.
 ///
+/// This function provides visibility into the active performance tier being used for CRC calculations.
+/// The target string follows the format `{architecture}-{intrinsics-family}-{intrinsics-features}`,
+/// such as `aarch64-neon-pmull-sha3` or `x86_64-avx512-vpclmulqdq`.
+///
+/// The performance tier system provides graceful degradation across different hardware capabilities:
+/// - **AArch64**: `aarch64-neon-pmull-sha3` (highest) → `aarch64-neon-pmull` (baseline)
+/// - **x86_64**: `x86_64-avx512-vpclmulqdq` (highest) → `x86_64-avx512-pclmulqdq` (mid) → `x86_64-sse-pclmulqdq` (baseline)
+/// - **x86**: `x86-sse-pclmulqdq` (baseline) → `software-fallback-tables` (fallback)
+/// - **Other architectures**: `software-fallback-tables`
+///
+/// The tier selection is deterministic and consistent across runs on the same hardware,
+/// combining compile-time and runtime feature detection for safety and optimal performance.
+///
 /// These strings are informational only, not stable, and shouldn't be relied on to match across
 /// versions.
 ///
@@ -790,9 +804,17 @@ pub fn checksum_combine_with_params(
 /// use crc_fast::{get_calculator_target, CrcAlgorithm::Crc32IsoHdlc};
 ///
 /// let target = get_calculator_target(Crc32IsoHdlc);
+/// println!("Using performance tier: {}", target);
+/// // Example outputs:
+/// // "aarch64-neon-pmull-sha3" - AArch64 with SHA3 and AES support
+/// // "x86_64-avx512-vpclmulqdq" - x86_64 with VPCLMULQDQ support
+/// // "x86_64-sse-pclmulqdq" - x86_64 baseline with SSE4.1 and PCLMULQDQ
 /// ```
 pub fn get_calculator_target(_algorithm: CrcAlgorithm) -> String {
-    arch::get_target()
+    use crate::feature_detection::get_arch_ops;
+
+    let arch_ops = get_arch_ops();
+    arch_ops.get_target_string()
 }
 
 /// Returns the calculator function and parameters for the specified CRC algorithm.
@@ -834,11 +856,15 @@ fn get_calculator_params(algorithm: CrcAlgorithm) -> (CalculatorFn, CrcParams) {
 #[inline(always)]
 fn crc32_iscsi_calculator(state: u64, data: &[u8], _params: CrcParams) -> u64 {
     // both aarch64 and x86 have native CRC-32/ISCSI support, so we can use fusion
-    #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))]
+    #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))]
     return fusion::crc32_iscsi(state as u32, data) as u64;
 
-    #[cfg(all(not(target_arch = "aarch64"), not(target_arch = "x86_64")))]
-    // fallback to traditional calculation if not aarch64 or x86_64
+    #[cfg(all(
+        not(target_arch = "aarch64"),
+        not(target_arch = "x86_64"),
+        not(target_arch = "x86")
+    ))]
+    // Fallback to traditional calculation for other architectures
     Calculator::calculate(state, data, _params)
 }
 
@@ -945,6 +971,64 @@ mod lib {
         );
     }
 
+    #[test]
+    fn test_get_calculator_target_format() {
+        let target = get_calculator_target(CrcAlgorithm::Crc32IsoHdlc);
+
+        // Target string should not be empty
+        assert!(!target.is_empty());
+
+        // Should follow the expected format with valid architecture prefixes
+        let valid_prefixes = ["aarch64-", "x86_64-", "x86-", "software-"];
+        assert!(
+            valid_prefixes
+                .iter()
+                .any(|prefix| target.starts_with(prefix)),
+            "Target '{}' should start with a valid architecture prefix",
+            target
+        );
+
+        // Should contain intrinsics family and features information
+        let parts: Vec<&str> = target.split('-').collect();
+        assert!(
+            parts.len() >= 3,
+            "Target '{}' should have at least 3 parts: architecture-family-features",
+            target
+        );
+    }
+
+    #[test]
+    fn test_get_calculator_target_consistency() {
+        // Multiple calls should return the same result (deterministic)
+        let target1 = get_calculator_target(CrcAlgorithm::Crc32IsoHdlc);
+        let target2 = get_calculator_target(CrcAlgorithm::Crc32Iscsi);
+        let target3 = get_calculator_target(CrcAlgorithm::Crc64Nvme);
+
+        assert_eq!(
+            target1, target2,
+            "Target should be consistent across
different CRC-32 algorithms" + ); + assert_eq!( + target1, target3, + "Target should be consistent across CRC-32 and CRC-64 algorithms" + ); + } + + #[test] + fn test_get_calculator_target_uses_cached_detection() { + // This test verifies that the function uses cached feature detection + // by checking that multiple calls are consistent and don't perform + // redundant feature detection + + let target1 = get_calculator_target(CrcAlgorithm::Crc32IsoHdlc); + let target2 = get_calculator_target(CrcAlgorithm::Crc32IsoHdlc); + + assert_eq!( + target1, target2, + "Cached detection should return identical results" + ); + } + #[test] fn test_digest_updates_check() { for config in TEST_ALL_CONFIGS { diff --git a/src/test/future_proof.rs b/src/test/future_proof_tests.rs similarity index 100% rename from src/test/future_proof.rs rename to src/test/future_proof_tests.rs diff --git a/src/test/mod.rs b/src/test/mod.rs index 2b7b216..8b84fcc 100644 --- a/src/test/mod.rs +++ b/src/test/mod.rs @@ -7,7 +7,7 @@ pub(crate) mod consts; pub(crate) mod enums; -mod future_proof; +mod future_proof_tests; mod structs; /// Creates a new aligned data vector from the input slice for testing. diff --git a/tests/checksum_integration_tests.rs b/tests/checksum_integration_tests.rs new file mode 100644 index 0000000..2eab8dd --- /dev/null +++ b/tests/checksum_integration_tests.rs @@ -0,0 +1,237 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +use std::fs; +use std::process::Command; + +#[test] +fn test_benchmark_flag_parsing() { + let output = Command::new("cargo") + .args(&["run", "--bin", "checksum", "--", "-a", "CRC-32/ISCSI", "-b"]) + .output() + .expect("Failed to execute command"); + + assert!( + output.status.success(), + "Command should succeed with -b flag" + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Algorithm: CRC-32/ISCSI")); + assert!(stdout.contains("Throughput:")); + assert!(stdout.contains("GiB/s")); +} + +#[test] +fn test_benchmark_with_size_parameter() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "--size", + "1024", + ]) + .output() + .expect("Failed to execute command"); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Data Size: 1,024 bytes")); +} + +#[test] +fn test_benchmark_with_duration_parameter() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "--duration", + "1.0", + ]) + .output() + .expect("Failed to execute command"); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Duration: 1.")); +} + +#[test] +fn test_benchmark_invalid_size() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "--size", + "0", + ]) + .output() + .expect("Failed to execute command"); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("Size must be greater than 0")); +} + +#[test] +fn test_benchmark_invalid_duration() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "--duration", + "0", + ]) + .output() + .expect("Failed to execute command"); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); 
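+    // (The message below must stay in sync with the validation error the checksum
+    // binary emits for non-positive durations.)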
+ assert!(stderr.contains("Duration must be greater than 0")); +} + +#[test] +fn test_benchmark_with_file_input() { + // Create a temporary test file + let test_file = "test_benchmark_file.txt"; + fs::write(test_file, "Hello, benchmark world!").expect("Failed to create test file"); + + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "-f", + test_file, + "--duration", + "0.5", + ]) + .output() + .expect("Failed to execute command"); + + // Clean up + let _ = fs::remove_file(test_file); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Data Size: 23 bytes")); +} + +#[test] +fn test_benchmark_with_string_input() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "-s", + "test string", + "--duration", + "0.5", + ]) + .output() + .expect("Failed to execute command"); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("Data Size: 11 bytes")); +} + +#[test] +fn test_benchmark_different_algorithms() { + let algorithms = ["CRC-32/ISCSI", "CRC-64/NVME"]; + + for algorithm in &algorithms { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + algorithm, + "-b", + "--duration", + "0.5", + ]) + .output() + .expect("Failed to execute command"); + + assert!( + output.status.success(), + "Algorithm {} should work", + algorithm + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains(&format!("Algorithm: {}", algorithm))); + } +} + +#[test] +fn test_benchmark_size_without_benchmark_flag() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "--size", + "1024", + ]) + .output() + .expect("Failed to execute command"); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("--size and --duration can only be used with -b flag")); +} + +#[test] +fn test_benchmark_nonexistent_file() { + let output = Command::new("cargo") + .args(&[ + "run", + "--bin", + "checksum", + "--", + "-a", + "CRC-32/ISCSI", + "-b", + "-f", + "nonexistent_file.txt", + ]) + .output() + .expect("Failed to execute command"); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("File not found")); +}
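+
+// A possible follow-up (sketch only, not part of this change): the tests above all
+// repeat the same `cargo run --bin checksum` plumbing, so a helper like the
+// hypothetical `run_checksum` below could centralize it.
+#[allow(dead_code)]
+fn run_checksum(extra_args: &[&str]) -> std::process::Output {
+    // Base invocation shared by every integration test in this file
+    let mut args = vec!["run", "--bin", "checksum", "--"];
+    args.extend_from_slice(extra_args);
+    Command::new("cargo")
+        .args(&args)
+        .output()
+        .expect("Failed to execute command")
+}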