Skip to content

Commit

Permalink
Implement a SIMD fast path for CRC checksums
Browse files Browse the repository at this point in the history
Recently I was profiling Cargo's extraction of tarballs and was quite
surprised to learn that 15% of the execution time of tarball extraction
was entirely crc32 checksum calculations in miniz. This was quite a
surprise to me and led me down a long rabbit hole of figuring out how to
speed this up!

It turns out Intel's written a paper, "Fast CRC Computation for Generic
Polynomials Using PCLMULQDQ Instruction", which describes how to
implement a CRC-32 value using hardware instructions. Note that these
are not the hardware CRC instructions, which I think are a different
algorithm.

This commit implements this paper in Rust, looking to a few other
external implementations for guidance as well. Overall the results are
quite promising, and I'm pretty confident in the correctness of this as
well. Current results look like:

* This SIMD implementation runs at about 25GB/s
* The miniz implementation runs at about 450MB/s
* The zlib implementation, on OSX, runs at 25GB/s (seems to implement the
  same algorithm)
* The bundled zlib implementation (and also the one I found on Linux)
  runs at 1.4GB/s

So this should be ~50 times faster for Cargo (which uses miniz), about
20 times faster for anyone using system zlib on Linux or the bundled
zlib, and on par with OSX's zlib performance.
  • Loading branch information
alexcrichton committed Nov 18, 2018
1 parent 37a60a7 commit 9b44592
Show file tree
Hide file tree
Showing 11 changed files with 439 additions and 10 deletions.
9 changes: 9 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@ matrix:

- rust: stable
script: cargo run --manifest-path systest/Cargo.toml
name: "systest"

- rust: nightly
install: rustup target add wasm32-unknown-unknown
script: cargo build --target wasm32-unknown-unknown
name: "wasm"

- rust: stable
env: RUST_BACKEND=1
Expand All @@ -28,6 +30,7 @@ matrix:
- cargo doc --no-deps --all-features
after_success:
- travis-cargo --only nightly doc-upload
name: "docs"

allow_failures:
- env: RUST_BACKEND=1
Expand All @@ -40,10 +43,16 @@ script:
- cargo test --features tokio
- cargo test --features 'tokio zlib'
- cargo test --features zlib --no-default-features
- cargo test --manifest-path flate2-crc/Cargo.toml
- cargo test --release --manifest-path flate2-crc/Cargo.toml
- cargo clean && cargo build
- cargo doc --no-deps
- cargo doc --no-deps --manifest-path=miniz-sys/Cargo.toml

branches:
only:
- master

env:
global:
secure: "PHVT7IaeP5nQQVwGHKwqCYBDp0QyetSlER7se2j2Xgfx+lw3Bu6VWH6VF04B636Gb0tHPN/sUCXSgGRcvDuy6XFOev4LfynoYxNKgHJYg2E34EP2QLwsFfnvE4iujaG3GJk3o935Y7OYGv2OP1HeG4Mv6JhQK0GLnNDBZQ65kWI="
Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ libz-sys = { version = "1.0", optional = true }
tokio-io = { version = "0.1", optional = true }
futures = { version = "0.1", optional = true }
miniz_oxide_c_api = { version = "0.2", optional = true, features = ["no_c_export"]}
flate2-crc = { version = '0.1', path = 'flate2-crc' }

[target.'cfg(all(target_arch = "wasm32", not(target_os = "emscripten")))'.dependencies]
miniz_oxide_c_api = { version = "0.2", features = ["no_c_export"] }
Expand Down
10 changes: 8 additions & 2 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,11 @@ install:
build: false

test_script:
- cargo test --verbose --target %TARGET%
- cargo test --verbose --target %TARGET% --features tokio
- cargo test --target %TARGET%
- cargo test --target %TARGET% --features tokio
- cargo test --target %TARGET% --manifest-path flate2-crc/Cargo.toml
- cargo test --target %TARGET% --manifest-path flate2-crc/Cargo.toml --release

branches:
only:
- master
21 changes: 21 additions & 0 deletions flate2-crc/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[package]
name = "flate2-crc"
version = "0.1.0"
authors = ["Alex Crichton <alex@alexcrichton.com>"]
license = "MIT/Apache-2.0"
repository = "https://github.com/alexcrichton/flate2-rs/tree/flate2-crc"
homepage = "https://github.com/alexcrichton/flate2-rs"
documentation = "https://docs.rs/flate2-crc"
description = """
SIMD acceleration for CRC-32 checksums used in the gzip format
"""

[dependencies]
cfg-if = "0.1.6"

[dev-dependencies]
miniz-sys = { path = '../miniz-sys' }
rand = "0.6"
libz-sys = "1.0"
rayon = "1.0.3"
quickcheck = "0.7"
69 changes: 69 additions & 0 deletions flate2-crc/benches/run.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#![feature(test)]

extern crate flate2_crc;
extern crate rand;
extern crate test;
extern crate miniz_sys;
extern crate libz_sys;

use rand::{thread_rng, RngCore};

/// Benchmark target: CRC of `data` (initial value 0) through
/// `flate2_crc::Hardware`, with miniz's `mz_crc32` supplied as the fallback.
fn flate2_crc(data: &[u8]) -> u32 {
    // Fallback handed to `Hardware::calculate`; forwards to miniz.
    fn miniz_fallback(crc: u32, bytes: &[u8]) -> u32 {
        let (ptr, len) = (bytes.as_ptr(), bytes.len());
        // SAFETY: `ptr` and `len` describe the live `bytes` slice; `mz_crc32`
        // is assumed to only read `len` bytes from `ptr`.
        unsafe { miniz_sys::mz_crc32(crc as u64, ptr, len) as u32 }
    }

    let hardware = flate2_crc::Hardware::detect();
    hardware.calculate(0, data, miniz_fallback)
}

/// Benchmark target: CRC of `data` (initial value 0) computed directly by
/// miniz's `mz_crc32`.
fn miniz(data: &[u8]) -> u32 {
    let (ptr, len) = (data.as_ptr(), data.len());
    // SAFETY: `ptr` and `len` describe the live `data` slice; `mz_crc32` is
    // assumed to only read `len` bytes from `ptr`.
    let crc = unsafe { miniz_sys::mz_crc32(0, ptr, len) };
    crc as u32
}

/// Benchmark target: CRC of `data` (initial value 0) computed by zlib's
/// `crc32` via `libz_sys`.
fn zlib(data: &[u8]) -> u32 {
    let ptr = data.as_ptr();
    // The length is narrowed to `u32` for the C call; the benchmark inputs
    // are at most 1_000_000 bytes, so nothing is lost here.
    let len = data.len() as u32;
    // SAFETY: `ptr` and `len` describe the live `data` slice; `crc32` is
    // assumed to only read `len` bytes from `ptr`.
    let crc = unsafe { libz_sys::crc32(0, ptr, len) };
    crc as u32
}

/// Generates three `#[bench]` functions per checksum function `$f`, feeding it
/// random buffers of 8, 65_000 and 1_000_000 bytes respectively.
///
/// Each entry has the form `func => (small_name, medium_name, large_name),`.
macro_rules! benches {
    // Internal rule: a single benchmark `$name` running `$f` over `$len`
    // random bytes, reporting `$len` as the per-iteration throughput.
    (@bench $f:ident, $name:ident, $len:expr) => {
        #[bench]
        fn $name(b: &mut test::Bencher) {
            let mut buf = vec![0u8; $len];
            thread_rng().fill_bytes(&mut buf);

            b.bytes = $len;
            b.iter(|| $f(&buf));
        }
    };

    // Public rule: expand every entry into its small/medium/large benchmarks.
    ($($f:ident => ($small:ident, $medium:ident, $large:ident),)*) => ($(
        benches!(@bench $f, $small, 8);
        benches!(@bench $f, $medium, 65_000);
        benches!(@bench $f, $large, 1_000_000);
    )*);
}

// Instantiate the 8-byte, 65_000-byte and 1_000_000-byte benchmarks for each
// of the three checksum functions defined above, so their throughput can be
// compared side by side.
benches! {
flate2_crc => (flate2_crc_8, flate2_crc_65000, flate2_crc_1000000),
miniz => (miniz_8, miniz_65000, miniz_1000000),
zlib => (zlib_8, zlib_65000, zlib_1000000),
}
36 changes: 36 additions & 0 deletions flate2-crc/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
use std::env;
use std::process::Command;
use std::str;

/// Build-script entry point.
///
/// Emits `cargo:rustc-cfg=simd` when the compiler's minor version is at least
/// 27; if the version cannot be determined, no cfg is emitted.
fn main() {
    println!("cargo:rerun-if-changed=build.rs");

    // NOTE(review): 27 is presumably the first release with the stable SIMD
    // intrinsics the x86 backend needs -- confirm against that module.
    let simd_capable = match rustc_minor_version() {
        Some(minor) => minor >= 27,
        None => false,
    };

    if simd_capable {
        println!("cargo:rustc-cfg=simd");
    }
}

/// Returns the minor version `N` of the compiler named by the `RUSTC`
/// environment variable, parsed from `rustc --version` output of the form
/// `rustc 1.N.…`.
///
/// Returns `None` when `RUSTC` is unset, the command cannot be run, its
/// output is not UTF-8, the output does not start with `rustc 1.`, or the
/// minor component is not a valid `u32`.
fn rustc_minor_version() -> Option<u32> {
    let rustc = match env::var_os("RUSTC") {
        Some(path) => path,
        None => return None,
    };
    let output = match Command::new(rustc).arg("--version").output() {
        Ok(out) => out,
        Err(_) => return None,
    };
    let version = match str::from_utf8(&output.stdout) {
        Ok(text) => text,
        Err(_) => return None,
    };

    // "rustc 1.30.0 (…)" splits on '.' into "rustc 1", "30", "0 (…)", …
    let mut parts = version.split('.');
    match (parts.next(), parts.next()) {
        (Some("rustc 1"), Some(minor)) => minor.parse().ok(),
        _ => None,
    }
}

103 changes: 103 additions & 0 deletions flate2-crc/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// Note that this isn't really intended to be a user-facing crate; the
// user-facing API is `flate2::Crc`.

#[macro_use]
extern crate cfg_if;

#[cfg(test)]
#[macro_use]
extern crate quickcheck;

// Pick the backend module and expose it to the rest of the crate as `imp`.
// The `x86` module is used only when the build script enabled `simd` AND the
// target is x86 or x86_64; every other configuration gets `other`.
cfg_if! {
    if #[cfg(all(simd, any(target_arch = "x86", target_arch = "x86_64")))] {
        mod x86;
        use self::x86 as imp;
    } else {
        mod other;
        use self::other as imp;
    }
}

/// Records whether the selected backend reported that its accelerated CRC
/// routine can be used (the wrapped `bool` is the result of `imp::detect()`).
#[derive(Debug)]
pub struct Hardware(bool);

impl Hardware {
    /// Queries the backend once and remembers the answer.
    #[inline]
    pub fn detect() -> Hardware {
        let accelerated = imp::detect();
        Hardware(accelerated)
    }

    /// Computes a checksum over `data`, starting from the running value `crc`.
    ///
    /// When detection succeeded, the work is handed to the backend's
    /// `calculate`, which also receives `fallback` (presumably for input it
    /// does not process itself -- confirm against the backend). Otherwise
    /// `fallback(crc, data)` is returned directly.
    #[inline]
    pub fn calculate(
        &self,
        crc: u32,
        data: &[u8],
        fallback: fn(u32, &[u8]) -> u32,
    ) -> u32 {
        let Hardware(accelerated) = *self;
        if !accelerated {
            return fallback(crc, data);
        }

        // SAFETY: this point is only reached when the flag is set, and within
        // this file the flag is only ever produced by `imp::detect()`.
        // NOTE(review): presumably a successful detection is exactly the
        // precondition `imp::calculate` requires -- confirm in the backend.
        unsafe { imp::calculate(crc, data, fallback) }
    }
}

#[cfg(test)]
mod tests {
    extern crate miniz_sys;
    extern crate rand;
    extern crate rayon;

    use self::rand::Rng;
    use self::rayon::prelude::*;
    use super::Hardware;

    /// Reference checksum from miniz; used both as the expected value and as
    /// the fallback passed to `Hardware::calculate`.
    fn fallback(a: u32, b: &[u8]) -> u32 {
        let (ptr, len) = (b.as_ptr(), b.len());
        // SAFETY: `ptr` and `len` describe the live slice `b`; `mz_crc32` is
        // assumed to only read `len` bytes from `ptr`.
        unsafe { miniz_sys::mz_crc32(a as _, ptr, len) as u32 }
    }

    /// Runs `iters` parallel trials. Each trial picks a random initial CRC
    /// and a random buffer whose length is drawn from `lo..hi`, then checks
    /// that `Hardware::calculate` agrees with the miniz reference.
    fn random_chunks(iters: usize, lo: usize, hi: usize) {
        let hardware = Hardware::detect();

        // `for_each_with` hands each worker a scratch buffer that is reused
        // across the trials it executes.
        (0..iters)
            .into_par_iter()
            .for_each_with(Vec::new(), |buf, _| {
                let mut rng = rand::thread_rng();
                let seed = rng.gen::<u32>();
                let size = rng.gen_range(lo, hi);
                buf.resize(size, 0u8);
                rng.fill(&mut buf[..]);

                let expected = fallback(seed, &buf);
                let actual = hardware.calculate(seed, &buf, fallback);
                assert_eq!(expected, actual);
            });
    }

    #[test]
    fn random_small() {
        random_chunks(1000, 0, 256);
    }

    #[test]
    fn random_med() {
        random_chunks(1000, 256, 16 * 1024);
    }

    #[test]
    fn random_large() {
        random_chunks(1000, 0, 1024 * 1024);
    }

    quickcheck! {
        // Property: for any initial value and byte vector, the detected
        // implementation matches the miniz reference.
        fn prop(crc: u32, xs: Vec<u8>) -> bool {
            let expected = fallback(crc, &xs);
            let actual = Hardware::detect().calculate(crc, &xs, fallback);
            expected == actual
        }
    }
}
12 changes: 12 additions & 0 deletions flate2-crc/src/other.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/// Reports whether an accelerated CRC routine is available.
///
/// This backend provides none, so the answer is always `false`.
#[inline]
pub fn detect() -> bool {
    false
}

/// Placeholder for the accelerated CRC routine; this backend has none.
///
/// # Safety
///
/// The body performs no unsafe operation. The function is `unsafe`
/// presumably so that its signature matches the accelerated backend's --
/// confirm against that module.
///
/// # Panics
///
/// Always panics. Because `detect()` returns `false`, a caller that honours
/// the detection result never reaches this function; reaching it indicates a
/// bug in the caller.
pub unsafe fn calculate(
    _crc: u32,
    _data: &[u8],
    _fallback: fn(u32, &[u8]) -> u32,
) -> u32 {
    unreachable!(
        "flate2-crc: no accelerated CRC implementation in this build; \
         `detect()` returned false, so `calculate` must not be called"
    )
}
Loading

0 comments on commit 9b44592

Please sign in to comment.