Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 111 additions & 7 deletions benches/lua_bench.lua
Original file line number Diff line number Diff line change
Expand Up @@ -62,17 +62,36 @@ local function make_payload(target_bytes)
.. '[{"role":"user","content":[' .. table.concat(parts, ",") .. ']}]}'
end

local ROUNDS = 5

local function bench(name, iters, fn)
-- Warmup pass: lets JIT compile hot traces and any one-time pools fill
-- before measurement starts. Excluded from timing and memory delta.
local warmup = math.max(3, math.floor(iters / 5))
for _ = 1, warmup do fn() end

collectgarbage("collect")
local mem_before = collectgarbage("count")
local t0 = os.clock()
for _ = 1, iters do fn() end
local t1 = os.clock()

local ops = {}
for r = 1, ROUNDS do
local t0 = os.clock()
for _ = 1, iters do fn() end
local t1 = os.clock()
ops[r] = iters / (t1 - t0)
end
local mem_after = collectgarbage("count")
local elapsed = t1 - t0
print(string.format("%-44s %7.2fms total %10.0f ops/s %+8.1fKB",
name, elapsed * 1000, iters / elapsed,
mem_after - mem_before))

table.sort(ops)
local median = ops[math.ceil(ROUNDS / 2)]
local lo, hi = ops[1], ops[ROUNDS]
local sum = 0
for i = 1, ROUNDS do sum = sum + ops[i] end
local mean = sum / ROUNDS

print(string.format(
"%-44s median %9.0f ops/s mean %9.0f range %7.0f..%-9.0f %+8.1fKB",
name, median, mean, lo, hi, mem_after - mem_before))
end

local scenarios = {
Expand All @@ -87,6 +106,11 @@ local scenarios = {
{name = "10m", iters = 20, payload = make_payload(10 * 1024 * 1024)},
}

-- The pooled API (qd.new_decoder + :parse) only exists on commits that
-- landed the Decoder refactor. Probe so the bench still runs on older builds.
local has_pooled_api = type(qd.new_decoder) == "function"
local pooled_decoder = has_pooled_api and qd.new_decoder() or nil

for _, s in ipairs(scenarios) do
print(string.format("=== %s (%d bytes) ===", s.name, #s.payload))

Expand All @@ -103,4 +127,84 @@ for _, s in ipairs(scenarios) do
local _ = d:get_f64("temperature")
local _ = d:get_str("messages[0].role")
end)

if has_pooled_api then
bench("quickdecode pooled :parse + access 3 fields", s.iters, function()
local d = pooled_decoder:parse(s.payload)
local _ = d:get_str("model")
local _ = d:get_f64("temperature")
local _ = d:get_str("messages[0].role")
end)

-- One-shot-per-request pattern: each iter creates a fresh decoder,
-- parses once, and lets both decoder and doc fall to GC. No reuse.
-- This is the typical "user does not cache the decoder" path.
bench("quickdecode new_decoder()+parse (one-shot)", s.iters, function()
local dec = qd.new_decoder()
local d = dec:parse(s.payload)
local _ = d:get_str("model")
local _ = d:get_f64("temperature")
local _ = d:get_str("messages[0].role")
end)
end
end

-- Interleaved scenario: cycle through several payloads of different sizes
-- back-to-back, mirroring a server processing variable-size requests. The
-- single-payload loops above hand the allocator the same block over and over
-- and have no allocation to amortize away — they cannot exercise the doc
-- pool. This scenario can.
local function scenario_by_name(n)
for _, s in ipairs(scenarios) do
if s.name == n then return s end
end
error("no scenario " .. n)
end

local interleaved_names = {"100k", "200k", "500k", "1m"}
local interleaved = {}
for _, n in ipairs(interleaved_names) do
interleaved[#interleaved + 1] = scenario_by_name(n).payload
end

local function make_cycler(items)
local i = 0
local n = #items
return function()
i = i + 1
return items[((i - 1) % n) + 1]
end
end

print(string.format("=== interleaved %s ===", table.concat(interleaved_names, ",")))

do
local next_p = make_cycler(interleaved)
bench("cjson.decode + access 3 fields", 400, function()
local p = next_p()
local obj = cjson.decode(p)
local _ = obj.model
local _ = obj.temperature
local _ = obj.messages and obj.messages[1] and obj.messages[1].role
end)

next_p = make_cycler(interleaved)
bench("quickdecode.parse + access 3 fields", 400, function()
local p = next_p()
local d = qd.parse(p)
local _ = d:get_str("model")
local _ = d:get_f64("temperature")
local _ = d:get_str("messages[0].role")
end)

if has_pooled_api then
next_p = make_cycler(interleaved)
bench("quickdecode pooled :parse + access 3 fields", 400, function()
local p = next_p()
local d = pooled_decoder:parse(p)
local _ = d:get_str("model")
local _ = d:get_f64("temperature")
local _ = d:get_str("messages[0].role")
end)
end
end
60 changes: 60 additions & 0 deletions benches/perf_probe.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
-- Minimal probe for perf: hammers qd.parse on a fixed 100K payload so perf
-- samples concentrate on the FFI entry + parse hot path. Not a benchmark —
-- there is no timing or memory accounting here, just sustained work.

package.path = package.path .. ";./lua/?.lua"
package.cpath = package.cpath .. ";./target/release/lib?.so"

local qd = require("quickdecode")

-- Same payload generator as lua_bench.lua so probe output corresponds to
-- the same shape the bench measures. Park-Miller LCG keeps it deterministic.
local function make_payload(target_bytes)
local rng_state = 42
local function rng_range(lo, hi)
rng_state = (rng_state * 48271) % 2147483647
return lo + (rng_state % (hi - lo + 1))
end

local text = string.rep("Q", 1500)
local text_part = '{"type":"text","text":"' .. text .. '"}'
local parts = { text_part }
local current = 200 + #text_part

while current < target_bytes do
local remaining = target_bytes - current
local img_size
if remaining < 50 * 1024 then
img_size = math.max(1024, remaining)
else
local upper = math.min(500 * 1024, remaining)
img_size = rng_range(50 * 1024, upper)
end
local b64 = string.rep("A", img_size)
local img_part = '{"type":"image_url","image_url":{"url":"data:image/jpeg;base64,'
.. b64 .. '"}}'
parts[#parts + 1] = img_part
current = current + #img_part + 1
end

return '{"model":"gpt-4-vision","temperature":0.7,"messages":'
.. '[{"role":"user","content":[' .. table.concat(parts, ",") .. ']}]}'
end

local payload = make_payload(100 * 1024)
local iters = tonumber(arg[1]) or 500000

-- Warmup so JIT traces compile before perf starts sampling steady state.
for _ = 1, 1000 do
local d = qd.parse(payload)
local _ = d:get_str("model")
end

io.stderr:write(string.format("probe: %d bytes payload, %d iters\n", #payload, iters))

for _ = 1, iters do
local d = qd.parse(payload)
local _ = d:get_str("model")
local _ = d:get_f64("temperature")
local _ = d:get_str("messages[0].role")
end
58 changes: 46 additions & 12 deletions src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,36 @@ pub struct qjd_doc {
owns_decoder: bool,
}

/// Heap layout for the one-shot [`qjd_parse`] path: `qjd_doc` and its
/// private `qjd_decoder` live in the same allocation. `#[repr(C)]` pins
/// `doc` at offset 0 so the public `*mut qjd_doc` returned to callers can
/// be cast back to `*mut OwnedDocBlock` on free.
#[repr(C)]
struct OwnedDocBlock {
doc: qjd_doc,
decoder: qjd_decoder,
}

// ── Entry-point safety helpers ──────────────────────────────────────────────

/// Validate `doc` and return the live decoder. Order matters: a destroyed
/// decoder is reported as `QJD_INVALID_ARG`, not `QJD_STALE_DOC`.
///
/// Docs created by [`qjd_parse`] (`owns_decoder == true`) take a fast path:
/// the private decoder is unreachable to any of the `qjd_decoder_*` mutating
/// operations, so state and gen are pristine by construction.
unsafe fn check_doc_alive(doc: *mut qjd_doc) -> Result<&'static Decoder, qjd_err> {
if doc.is_null() { return Err(qjd_err::QJD_INVALID_ARG); }
let d = &*doc;
if d.owns_decoder {
// Legacy: the decoder is the sibling field in the same OwnedDocBlock.
// Compute its address from the static struct offset — no pointer load,
// matching the pre-pool layout where the decoder sat directly inside
// the doc allocation.
let block_ptr = doc as *const OwnedDocBlock;
let dec: &Decoder = &(*block_ptr).decoder.0;
return Ok(std::mem::transmute::<&Decoder, &'static Decoder>(dec));
}
let dec: &Decoder = &d.decoder.as_ref().0;
if matches!(dec.state, DecoderState::Destroyed) {
return Err(qjd_err::QJD_INVALID_ARG);
Expand Down Expand Up @@ -146,20 +169,27 @@ pub unsafe extern "C" fn qjd_parse(
if !err_out.is_null() { *err_out = qjd_err::QJD_INVALID_ARG as c_int; }
return ptr::null_mut();
}
let dec_ptr = Box::into_raw(Box::new(qjd_decoder(Decoder::new())));
// Allocate the block first with a fresh Decoder, then parse in-place.
// This avoids the ~100-byte stack→heap memcpy a naive `let mut decoder
// = Decoder::new(); ... Box::new(...decoder)` path pays per parse.
let block_ptr = Box::into_raw(Box::new(OwnedDocBlock {
doc: qjd_doc {
decoder: NonNull::dangling(),
gen: 0,
owns_decoder: true,
},
decoder: qjd_decoder(Decoder::new()),
}));
(*block_ptr).doc.decoder = NonNull::new_unchecked(&mut (*block_ptr).decoder);
let slice: &[u8] = std::slice::from_raw_parts(buf, len);
match (*dec_ptr).0.parse(slice) {
match (*block_ptr).decoder.0.parse(slice) {
Ok(()) => {
(*block_ptr).doc.gen = (*block_ptr).decoder.0.gen;
*err_out = qjd_err::QJD_OK as c_int;
let doc = qjd_doc {
decoder: NonNull::new_unchecked(dec_ptr),
gen: (*dec_ptr).0.gen,
owns_decoder: true,
};
Box::into_raw(Box::new(doc))
&mut (*block_ptr).doc
}
Err(e) => {
let _ = Box::from_raw(dec_ptr);
let _ = Box::from_raw(block_ptr);
*err_out = e as c_int;
ptr::null_mut()
}
Expand Down Expand Up @@ -187,9 +217,13 @@ pub unsafe extern "C" fn qjd_parse(
#[no_mangle]
pub unsafe extern "C" fn qjd_free(doc: *mut qjd_doc) {
if doc.is_null() { return; }
let d = Box::from_raw(doc);
if d.owns_decoder {
let _ = Box::from_raw(d.decoder.as_ptr());
// Read owns_decoder before taking ownership: the legacy doc lives inside
// an OwnedDocBlock and must be freed through that layout, while a pool
// doc is a standalone Box.
if (*doc).owns_decoder {
let _ = Box::from_raw(doc as *mut OwnedDocBlock);
} else {
let _ = Box::from_raw(doc);
}
}

Expand Down
Loading