Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,29 @@ lazy proxy directly to `cjson.encode` (cjson bypasses metamethods in C); use
`qjson.encode` instead, or call `qjson.materialize(t)` to get a plain Lua table
that any third-party encoder can handle.

`qjson.materialize(t, { keep_origin = true })` keeps lightweight provenance on
the returned plain Lua tables so `qjson.encode` can preserve key order and
reuse selected original tokens. Recording is intentionally threshold-based:

- String children are recorded only when their raw JSON token (including
quotes) is longer than 24 bytes.
- Table children are recorded in the parent only when the child origin is
complete and its raw subtree span is longer than 64 bytes.
- Numbers, booleans, null, and short strings are never recorded, so any
container that directly holds one of them has partial provenance (see below).

Each recorded container tracks whether its provenance is complete:

- `complete = true`: every child needed to prove byte-for-byte identity is
recorded, so an unchanged container can be emitted as the original slice.
- `complete = false`: provenance is partial. Objects still preserve original
key order for existing keys and can reuse recorded large children, but arrays
fall back to the normal encoder path (array-or-object classification followed
by regular encoding).

Because materialized tables are ordinary Lua tables (no dirty-tracking
metatable), `keep_origin` with partial provenance guarantees JSON-equivalent
output, not byte-identical re-emission of every unchanged small token. For
example, an unchanged `1.0` is re-encoded as `1` and an unchanged `"\u0061"`
is re-encoded as `"a"`.

**Native `next` caveat.** `next(t)` is not proxy-aware: it bypasses the
`__pairs` / `__ipairs` hooks and may see qjson implementation fields instead of
JSON fields. Do not use native `next` to iterate a lazy proxy or test whether it
Expand Down
85 changes: 42 additions & 43 deletions lua/qjson/table.lua
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ local TABLE_TYPE_HINT = setmetatable({}, { __mode = "k" })
-- Weak side-table for keep_origin materialization metadata.
-- Maps materialized table -> provenance record used by qjson.encode.
-- Keys are weak (__mode = "k"), so an entry does not keep its table alive.
local TABLE_ORIGIN = setmetatable({}, { __mode = "k" })
-- Recording thresholds, in bytes of raw JSON. Both comparisons in
-- origin_child_record are strict (`>`), so a span of exactly this size is
-- NOT recorded.
-- Strings: measured on the child's raw token span (be - bs).
local ORIGIN_STRING_MIN_RAW = 24
-- Tables: measured on the child's own origin span (origin.be - origin.bs).
local ORIGIN_TABLE_MIN_RAW = 64

-- Box scratch used for one-shot FFI returns. Reused across calls to avoid
-- per-call allocation; safe because the parent Doc / lazy view holds the
Expand Down Expand Up @@ -548,29 +550,34 @@ local function materialize_plain(v)
return v
end

local function cursor_raw_token(ctx, cursor)
-- Fetch the byte span of the JSON token under `cursor`.
-- @param ctx     view passed to `check` together with the return code
--                (presumably so `check` can raise with context -- `check` is
--                defined outside this view; confirm)
-- @param cursor  cursor handle handed to C.qjson_cursor_bytes
-- @return bs, be numeric offsets such that `source:sub(bs + 1, be)` is the
--                raw token and `be - bs` is its length in bytes
-- NOTE(review): sz_a / sz_b are out-parameter boxes declared elsewhere in the
-- file; they are read immediately after the call.
local function cursor_token_span(ctx, cursor)
local rc = C.qjson_cursor_bytes(cursor, sz_a, sz_b)
check(ctx, rc)
-- Convert the boxed values to plain Lua numbers before returning them.
local bs = tonumber(sz_a[0])
local be = tonumber(sz_b[0])
return ctx._doc._hold:sub(bs + 1, be), bs, be
-- Only the offsets are returned; no substring is created here.
return bs, be
end

local function scalar_origin_record(v, raw_token)
if rawequal(v, _M.null) then
return { tag = "null", raw = raw_token }
end
-- Decide whether a materialized child gets a provenance record in its parent.
-- @param v       materialized child value
-- @param source  source string that `bs` / `be` index into
-- @param bs      start offset of the child's token (used as `bs + 1` for sub)
-- @param be      end offset of the child's token
-- @return record, captured
--   * string whose raw span is longer than ORIGIN_STRING_MIN_RAW:
--       { tag = "string", value = v, raw = <raw token> }, true
--   * table whose own TABLE_ORIGIN entry is complete and spans more than
--     ORIGIN_TABLE_MIN_RAW bytes:
--       { tag = "table", origin = <that entry> }, true
--   * anything else (numbers, booleans, null, short strings, small or
--     incomplete tables): nil, false
-- The visible callers set their own `complete` flag to false whenever
-- `captured` is false.
local function origin_child_record(v, source, bs, be)
local raw_len = be - bs
local tv = type(v)
if tv == "string" then
return { tag = "string", value = v, raw = raw_token }
end
if tv == "number" then
return { tag = "number", value = v, raw = raw_token }
-- Strict comparison: a token of exactly ORIGIN_STRING_MIN_RAW bytes is skipped.
if raw_len > ORIGIN_STRING_MIN_RAW then
-- The raw token is sliced out of `source` only after the threshold check.
return { tag = "string", value = v, raw = source:sub(bs + 1, be) }, true
end
return nil, false
end
if tv == "boolean" then
return { tag = "boolean", value = v, raw = raw_token }
if tv == "table" then
-- The child's span is taken from its own origin record, not from bs / be.
local child_origin = TABLE_ORIGIN[v]
if child_origin ~= nil
and child_origin.complete == true
and (child_origin.be - child_origin.bs) > ORIGIN_TABLE_MIN_RAW
then
return { tag = "table", origin = child_origin }, true
end
return nil, false
end
return nil
return nil, false
end

local materialize_with_origin
Expand All @@ -581,6 +588,7 @@ local function materialize_object_with_origin(view)
local records = {}
local seen = {}
local had_duplicates = false
local complete = true
local it = new_object_iter(view)

while true do
Expand All @@ -596,7 +604,7 @@ local function materialize_object_with_origin(view)
had_duplicates = true
end

local raw_token = cursor_raw_token(view, child_box[0])
local bs, be = cursor_token_span(view, child_box[0])
local child
if count == 1 then
local cached = cached_child(view, key)
Expand All @@ -614,16 +622,16 @@ local function materialize_object_with_origin(view)
local materialized_child = materialize_with_origin(child)
out[key] = materialized_child

local record = scalar_origin_record(materialized_child, raw_token)
local child_origin = type(materialized_child) == "table" and TABLE_ORIGIN[materialized_child] or nil
if record == nil and child_origin ~= nil then
record = { tag = "table", origin = child_origin }
local record, captured = origin_child_record(materialized_child, view._doc._hold, bs, be)
if not captured then
complete = false
end
records[key] = record
end

TABLE_ORIGIN[out] = {
kind = "object",
complete = complete,
source = view._doc._hold,
bs = view._bs,
be = view._be,
Expand All @@ -638,22 +646,22 @@ end
local function materialize_array_with_origin(view)
local out = {}
local records = {}
local complete = true
local i = 0
while true do
local rc = C.qjson_cursor_index(view._cur, i, child_box)
if rc == QJSON_NOT_FOUND then break end
check(view, rc, T_ARR)
local raw_token = cursor_raw_token(view, child_box[0])
local bs, be = cursor_token_span(view, child_box[0])
local idx = i + 1
local cached = rawget(view, idx)
local child = cached or decode_cursor(view, child_box)
local materialized_child = materialize_with_origin(child)
out[idx] = materialized_child

local record = scalar_origin_record(materialized_child, raw_token)
local child_origin = type(materialized_child) == "table" and TABLE_ORIGIN[materialized_child] or nil
if record == nil and child_origin ~= nil then
record = { tag = "table", origin = child_origin }
local record, captured = origin_child_record(materialized_child, view._doc._hold, bs, be)
if not captured then
complete = false
end
records[idx] = record
i = idx
Expand All @@ -663,6 +671,7 @@ local function materialize_array_with_origin(view)
end
TABLE_ORIGIN[out] = {
kind = "array",
complete = complete,
source = view._doc._hold,
bs = view._bs,
be = view._be,
Expand Down Expand Up @@ -973,13 +982,7 @@ local function origin_record_matches(record, value, depth, active)
return false
end
local tag = record.tag
if tag == "null" then
return rawequal(value, _M.null)
elseif tag == "boolean" then
return type(value) == "boolean" and value == record.value
elseif tag == "number" then
return type(value) == "number" and value == record.value
elseif tag == "string" then
if tag == "string" then
return type(value) == "string" and value == record.value
elseif tag == "table" then
if type(value) ~= "table" then
Expand Down Expand Up @@ -1015,6 +1018,9 @@ local function origin_table_slice(origin)
end

origin_object_fully_matches = function(t, origin, depth, active)
if origin.complete ~= true then
return false
end
if origin.had_duplicates then
return false
end
Expand All @@ -1036,6 +1042,9 @@ origin_object_fully_matches = function(t, origin, depth, active)
end

origin_array_fully_matches = function(t, origin, depth, active)
if origin.complete ~= true then
return false
end
if depth > ENCODE_MAX_DEPTH then
error(ENCODE_DEPTH_ERROR)
end
Expand Down Expand Up @@ -1067,26 +1076,16 @@ local function encode_origin_child(value, depth, active, record)
then
return record.raw
end
if record.tag == "null" and rawequal(value, _M.null) then
return record.raw
end
if record.tag == "boolean"
and type(value) == "boolean"
and value == record.value
then
return record.raw
end
end
-- Numeric scalars intentionally do not reuse raw lexical form when a
-- parent container is being walked; use the normal number encoder.
-- Small scalars and incomplete child tables are re-encoded.
return encode(value, depth + 1, active)
end

local function encode_object_with_origin(t, depth, active, origin)
if depth > ENCODE_MAX_DEPTH then
error(ENCODE_DEPTH_ERROR)
end
if origin_object_fully_matches(t, origin, depth, active) then
if origin.complete == true and origin_object_fully_matches(t, origin, depth, active) then
return origin_table_slice(origin)
end

Expand Down Expand Up @@ -1124,7 +1123,7 @@ local function encode_array_with_origin(t, depth, active, origin)
if depth > ENCODE_MAX_DEPTH then
error(ENCODE_DEPTH_ERROR)
end
if origin_array_fully_matches(t, origin, depth, active) then
if origin.complete == true and origin_array_fully_matches(t, origin, depth, active) then
return origin_table_slice(origin)
end
local kind, max = classify_plain_table(t)
Expand Down
101 changes: 89 additions & 12 deletions tests/lua/origin_materialize_spec.lua
Original file line number Diff line number Diff line change
@@ -1,5 +1,24 @@
local qjson = require("qjson")
local cjson = require("cjson")
-- Five \uXXXX escapes each: 30 raw bytes, 32 with the surrounding quotes,
-- which is above the 24-byte string recording threshold.
local LONG_ESC_A = "\\u0061\\u0062\\u0063\\u0064\\u0065"
local LONG_ESC_B = "\\u0066\\u0067\\u0068\\u0069\\u006A"
-- 22 raw bytes, exactly 24 with quotes: on the threshold, not above it.
local EXACT_24_ESC = "\\u0061\\u0062abcdefghij"
-- 56 bytes, so that '{"a":"<value>"}' is exactly 64 bytes long.
local EXACT_64_CHILD_VALUE = string.rep("a", 56)

--- Run `fn` while `string.sub` is temporarily replaced by a counting wrapper.
-- The real `string.sub` is always restored before this function returns or
-- re-raises, so a failing `fn` cannot leave the patch in place.
-- @param fn  zero-argument function to execute under instrumentation
-- @return number of times `string.sub` was invoked while `fn` ran
-- Re-raises any error from `fn` unchanged (level 0, no position prefix).
local function count_string_sub_calls(fn)
  local real_sub = string.sub
  local hits = 0

  local function counting_sub(...)
    hits = hits + 1
    return real_sub(...)
  end

  rawset(string, "sub", counting_sub)
  local succeeded, failure = pcall(fn)
  -- Undo the patch first; only then decide whether to propagate a failure.
  rawset(string, "sub", real_sub)

  if succeeded then
    return hits
  end
  error(failure, 0)
end

describe("qjson.materialize keep_origin", function()
it("keeps default materialize semantics when keep_origin is not set", function()
Expand All @@ -25,11 +44,37 @@ describe("qjson.materialize keep_origin", function()
end, "qjson.materialize: opts.keep_origin must be a boolean")
end)

it("reuses unchanged escaped string token when parent is changed", function()
it("does not guarantee reuse for short escaped strings when parent is changed", function()
local t = qjson.materialize(qjson.decode('{"blob":"\\u0061","x":1}'), { keep_origin = true })
t.x = 2

assert.are.equal('{"blob":"\\u0061","x":2}', qjson.encode(t))
assert.are.equal('{"blob":"a","x":2}', qjson.encode(t))
end)

it("does not slice raw tokens for dropped provenance records", function()
  -- Every child below is a small scalar or a small container, so
  -- materializing with keep_origin is expected to make no string.sub call.
  local doc = qjson.decode('{"n":1,"b":true,"u":null,"s":"x","arr":[1,2],"obj":{"x":1}}')

  local function materialize_doc()
    qjson.materialize(doc, { keep_origin = true })
  end
  local observed = count_string_sub_calls(materialize_doc)

  assert.are.equal(0, observed)
end)

it("does not treat an exact 24-byte string token as above threshold", function()
  -- Guard the fixture itself: the quoted token must be exactly 24 bytes.
  local quoted_len = #('"' .. EXACT_24_ESC .. '"')
  assert.are.equal(24, quoted_len)

  local src = '{"blob":"' .. EXACT_24_ESC .. '","x":1}'
  local doc = qjson.decode(src)
  local t = qjson.materialize(doc, { keep_origin = true })
  t.x = 2

  -- The escapes come back decoded and re-encoded, not as the raw token.
  local encoded = qjson.encode(t)
  assert.are.equal('{"blob":"ababcdefghij","x":2}', encoded)
end)

it("reuses unchanged escaped string token when raw token is above threshold", function()
  local doc = qjson.decode('{"blob":"' .. LONG_ESC_A .. '","x":1}')
  local t = qjson.materialize(doc, { keep_origin = true })

  -- Change a sibling only; the long escaped token must survive verbatim.
  t.x = 2

  local expected = '{"blob":"' .. LONG_ESC_A .. '","x":2}'
  assert.are.equal(expected, qjson.encode(t))
end)

it("falls back to normal escaping for changed string children", function()
Expand All @@ -39,15 +84,30 @@ describe("qjson.materialize keep_origin", function()
assert.are.equal('{"blob":"line1\\nline2","x":1}', qjson.encode(t))
end)

it("reuses unchanged nested object and array siblings when parent is changed", function()
local src = '{"x":0,"obj":{"k":"\\u0061"},"arr":[1, 2 ,3]}'
it("re-emits small-scalar containers field-by-field when unmodified", function()
local src = '{ "n":1.0, "s":"\\u0061", "b":true, "u":null }'
local t = qjson.materialize(qjson.decode(src), { keep_origin = true })
t.x = 9

local out = qjson.encode(t)
assert.is_truthy(string.find(out, '"obj":{"k":"\\u0061"}', 1, true))
assert.is_truthy(string.find(out, '"arr":[1, 2 ,3]', 1, true))
assert.are.equal(9, cjson.decode(out).x)

assert.are.equal('{"n":1,"s":"a","b":true,"u":null}', out)
assert.are_not.equal(src, out)
end)

it("returns original slice for unmodified containers with complete large children", function()
  -- Unusual whitespace makes a verbatim slice distinguishable from a re-encode.
  local src = '{ "a":"' .. LONG_ESC_A .. '" , "b":"' .. LONG_ESC_B .. '" }'
  local doc = qjson.decode(src)
  local t = qjson.materialize(doc, { keep_origin = true })

  local encoded = qjson.encode(t)
  assert.are.equal(src, encoded)
end)

it("does not treat an exact 64-byte child container as above threshold", function()
  local child = '{"a":"' .. EXACT_64_CHILD_VALUE .. '"}'
  -- Guard the fixture itself: the child must sit exactly on the threshold.
  assert.are.equal(64, #child)

  local src = '{ "child" : ' .. child .. ' }'
  local doc = qjson.decode(src)
  local t = qjson.materialize(doc, { keep_origin = true })

  -- The parent's whitespace is expected to be dropped in the output.
  local expected = '{"child":' .. child .. '}'
  assert.are.equal(expected, qjson.encode(t))
end)

it("does not reintroduce duplicate keys after materialization", function()
Expand All @@ -67,13 +127,20 @@ describe("qjson.materialize keep_origin", function()
assert.are.equal('{"n":1,"e":1000,"z":0,"x":2}', qjson.encode(t))
end)

it("does not hide nested table mutations behind a parent raw slice", function()
it("partial origins do not hide nested table mutations behind a parent raw slice", function()
local t = qjson.materialize(qjson.decode('{"a":{"x":1},"b":2}'), { keep_origin = true })
t.a.x = 9

assert.are.equal('{"a":{"x":9},"b":2}', qjson.encode(t))
end)

it("falls back to normal array/object classification for incomplete arrays", function()
  -- Source whitespace is expected to be dropped from the encoded output.
  local doc = qjson.decode('[ 1 , 2 , 3 ]')
  local t = qjson.materialize(doc, { keep_origin = true })

  local encoded = qjson.encode(t)
  assert.are.equal("[1,2,3]", encoded)
end)

it("still reports circular references after materialization", function()
local t = qjson.materialize(qjson.decode('{"a":1}'), { keep_origin = true })
t.self = t
Expand Down Expand Up @@ -128,13 +195,23 @@ describe("qjson.materialize keep_origin", function()

it("keeps source bytes alive for provenance-backed reuse", function()
local function materialized()
local src = '{"blob":"\\u0061","x":1}'
local src = '{"blob":"' .. LONG_ESC_A .. '","x":1}'
return qjson.materialize(qjson.decode(src), { keep_origin = true })
end
local t = materialized()
collectgarbage("collect")
t.x = 2

assert.are.equal('{"blob":"\\u0061","x":2}', qjson.encode(t))
assert.are.equal('{"blob":"' .. LONG_ESC_A .. '","x":2}', qjson.encode(t))
end)

it("reuses large complete child subtrees when parent is modified", function()
  local src = '{"x":0,"big": { "a":"' .. LONG_ESC_A .. '" , "b":"' .. LONG_ESC_B .. '" }}'
  local doc = qjson.decode(src)
  local t = qjson.materialize(doc, { keep_origin = true })
  t.x = 9

  local out = qjson.encode(t)

  -- The edit to the parent must be visible in the output.
  local decoded = cjson.decode(out)
  assert.are.equal(9, decoded.x)

  -- The large child must appear with its original inner whitespace intact.
  local needle = '"big":{ "a":"' .. LONG_ESC_A .. '" , "b":"' .. LONG_ESC_B .. '" }'
  local found_at = string.find(out, needle, 1, true)
  assert.is_truthy(found_at)
end)
end)
Loading