From 5a5e3e0975a5448b7e8bd80071cca0486de0493e Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Mon, 1 Jun 2026 00:39:20 +0800 Subject: [PATCH] feat(lua): reuse origin slices for materialized encode --- lua/qjson/table.lua | 352 +++++++++++++++++++++++++- tests/lua/origin_materialize_spec.lua | 140 ++++++++++ 2 files changed, 488 insertions(+), 4 deletions(-) create mode 100644 tests/lua/origin_materialize_spec.lua diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua index 6ec6ac8..e85dc93 100644 --- a/lua/qjson/table.lua +++ b/lua/qjson/table.lua @@ -27,6 +27,10 @@ end -- user-visible keys. Maps materialized table → "object" | "array". local TABLE_TYPE_HINT = setmetatable({}, { __mode = "k" }) +-- Weak side-table for keep_origin materialization metadata. +-- Maps materialized table -> provenance record used by qjson.encode. +local TABLE_ORIGIN = setmetatable({}, { __mode = "k" }) + -- Box scratch used for one-shot FFI returns. Reused across calls to avoid -- per-call allocation; safe because the parent Doc / lazy view holds the -- buffer alive and these are read-and-copy. @@ -506,7 +510,7 @@ function _M.decode(json_str) end end -local function materialize(v) +local function materialize_plain(v) local mt = (type(v) == "table") and getmetatable(v) or nil if mt == LazyObject then local out = {} @@ -520,13 +524,13 @@ local function materialize(v) for _, k in ipairs(keys) do local val = values[k] assert(val ~= nil, "qjson: internal invariant violated (ORDER_VALUES missing key " .. tostring(k) .. ")") - out[k] = materialize(val) + out[k] = materialize_plain(val) end else -- Not yet materialized: use cursor-based walk for _, kv in ipairs(materialize_object_contents(v)) do local child = cached_child(v, kv[1]) or kv[2] - out[kv[1]] = materialize(child) + out[kv[1]] = materialize_plain(child) end end return out @@ -534,7 +538,7 @@ local function materialize(v) local raw = materialize_array_contents(v) local out = {} for i, x in ipairs(raw) do - out[i] = materialize(x) + out[i] = materialize_plain(rawget(v, i) or x) end if #out == 0 then setmetatable(out, _M.empty_array_mt) @@ -544,6 +548,167 @@ local function materialize(v) return v end +local function cursor_raw_token(ctx, cursor) + local rc = C.qjson_cursor_bytes(cursor, sz_a, sz_b) + check(ctx, rc) + local bs = tonumber(sz_a[0]) + local be = tonumber(sz_b[0]) + return ctx._doc._hold:sub(bs + 1, be), bs, be +end + +local function scalar_origin_record(v, raw_token) + if rawequal(v, _M.null) then + return { tag = "null", raw = raw_token } + end + local tv = type(v) + if tv == "string" then + return { tag = "string", value = v, raw = raw_token } + end + if tv == "number" then + return { tag = "number", value = v, raw = raw_token } + end + if tv == "boolean" then + return { tag = "boolean", value = v, raw = raw_token } + end + return nil +end + +local materialize_with_origin + +local function materialize_object_with_origin(view) + local out = {} + local keys = {} + local records = {} + local seen = {} + local had_duplicates = false + local it = new_object_iter(view) + + while true do + local rc = C.qjson_iter_next(it, strp_box, size_box, child_box) + if rc == QJSON_NOT_FOUND then break end + check(view, rc) + local key = ffi.string(strp_box[0], size_box[0]) + local count = (seen[key] or 0) + 1 + seen[key] = count + if count == 1 then + keys[#keys + 1] = key + else + had_duplicates = true + end + + local raw_token = cursor_raw_token(view, child_box[0]) + local child + if count == 1 then + local cached = cached_child(view, key) + if cached ~= nil then + child = cached + else + child = decode_cursor(view, child_box) + end + else + -- Duplicate keys must read from lexical order for deterministic + -- last-wins collapse and provenance snapshots. + child = decode_cursor(view, child_box) + end + + local materialized_child = materialize_with_origin(child) + out[key] = materialized_child + + local record = scalar_origin_record(materialized_child, raw_token) + local child_origin = type(materialized_child) == "table" and TABLE_ORIGIN[materialized_child] or nil + if record == nil and child_origin ~= nil then + record = { tag = "table", origin = child_origin } + end + records[key] = record + end + + TABLE_ORIGIN[out] = { + kind = "object", + source = view._doc._hold, + bs = view._bs, + be = view._be, + keys = keys, + records = records, + had_duplicates = had_duplicates, + } + TABLE_TYPE_HINT[out] = "object" + return out +end + +local function materialize_array_with_origin(view) + local out = {} + local records = {} + local i = 0 + while true do + local rc = C.qjson_cursor_index(view._cur, i, child_box) + if rc == QJSON_NOT_FOUND then break end + check(view, rc, T_ARR) + local raw_token = cursor_raw_token(view, child_box[0]) + local idx = i + 1 + local cached = rawget(view, idx) + local child = cached or decode_cursor(view, child_box) + local materialized_child = materialize_with_origin(child) + out[idx] = materialized_child + + local record = scalar_origin_record(materialized_child, raw_token) + local child_origin = type(materialized_child) == "table" and TABLE_ORIGIN[materialized_child] or nil + if record == nil and child_origin ~= nil then + record = { tag = "table", origin = child_origin } + end + records[idx] = record + i = idx + end + if #out == 0 then + setmetatable(out, _M.empty_array_mt) + end + TABLE_ORIGIN[out] = { + kind = "array", + source = view._doc._hold, + bs = view._bs, + be = view._be, + len = #out, + records = records, + } + return out +end + +materialize_with_origin = function(v) + local mt = (type(v) == "table") and getmetatable(v) or nil + if mt == LazyObject then + if rawget(v, "_dirty") then + return materialize_plain(v) + end + return materialize_object_with_origin(v) + elseif mt == LazyArray then + if rawget(v, "_dirty") then + return materialize_plain(v) + end + return materialize_array_with_origin(v) + end + return v +end + +local function parse_materialize_opts(opts) + if opts == nil then + return false + end + if type(opts) ~= "table" then + error("qjson.materialize: opts must be a table") + end + local keep_origin = opts.keep_origin + if keep_origin ~= nil and type(keep_origin) ~= "boolean" then + error("qjson.materialize: opts.keep_origin must be a boolean") + end + return keep_origin == true +end + +local function materialize(v, opts) + if parse_materialize_opts(opts) then + return materialize_with_origin(v) + end + return materialize_plain(v) +end + _M.materialize = materialize local string_byte = string.byte @@ -800,9 +965,188 @@ local function encode_object(t, depth, active) return "{" .. table.concat(parts, ",") .. "}" end +local origin_object_fully_matches +local origin_array_fully_matches + +local function origin_record_matches(record, value, depth, active) + if record == nil then + return false + end + local tag = record.tag + if tag == "null" then + return rawequal(value, _M.null) + elseif tag == "boolean" then + return type(value) == "boolean" and value == record.value + elseif tag == "number" then + return type(value) == "number" and value == record.value + elseif tag == "string" then + return type(value) == "string" and value == record.value + elseif tag == "table" then + if type(value) ~= "table" then + return false + end + if active[value] then + error(ENCODE_CYCLE_ERROR) + end + local origin = TABLE_ORIGIN[value] + if origin == nil or origin ~= record.origin then + return false + end + if depth > ENCODE_MAX_DEPTH then + error(ENCODE_DEPTH_ERROR) + end + active[value] = true + local ok + if origin.kind == "object" then + ok = origin_object_fully_matches(value, origin, depth, active) + elseif origin.kind == "array" then + ok = origin_array_fully_matches(value, origin, depth, active) + else + ok = false + end + active[value] = nil + return ok + end + return false +end + +local function origin_table_slice(origin) + return origin.source:sub(origin.bs + 1, origin.be) +end + +origin_object_fully_matches = function(t, origin, depth, active) + if origin.had_duplicates then + return false + end + if depth > ENCODE_MAX_DEPTH then + error(ENCODE_DEPTH_ERROR) + end + for k, v in pairs(t) do + local record = origin.records[k] + if not origin_record_matches(record, v, depth + 1, active) then + return false + end + end + for _, k in ipairs(origin.keys) do + if rawget(t, k) == nil then + return false + end + end + return true +end + +origin_array_fully_matches = function(t, origin, depth, active) + if depth > ENCODE_MAX_DEPTH then + error(ENCODE_DEPTH_ERROR) + end + if #t ~= origin.len then + return false + end + for i = 1, origin.len do + if not origin_record_matches(origin.records[i], rawget(t, i), depth + 1, active) then + return false + end + end + for k, _ in pairs(t) do + if type(k) ~= "number" + or k ~= math.floor(k) + or k < 1 + or k > origin.len + then + return false + end + end + return true +end + +local function encode_origin_child(value, depth, active, record) + if record ~= nil then + if record.tag == "string" + and type(value) == "string" + and value == record.value + then + return record.raw + end + if record.tag == "null" and rawequal(value, _M.null) then + return record.raw + end + if record.tag == "boolean" + and type(value) == "boolean" + and value == record.value + then + return record.raw + end + end + -- Numeric scalars intentionally do not reuse raw lexical form when a + -- parent container is being walked; use the normal number encoder. + return encode(value, depth + 1, active) +end + +local function encode_object_with_origin(t, depth, active, origin) + if depth > ENCODE_MAX_DEPTH then + error(ENCODE_DEPTH_ERROR) + end + if origin_object_fully_matches(t, origin, depth, active) then + return origin_table_slice(origin) + end + + local parts = {} + local emitted = {} + local records = origin.records + + for _, key in ipairs(origin.keys) do + local value = rawget(t, key) + if value ~= nil then + emitted[key] = true + parts[#parts + 1] = encode_string(key) .. ":" .. encode_origin_child(value, depth, active, records[key]) + end + end + + for key, value in pairs(t) do + if not emitted[key] then + local key_type = type(key) + local encoded_key + if key_type == "string" then + encoded_key = key + elseif key_type == "number" then + encoded_key = tostring(key) + else + error("qjson.encode: object key must be a string or number, got " .. key_type) + end + parts[#parts + 1] = encode_string(encoded_key) .. ":" .. encode(value, depth + 1, active) + end + end + + return "{" .. table.concat(parts, ",") .. "}" +end + +local function encode_array_with_origin(t, depth, active, origin) + if depth > ENCODE_MAX_DEPTH then + error(ENCODE_DEPTH_ERROR) + end + if origin_array_fully_matches(t, origin, depth, active) then + return origin_table_slice(origin) + end + local kind, max = classify_plain_table(t) + if kind == "array" then + return encode_array(t, depth, active, max) + end + return encode_object(t, depth, active) +end + -- Dispatch for plain (non-lazy) tables. Separated from the main encode -- function to keep the lazy-proxy fast path narrow for LuaJIT traces. local function encode_plain_table(v, depth, active) + local origin = TABLE_ORIGIN[v] + if origin ~= nil then + if origin.kind == "object" then + return encode_object_with_origin(v, depth, active, origin) + end + if origin.kind == "array" then + return encode_array_with_origin(v, depth, active, origin) + end + end + local mt = getmetatable(v) if mt == _M.empty_array_mt then return encode_array(v, depth, active, #v) diff --git a/tests/lua/origin_materialize_spec.lua b/tests/lua/origin_materialize_spec.lua new file mode 100644 index 0000000..cd6c9b4 --- /dev/null +++ b/tests/lua/origin_materialize_spec.lua @@ -0,0 +1,140 @@ +local qjson = require("qjson") +local cjson = require("cjson") + +describe("qjson.materialize keep_origin", function() + it("keeps default materialize semantics when keep_origin is not set", function() + local src = '{"blob":"\\u0061"}' + local t = qjson.materialize(qjson.decode(src)) + + assert.is_nil(getmetatable(t)) + assert.are.equal("a", t.blob) + assert.are.equal('{"blob":"a"}', qjson.encode(t)) + end) + + it("accepts keep_origin=true and validates options", function() + local t = qjson.materialize(qjson.decode('{"a":1}'), { keep_origin = true }) + assert.is_nil(getmetatable(t)) + assert.are.equal(1, t.a) + + assert.has_error(function() + qjson.materialize(qjson.decode('{"a":1}'), true) + end, "qjson.materialize: opts must be a table") + + assert.has_error(function() + qjson.materialize(qjson.decode('{"a":1}'), { keep_origin = 1 }) + end, "qjson.materialize: opts.keep_origin must be a boolean") + end) + + it("reuses unchanged escaped string token when parent is changed", function() + local t = qjson.materialize(qjson.decode('{"blob":"\\u0061","x":1}'), { keep_origin = true }) + t.x = 2 + + assert.are.equal('{"blob":"\\u0061","x":2}', qjson.encode(t)) + end) + + it("falls back to normal escaping for changed string children", function() + local t = qjson.materialize(qjson.decode('{"blob":"\\u0061","x":1}'), { keep_origin = true }) + t.blob = "line1\nline2" + + assert.are.equal('{"blob":"line1\\nline2","x":1}', qjson.encode(t)) + end) + + it("reuses unchanged nested object and array siblings when parent is changed", function() + local src = '{"x":0,"obj":{"k":"\\u0061"},"arr":[1, 2 ,3]}' + local t = qjson.materialize(qjson.decode(src), { keep_origin = true }) + t.x = 9 + + local out = qjson.encode(t) + assert.is_truthy(string.find(out, '"obj":{"k":"\\u0061"}', 1, true)) + assert.is_truthy(string.find(out, '"arr":[1, 2 ,3]', 1, true)) + assert.are.equal(9, cjson.decode(out).x) + end) + + it("does not reintroduce duplicate keys after materialization", function() + local t = qjson.materialize(qjson.decode('{"a":1,"a":2}'), { keep_origin = true }) + t.b = 3 + + local out = qjson.encode(t) + assert.are.equal('{"a":2,"b":3}', out) + local _, count = out:gsub('"a":', "") + assert.are.equal(1, count) + end) + + it("does not splice standalone numeric tokens in changed parents", function() + local t = qjson.materialize(qjson.decode('{"n":1.0,"e":1e3,"z":-0,"x":1}'), { keep_origin = true }) + t.x = 2 + + assert.are.equal('{"n":1,"e":1000,"z":0,"x":2}', qjson.encode(t)) + end) + + it("does not hide nested table mutations behind a parent raw slice", function() + local t = qjson.materialize(qjson.decode('{"a":{"x":1},"b":2}'), { keep_origin = true }) + t.a.x = 9 + + assert.are.equal('{"a":{"x":9},"b":2}', qjson.encode(t)) + end) + + it("still reports circular references after materialization", function() + local t = qjson.materialize(qjson.decode('{"a":1}'), { keep_origin = true }) + t.self = t + + assert.has_error(function() + qjson.encode(t) + end, "qjson.encode: circular reference") + end) + + it("still reports circular references through origin child tables", function() + local t = qjson.materialize(qjson.decode('{"a":{"x":1},"b":2}'), { keep_origin = true }) + t.a.self = t + + assert.has_error(function() + qjson.encode(t) + end, "qjson.encode: circular reference") + end) + + it("still reports max-depth errors for unchanged origin trees", function() + local parts = {} + for i = 1, 1001 do + parts[i] = '{"x":' + end + parts[#parts + 1] = '{}' + for _ = 1, 1001 do + parts[#parts + 1] = '}' + end + local t = qjson.materialize(qjson.decode(table.concat(parts)), { keep_origin = true }) + + assert.has_error(function() + qjson.encode(t) + end, "qjson.encode: max depth exceeded") + end) + + it("preserves lazy mutations made before keep_origin materialization", function() + local lazy = qjson.decode('{"a":{"x":1},"b":2}') + lazy.a.x = 9 + local t = qjson.materialize(lazy, { keep_origin = true }) + + assert.are.equal(9, t.a.x) + assert.are.equal(2, t.b) + end) + + it("preserves lazy array child mutations made before keep_origin materialization", function() + local lazy = qjson.decode('[{"x":1},{"y":2}]') + lazy[1].x = 9 + local t = qjson.materialize(lazy, { keep_origin = true }) + + assert.are.equal(9, t[1].x) + assert.are.equal(2, t[2].y) + end) + + it("keeps source bytes alive for provenance-backed reuse", function() + local function materialized() + local src = '{"blob":"\\u0061","x":1}' + return qjson.materialize(qjson.decode(src), { keep_origin = true }) + end + local t = materialized() + collectgarbage("collect") + t.x = 2 + + assert.are.equal('{"blob":"\\u0061","x":2}', qjson.encode(t)) + end) +end)