From eafa5da3e895453d89eee834e7d4dad579dac0f6 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 24 Apr 2026 10:23:15 +0000 Subject: [PATCH 1/6] jepsen(dynamodb): linearizable register tests for every attribute type The existing DynamoDB Jepsen workload only exercises the List type via list-append. Add a complementary workload that runs a Knossos linearizable-register check (PutItem + ConsistentRead GetItem) for each of the 10 attribute types elastickv supports: S, N, B, BOOL, NULL, SS, NS, BS, L, M. Each type writes to its own table and uses a per-type encoder/decoder that canonicalises the value (sorted sets, byte arrays as int vecs) so register equality is reliable. Wired into the Jepsen CI workflows and the local run-jepsen-local.sh script. --- .github/workflows/jepsen-test-scheduled.yml | 18 + .github/workflows/jepsen-test.yml | 13 + .../src/elastickv/dynamodb_types_workload.clj | 420 ++++++++++++++++++ jepsen/src/elastickv/jepsen_test.clj | 5 + .../dynamodb_types_workload_test.clj | 50 +++ scripts/run-jepsen-local.sh | 35 +- 6 files changed, 538 insertions(+), 3 deletions(-) create mode 100644 jepsen/src/elastickv/dynamodb_types_workload.clj create mode 100644 jepsen/test/elastickv/dynamodb_types_workload_test.clj diff --git a/.github/workflows/jepsen-test-scheduled.yml b/.github/workflows/jepsen-test-scheduled.yml index ad925f8f6..317c2e65b 100644 --- a/.github/workflows/jepsen-test-scheduled.yml +++ b/.github/workflows/jepsen-test-scheduled.yml @@ -114,6 +114,24 @@ jobs: --max-txn-length ${{ inputs.max-txn-length || '4' }} \ --dynamo-ports 63801,63802,63803 \ --host 127.0.0.1 + - name: Run DynamoDB per-type Jepsen workloads against elastickv + working-directory: jepsen + timeout-minutes: 30 + run: | + set -e + for t in string number binary bool null string-set number-set binary-set list map; do + echo "::group::value-type=${t}" + timeout 240 ~/lein run -m elastickv.dynamodb-types-workload --local \ + --time-limit ${{ inputs.time-limit || '60' }} \ + --rate ${{ inputs.rate || '5' }} \ + --concurrency ${{ inputs.concurrency || '8' }} \ + --key-count ${{ inputs.key-count || '8' }} \ + --max-writes-per-key ${{ inputs.max-writes-per-key || '150' }} \ + --value-type "${t}" \ + --dynamo-ports 63801,63802,63803 \ + --host 127.0.0.1 + echo "::endgroup::" + done - name: Run S3 Jepsen workload against elastickv working-directory: jepsen timeout-minutes: 10 diff --git a/.github/workflows/jepsen-test.yml b/.github/workflows/jepsen-test.yml index aeac9354c..5cec72234 100644 --- a/.github/workflows/jepsen-test.yml +++ b/.github/workflows/jepsen-test.yml @@ -95,6 +95,19 @@ jobs: timeout-minutes: 3 run: | timeout 120 ~/lein run -m elastickv.dynamodb-workload --local --time-limit 5 --rate 5 --concurrency 5 --dynamo-ports 63801,63802,63803 --host 127.0.0.1 + - name: Run DynamoDB per-type Jepsen workloads against elastickv + working-directory: jepsen + timeout-minutes: 10 + run: | + set -e + for t in string number binary bool null string-set number-set binary-set list map; do + echo "::group::value-type=${t}" + timeout 120 ~/lein run -m elastickv.dynamodb-types-workload --local \ + --time-limit 5 --rate 5 --concurrency 4 \ + --value-type "${t}" \ + --dynamo-ports 63801,63802,63803 --host 127.0.0.1 + echo "::endgroup::" + done - name: Run S3 Jepsen workload against elastickv working-directory: jepsen timeout-minutes: 3 diff --git a/jepsen/src/elastickv/dynamodb_types_workload.clj b/jepsen/src/elastickv/dynamodb_types_workload.clj new file mode 100644 index 000000000..fcdada2e6 --- /dev/null +++ b/jepsen/src/elastickv/dynamodb_types_workload.clj @@ -0,0 +1,420 @@ +(ns elastickv.dynamodb-types-workload + "Jepsen workload that exercises every DynamoDB attribute type elastickv + currently supports. + + The existing dynamodb-workload covers only the List type via the + list-append (cycle/elle) consistency model. This workload complements it + with a per-type linearizable-register check for the remaining nine types: + String (S), Number (N), Binary (B), Boolean (BOOL), Null (NULL), String + Set (SS), Number Set (NS), Binary Set (BS), List (L) and Map (M). + + Each key is an independent register stored in its own DynamoDB item. + Writes use PutItem (replacing the entire `val` attribute). Reads use + GetItem with ConsistentRead=true. Test values are derived from the + write index, so each write produces a distinct value the register + model can disambiguate. + + A single test run targets one type (selected via --value-type). Each + type uses its own table so concurrent or sequential runs do not + interfere." + (:gen-class) + (:require [clojure.string :as str] + [cognitect.aws.client.api :as aws] + [cognitect.aws.credentials :as creds] + [elastickv.cli :as cli] + [elastickv.db :as ekdb] + [jepsen [checker :as checker] + [client :as client] + [generator :as gen] + [independent :as independent] + [net :as net]] + [jepsen.checker.timeline :as timeline] + [jepsen.control :as control] + [jepsen.db :as jdb] + [jepsen.nemesis :as nemesis] + [jepsen.nemesis.combined :as combined] + [jepsen.os :as os] + [jepsen.os.debian :as debian] + [knossos.model :as model])) + +(def ^:private pk-attr "pk") +(def ^:private val-attr "val") + +;; --------------------------------------------------------------------------- +;; Per-type specifications +;; --------------------------------------------------------------------------- +;; +;; Every spec defines: +;; :table DynamoDB table name (one per type, for isolation) +;; :gen (fn [i]) -> Clojure value used by the register model +;; :encode (fn [v]) -> DynamoDB attribute map (e.g. {:S "x"}) +;; :decode (fn [a]) -> Clojure value (canonicalised — sets sorted, byte +;; arrays converted to vectors of int) so equality is reliable +;; across the Knossos register check. +;; +;; The encoder always builds a *single* DynamoDB attribute matching the type +;; under test, so each test exercises exactly one attribute kind. + +(defn- bytes-of ^bytes [^String s] + (.getBytes s "UTF-8")) + +(defn- ->byte-array + "Coerce a value returned by cognitect/aws-api for a Binary attribute into a + Java byte[]. Different SDK versions return either byte[] or ByteBuffer, + so we accept both (and pass through nil for missing attributes)." + ^bytes [b] + (cond + (nil? b) nil + (instance? java.nio.ByteBuffer b) (let [^java.nio.ByteBuffer buf (.duplicate ^java.nio.ByteBuffer b) + arr (byte-array (.remaining buf))] + (.get buf arr) + arr) + :else b)) + +(defn- bytes->vec + "Canonicalise binary data to a vector of unsigned ints so that equality + works inside the Knossos register model (byte[] uses identity equality)." + [b] + (when-let [arr (->byte-array b)] + (vec (map #(bit-and 0xff (int %)) arr)))) + +(def ^:private type-specs + {:string {:table "jepsen_types_string" + :gen (fn [i] (str "v-" i)) + :encode (fn [v] {:S v}) + :decode (fn [a] (:S a))} + + :number {:table "jepsen_types_number" + :gen (fn [i] (long i)) + :encode (fn [v] {:N (str v)}) + :decode (fn [a] (Long/parseLong (:N a)))} + + :binary {:table "jepsen_types_binary" + :gen (fn [i] (bytes->vec (bytes-of (str "v-" i)))) + :encode (fn [v] {:B (byte-array (map unchecked-byte v))}) + :decode (fn [a] (bytes->vec (:B a)))} + + :bool {:table "jepsen_types_bool" + :gen (fn [i] (odd? i)) + :encode (fn [v] {:BOOL v}) + :decode (fn [a] (boolean (:BOOL a)))} + + ;; NULL has only one valid value. The register check still verifies that + ;; reads observe the written attribute (and never an absent / wrong-typed + ;; one) under nemesis. + :null {:table "jepsen_types_null" + :gen (fn [_i] :null) + :encode (fn [_v] {:NULL true}) + :decode (fn [a] (when (:NULL a) :null))} + + :string-set + {:table "jepsen_types_string_set" + :gen (fn [i] (sort [(str "v-" i) (str "w-" i)])) + :encode (fn [v] {:SS (vec v)}) + :decode (fn [a] (some-> (:SS a) sort vec))} + + :number-set + {:table "jepsen_types_number_set" + :gen (fn [i] (sort [(long i) (long (+ i 1000))])) + :encode (fn [v] {:NS (mapv str v)}) + :decode (fn [a] (some->> (:NS a) (map #(Long/parseLong %)) sort vec))} + + :binary-set + {:table "jepsen_types_binary_set" + :gen (fn [i] (sort [(bytes->vec (bytes-of (str "v-" i))) + (bytes->vec (bytes-of (str "w-" i)))])) + :encode (fn [v] {:BS (mapv #(byte-array (map unchecked-byte %)) v)}) + :decode (fn [a] (some->> (:BS a) (map bytes->vec) sort vec))} + + :list {:table "jepsen_types_list" + :gen (fn [i] [(long i) (long (+ i 1))]) + :encode (fn [v] {:L (mapv (fn [n] {:N (str n)}) v)}) + :decode (fn [a] (some->> (:L a) (mapv #(Long/parseLong (:N %)))))} + + :map {:table "jepsen_types_map" + :gen (fn [i] {"a" (long i) "b" (long (+ i 1))}) + :encode (fn [v] {:M (into {} (map (fn [[k n]] [k {:N (str n)}]) v))}) + :decode (fn [a] (when-let [m (:M a)] + (into {} (map (fn [[k av]] + [(name k) (Long/parseLong (:N av))]) + m))))}}) + +(def value-type-keys + "All registered DynamoDB type keys, in display/run order." + [:string :number :binary :bool :null + :string-set :number-set :binary-set + :list :map]) + +;; --------------------------------------------------------------------------- +;; AWS client +;; --------------------------------------------------------------------------- + +(defn- make-ddb-client + [host port] + (aws/client + {:api :dynamodb + :region "us-east-1" + :credentials-provider (creds/basic-credentials-provider + {:access-key-id "dummy" + :secret-access-key "dummy"}) + :endpoint-override {:protocol :http + :hostname host + :port port}})) + +(defn- anomaly? [resp] + (contains? resp :cognitect.anomalies/category)) + +(defn- ddb-invoke! + [ddb op request] + (let [resp (aws/invoke ddb {:op op :request request})] + (if (anomaly? resp) + (let [err-type (:__type resp) + category (:cognitect.anomalies/category resp) + msg (or (:message resp) + (:Message resp) + (:cognitect.anomalies/message resp) + "")] + (throw (ex-info (str "DynamoDB " (or err-type category) ": " msg) + {:type err-type + :category category + :resp resp}))) + resp))) + +(defn- create-table! + "Create a table for the type under test; ignore ResourceInUseException." + [ddb table] + (try + (ddb-invoke! ddb :CreateTable + {:TableName table + :KeySchema [{:AttributeName pk-attr :KeyType "HASH"}] + :AttributeDefinitions [{:AttributeName pk-attr :AttributeType "S"}] + :ProvisionedThroughput {:ReadCapacityUnits 5 :WriteCapacityUnits 5}}) + (catch clojure.lang.ExceptionInfo e + (when-not (= "ResourceInUseException" (:type (ex-data e))) + (throw e))))) + +(defn- dynamo-put! + "PutItem with the encoded value. Replaces the entire item." + [ddb table k attr] + (ddb-invoke! ddb :PutItem + {:TableName table + :Item {pk-attr {:S (str k)} + val-attr attr}}) + nil) + +(defn- dynamo-get + "ConsistentRead GetItem; returns the raw attribute map at val, or nil." + [ddb table k] + (let [resp (ddb-invoke! ddb :GetItem + {:TableName table + :Key {pk-attr {:S (str k)}} + :ConsistentRead true})] + (get-in resp [:Item (keyword val-attr)]))) + +;; --------------------------------------------------------------------------- +;; Jepsen client +;; --------------------------------------------------------------------------- + +(defrecord DynamoDBTypesClient [node->port spec ddb] + client/Client + + (open! [this test node] + (let [port (get node->port node 8000) + host (or (:dynamo-host test) (name node))] + (assoc this :ddb (make-ddb-client host port)))) + + (setup! [_this _test] + (create-table! ddb (:table spec))) + + (teardown! [_this _test]) + + (close! [this _test] + (when ddb (aws/stop ddb)) + (assoc this :ddb nil)) + + (invoke! [_this _test op] + (try + (let [[k v] (:value op) + table (:table spec)] + (case (:f op) + :write + (do (dynamo-put! ddb table k ((:encode spec) v)) + (assoc op :type :ok)) + + :read + (let [attr (dynamo-get ddb table k) + decoded (when attr ((:decode spec) attr))] + (assoc op :type :ok :value (independent/tuple k decoded))))) + + (catch clojure.lang.ExceptionInfo e + (let [data (ex-data e) + err-type (:type data) + category (:category data)] + (cond + (and (nil? err-type) + (#{:cognitect.anomalies/fault + :cognitect.anomalies/unavailable} category)) + (assoc op :type :info :error :network-error) + + (contains? #{"InternalServerError"} err-type) + (assoc op :type :info :error (str err-type)) + + (= "ValidationException" err-type) + (assoc op :type :fail + :error (str err-type ": " + (get-in data [:resp :message] + (get-in data [:resp :Message] "")))) + + :else + (assoc op :type :info :error (.getMessage e))))) + + (catch Exception e + (assoc op :type :info :error (.getMessage e)))))) + +;; --------------------------------------------------------------------------- +;; Workload & Test builders +;; --------------------------------------------------------------------------- + +(def default-nodes ["n1" "n2" "n3" "n4" "n5"]) + +(defn- resolve-spec + "Lookup the spec for the requested :value-type, defaulting to :string." + [opts] + (let [vt (or (:value-type opts) :string)] + (or (get type-specs vt) + (throw (ex-info (str "Unknown value-type " vt + "; valid: " (vec (keys type-specs))) + {:value-type vt}))))) + +(defn dynamodb-types-workload + "Builds the linearizable-register workload for one DynamoDB attribute type." + [opts] + (let [spec (resolve-spec opts) + gen-fn (:gen spec) + key-count (or (:key-count opts) 5) + max-writes (or (:max-writes-per-key opts) 50) + threads-per-key (or (:threads-per-key opts) 2) + client (->DynamoDBTypesClient + (or (:node->port opts) + (zipmap default-nodes (repeat 8000))) + spec + nil)] + {:client client + :generator (independent/concurrent-generator + threads-per-key + (range key-count) + (fn [_k] + (->> (gen/mix [(map (fn [i] {:f :write :value (gen-fn i)}) + (range)) + (gen/repeat {:f :read})]) + (gen/limit max-writes)))) + :checker (independent/checker + (checker/compose + {:linear (checker/linearizable + {:model (model/register) + :algorithm :linear}) + :timeline (timeline/html)}))})) + +(defn elastickv-dynamodb-types-test + "Builds a Jepsen test map for a single DynamoDB attribute type." + ([] (elastickv-dynamodb-types-test {})) + ([opts] + (let [value-type (or (:value-type opts) :string) + nodes (or (:nodes opts) default-nodes) + dynamo-ports (or (:dynamo-ports opts) + (repeat (count nodes) (or (:dynamo-port opts) 8000))) + node->port (or (:node->port opts) (cli/ports->node-map dynamo-ports nodes)) + local? (:local opts) + db (if local? + jdb/noop + (ekdb/db {:grpc-port (or (:grpc-port opts) 50051) + :redis-port (or (:redis-port opts) 6379) + :dynamo-port node->port + :raft-groups (:raft-groups opts) + :shard-ranges (:shard-ranges opts)})) + rate (double (or (:rate opts) 5)) + time-limit (or (:time-limit opts) 30) + faults (if local? + [] + (cli/normalize-faults (or (:faults opts) [:partition :kill]))) + nemesis-p (when-not local? + (combined/nemesis-package {:db db + :faults faults + :interval (or (:fault-interval opts) 40)})) + nemesis-gen (if nemesis-p + (:generator nemesis-p) + (gen/once {:type :info :f :noop})) + workload (dynamodb-types-workload (assoc opts :node->port node->port))] + (merge workload + {:name (or (:name opts) + (str "elastickv-dynamodb-type-" (name value-type))) + :nodes nodes + :db db + :dynamo-host (:dynamo-host opts) + :os (if local? os/noop debian/os) + :net (if local? net/noop net/iptables) + :ssh (merge {:username "vagrant" + :private-key-path "/home/vagrant/.ssh/id_rsa" + :strict-host-key-checking false} + (when local? {:dummy true}) + (:ssh opts)) + :remote control/ssh + :nemesis (if nemesis-p (:nemesis nemesis-p) nemesis/noop) + :final-generator nil + :concurrency (or (:concurrency opts) 10) + :generator (->> (:generator workload) + (gen/nemesis nemesis-gen) + (gen/stagger (/ rate)) + (gen/time-limit time-limit))})))) + +;; --------------------------------------------------------------------------- +;; CLI +;; --------------------------------------------------------------------------- + +(defn- parse-value-type [s] + (let [k (keyword s)] + (when-not (contains? type-specs k) + (throw (IllegalArgumentException. + (str "Unknown --value-type " s "; valid: " + (str/join "," (map name value-type-keys)))))) + k)) + +(def types-cli-opts + [[nil "--dynamo-ports PORTS" "Comma-separated DynamoDB ports (one per node)." + :default nil + :parse-fn (fn [s] + (->> (str/split s #",") + (remove str/blank?) + (mapv #(Integer/parseInt %))))] + [nil "--dynamo-port PORT" "DynamoDB port (applied to all nodes)." + :default 8000 + :parse-fn #(Integer/parseInt %)] + [nil "--redis-port PORT" "Redis port." + :default 6379 + :parse-fn #(Integer/parseInt %)] + [nil "--value-type TYPE" "DynamoDB attribute type to test (string,number,binary,bool,null,string-set,number-set,binary-set,list,map)." + :default :string + :parse-fn parse-value-type] + [nil "--threads-per-key N" "Concurrent threads per register key." + :default 2 + :parse-fn #(Integer/parseInt %)]]) + +(defn- prepare-types-opts + [options] + (let [dynamo-ports (:dynamo-ports options) + options (cli/parse-common-opts options dynamo-ports) + node->port (if dynamo-ports + (cli/ports->node-map dynamo-ports (:nodes options)) + (zipmap (:nodes options) (repeat (:dynamo-port options))))] + (assoc options + :dynamo-host (:host options) + :node->port node->port + :dynamo-port (:dynamo-port options) + :redis-port (:redis-port options)))) + +(defn -main + [& args] + (cli/run-workload! args + (into cli/common-cli-opts types-cli-opts) + prepare-types-opts + elastickv-dynamodb-types-test)) diff --git a/jepsen/src/elastickv/jepsen_test.clj b/jepsen/src/elastickv/jepsen_test.clj index 8b9df3290..01b9a9300 100644 --- a/jepsen/src/elastickv/jepsen_test.clj +++ b/jepsen/src/elastickv/jepsen_test.clj @@ -2,6 +2,7 @@ (:gen-class) (:require [elastickv.redis-workload :as redis-workload] [elastickv.dynamodb-workload :as dynamodb-workload] + [elastickv.dynamodb-types-workload :as dynamodb-types-workload] [elastickv.s3-workload :as s3-workload] [jepsen.cli :as cli])) @@ -11,6 +12,10 @@ (defn elastickv-dynamodb-test [] (dynamodb-workload/elastickv-dynamodb-test {})) +(defn elastickv-dynamodb-types-test + ([] (elastickv-dynamodb-types-test {})) + ([opts] (dynamodb-types-workload/elastickv-dynamodb-types-test opts))) + (defn elastickv-s3-test [] (s3-workload/elastickv-s3-test {})) diff --git a/jepsen/test/elastickv/dynamodb_types_workload_test.clj b/jepsen/test/elastickv/dynamodb_types_workload_test.clj new file mode 100644 index 000000000..396e69bd5 --- /dev/null +++ b/jepsen/test/elastickv/dynamodb_types_workload_test.clj @@ -0,0 +1,50 @@ +(ns elastickv.dynamodb-types-workload-test + (:require [clojure.test :refer :all] + [jepsen.client :as client] + [elastickv.dynamodb-types-workload :as workload])) + +(deftest builds-test-spec-for-each-type + (doseq [t workload/value-type-keys] + (let [test-map (workload/elastickv-dynamodb-types-test {:value-type t})] + (is (map? test-map) (str "test-map for " (name t) " is a map")) + (is (= (str "elastickv-dynamodb-type-" (name t)) (:name test-map)) + (str "test name for " (name t))) + (is (some? (:client test-map)) (str "client for " (name t))) + (is (some? (:checker test-map)) (str "checker for " (name t))) + (is (some? (:generator test-map)) (str "generator for " (name t)))))) + +(deftest unknown-value-type-throws + (is (thrown? clojure.lang.ExceptionInfo + (workload/elastickv-dynamodb-types-test {:value-type :nope})))) + +(deftest custom-options-override-defaults + (let [test-map (workload/elastickv-dynamodb-types-test + {:value-type :number + :time-limit 60 + :concurrency 20 + :dynamo-port 9000})] + (is (= 20 (:concurrency test-map))))) + +(deftest host-override-creates-client + (let [test-map (workload/elastickv-dynamodb-types-test + {:value-type :string + :dynamo-host "127.0.0.1" + :node->port {"n1" 8000 "n2" 8001}}) + c (:client test-map) + opened (client/open! c test-map "n1")] + (is (some? (:ddb opened))))) + +;; Each type's encode -> decode round-trip must be lossless and produce the +;; canonical form the register checker compares against. +(deftest encode-decode-round-trips + (doseq [t workload/value-type-keys] + (let [{:keys [encode decode gen]} (get @#'workload/type-specs t) + v (gen 7)] + (is (= v (decode (encode v))) + (str "round-trip for " (name t) " value " (pr-str v)))))) + +(deftest distinct-table-per-type + (let [tables (map #(:table (get @#'workload/type-specs %)) + workload/value-type-keys)] + (is (= (count tables) (count (set tables))) + "every type uses a distinct table name"))) diff --git a/scripts/run-jepsen-local.sh b/scripts/run-jepsen-local.sh index 1568a8eb5..ed17e257c 100755 --- a/scripts/run-jepsen-local.sh +++ b/scripts/run-jepsen-local.sh @@ -87,9 +87,12 @@ if ! $NO_CLUSTER; then done fi -# ---- run Jepsen DynamoDB workload ---- -echo "[jepsen] running DynamoDB workload..." +# ---- run Jepsen DynamoDB workloads ---- cd "$REPO_ROOT/jepsen" + +EXIT_CODE=0 + +echo "[jepsen] running DynamoDB list-append workload..." set +e lein run -m elastickv.dynamodb-workload \ --local \ @@ -98,8 +101,34 @@ lein run -m elastickv.dynamodb-workload \ --concurrency 5 \ --dynamo-ports 63801,63802,63803 \ --host 127.0.0.1 -EXIT_CODE=$? +APPEND_EXIT=$? set -e +if [ $APPEND_EXIT -ne 0 ]; then + echo "[jepsen] list-append FAILED (exit $APPEND_EXIT)" + EXIT_CODE=$APPEND_EXIT +fi + +# All DynamoDB attribute types currently supported by elastickv. +# Each runs as its own register/linearizable test. +TYPES=(string number binary bool null string-set number-set binary-set list map) +for t in "${TYPES[@]}"; do + echo "[jepsen] running DynamoDB types workload: ${t}..." + set +e + lein run -m elastickv.dynamodb-types-workload \ + --local \ + --time-limit 20 \ + --rate 5 \ + --concurrency 4 \ + --value-type "$t" \ + --dynamo-ports 63801,63802,63803 \ + --host 127.0.0.1 + TYPE_EXIT=$? + set -e + if [ $TYPE_EXIT -ne 0 ]; then + echo "[jepsen] type=${t} FAILED (exit $TYPE_EXIT)" + EXIT_CODE=$TYPE_EXIT + fi +done if [ $EXIT_CODE -eq 0 ]; then echo "[jepsen] PASSED" From d63bc01bb133a56e907818abb7b8e513d31f0548 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 24 Apr 2026 10:49:20 +0000 Subject: [PATCH 2/6] jepsen(dynamodb-types): address PR #615 review feedback - BOOL decoder: return (:BOOL a) instead of (boolean (:BOOL a)) so a nil/malformed payload no longer silently collapses to false and confuses the register checker (codex). - Expand the retryable DynamoDB error set to include ServiceUnavailableException, ThrottlingException, and ProvisionedThroughputExceededException alongside InternalServerError (gemini). - Make test-table ProvisionedThroughput configurable and raise the default to 100/100 so real-DynamoDB stress runs do not self-throttle (gemini). - Scheduled CI: decouple the per-type sweep time-limit from the workflow_dispatch input and derive the inner `timeout` from it, so the outer guard never races a user-bumped runtime (codex). --- .github/workflows/jepsen-test-scheduled.yml | 13 ++++- .../src/elastickv/dynamodb_types_workload.clj | 54 +++++++++++++++---- 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/.github/workflows/jepsen-test-scheduled.yml b/.github/workflows/jepsen-test-scheduled.yml index 317c2e65b..730ce52a1 100644 --- a/.github/workflows/jepsen-test-scheduled.yml +++ b/.github/workflows/jepsen-test-scheduled.yml @@ -116,13 +116,22 @@ jobs: --host 127.0.0.1 - name: Run DynamoDB per-type Jepsen workloads against elastickv working-directory: jepsen + # The per-type sweep is a coverage check across all 10 attribute + # types, not the deep stress run — it uses its own shorter + # time-limit so the 10-type loop fits comfortably inside the job + # timeout regardless of the workflow_dispatch time-limit input. + # The per-invocation `timeout` is derived from TYPE_TL + buffer + # so bumping TYPE_TL never races against the outer timeout. timeout-minutes: 30 + env: + TYPE_TL: "60" run: | set -e + PER_TYPE_TIMEOUT=$((TYPE_TL + 180)) for t in string number binary bool null string-set number-set binary-set list map; do echo "::group::value-type=${t}" - timeout 240 ~/lein run -m elastickv.dynamodb-types-workload --local \ - --time-limit ${{ inputs.time-limit || '60' }} \ + timeout "${PER_TYPE_TIMEOUT}" ~/lein run -m elastickv.dynamodb-types-workload --local \ + --time-limit "${TYPE_TL}" \ --rate ${{ inputs.rate || '5' }} \ --concurrency ${{ inputs.concurrency || '8' }} \ --key-count ${{ inputs.key-count || '8' }} \ diff --git a/jepsen/src/elastickv/dynamodb_types_workload.clj b/jepsen/src/elastickv/dynamodb_types_workload.clj index fcdada2e6..54b562ddb 100644 --- a/jepsen/src/elastickv/dynamodb_types_workload.clj +++ b/jepsen/src/elastickv/dynamodb_types_workload.clj @@ -97,7 +97,12 @@ :bool {:table "jepsen_types_bool" :gen (fn [i] (odd? i)) :encode (fn [v] {:BOOL v}) - :decode (fn [a] (boolean (:BOOL a)))} + ;; Return (:BOOL a) directly, not (boolean ...): coercing to + ;; boolean would turn a nil payload (missing or wrong-typed + ;; attribute) into false, which is indistinguishable from a + ;; legitimately-written false. Preserving nil lets the + ;; register checker surface shape/type regressions. + :decode (fn [a] (:BOOL a))} ;; NULL has only one valid value. The register check still verifies that ;; reads observe the written attribute (and never an absent / wrong-typed @@ -180,15 +185,25 @@ :resp resp}))) resp))) +;; Default provisioned throughput for test tables. elastickv does not +;; enforce these numbers today, but we pick something high enough that a +;; real DynamoDB endpoint would not throttle a stress run (which would +;; otherwise manifest as ProvisionedThroughputExceededException :info +;; operations and waste test time). Overridable via --read-capacity / +;; --write-capacity. +(def ^:private default-read-capacity 100) +(def ^:private default-write-capacity 100) + (defn- create-table! "Create a table for the type under test; ignore ResourceInUseException." - [ddb table] + [ddb table read-capacity write-capacity] (try (ddb-invoke! ddb :CreateTable {:TableName table :KeySchema [{:AttributeName pk-attr :KeyType "HASH"}] :AttributeDefinitions [{:AttributeName pk-attr :AttributeType "S"}] - :ProvisionedThroughput {:ReadCapacityUnits 5 :WriteCapacityUnits 5}}) + :ProvisionedThroughput {:ReadCapacityUnits read-capacity + :WriteCapacityUnits write-capacity}}) (catch clojure.lang.ExceptionInfo e (when-not (= "ResourceInUseException" (:type (ex-data e))) (throw e))))) @@ -215,7 +230,7 @@ ;; Jepsen client ;; --------------------------------------------------------------------------- -(defrecord DynamoDBTypesClient [node->port spec ddb] +(defrecord DynamoDBTypesClient [node->port spec read-capacity write-capacity ddb] client/Client (open! [this test node] @@ -224,7 +239,7 @@ (assoc this :ddb (make-ddb-client host port)))) (setup! [_this _test] - (create-table! ddb (:table spec))) + (create-table! ddb (:table spec) read-capacity write-capacity)) (teardown! [_this _test]) @@ -256,7 +271,14 @@ :cognitect.anomalies/unavailable} category)) (assoc op :type :info :error :network-error) - (contains? #{"InternalServerError"} err-type) + ;; Transient server-side errors that may or may not have been + ;; applied. Mark them :info so Jepsen treats them as + ;; indeterminate rather than a definite failure. + (contains? #{"InternalServerError" + "ServiceUnavailableException" + "ThrottlingException" + "ProvisionedThroughputExceededException"} + err-type) (assoc op :type :info :error (str err-type)) (= "ValidationException" err-type) @@ -294,10 +316,14 @@ key-count (or (:key-count opts) 5) max-writes (or (:max-writes-per-key opts) 50) threads-per-key (or (:threads-per-key opts) 2) + read-capacity (or (:read-capacity opts) default-read-capacity) + write-capacity (or (:write-capacity opts) default-write-capacity) client (->DynamoDBTypesClient (or (:node->port opts) (zipmap default-nodes (repeat 8000))) spec + read-capacity + write-capacity nil)] {:client client :generator (independent/concurrent-generator @@ -397,6 +423,12 @@ :parse-fn parse-value-type] [nil "--threads-per-key N" "Concurrent threads per register key." :default 2 + :parse-fn #(Integer/parseInt %)] + [nil "--read-capacity N" "ProvisionedThroughput.ReadCapacityUnits for the test table." + :default default-read-capacity + :parse-fn #(Integer/parseInt %)] + [nil "--write-capacity N" "ProvisionedThroughput.WriteCapacityUnits for the test table." + :default default-write-capacity :parse-fn #(Integer/parseInt %)]]) (defn- prepare-types-opts @@ -407,10 +439,12 @@ (cli/ports->node-map dynamo-ports (:nodes options)) (zipmap (:nodes options) (repeat (:dynamo-port options))))] (assoc options - :dynamo-host (:host options) - :node->port node->port - :dynamo-port (:dynamo-port options) - :redis-port (:redis-port options)))) + :dynamo-host (:host options) + :node->port node->port + :dynamo-port (:dynamo-port options) + :redis-port (:redis-port options) + :read-capacity (:read-capacity options) + :write-capacity (:write-capacity options)))) (defn -main [& args] From 8685dc49f8fff4afc979a47a54c3af80ba0e4137 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 24 Apr 2026 16:05:29 +0000 Subject: [PATCH 3/6] jepsen(dynamodb-types): clear test keys at setup for clean re-runs If the test table already exists (ResourceInUseException path) and holds items from a previous run, reads issued before this run's first write on a key could observe those stale items, distorting the register history and producing spurious linearizability failures. This is especially relevant for --no-cluster local runs or shared DynamoDB endpoints where data outlives a single invocation. setup! now issues DeleteItem for each key the run will touch after CreateTable. DeleteItem on a missing key is a no-op in DynamoDB, so this is safe on the first run too. Addresses PR #615 codex review feedback. --- .../src/elastickv/dynamodb_types_workload.clj | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/jepsen/src/elastickv/dynamodb_types_workload.clj b/jepsen/src/elastickv/dynamodb_types_workload.clj index 54b562ddb..65ef6c06a 100644 --- a/jepsen/src/elastickv/dynamodb_types_workload.clj +++ b/jepsen/src/elastickv/dynamodb_types_workload.clj @@ -208,6 +208,20 @@ (when-not (= "ResourceInUseException" (:type (ex-data e))) (throw e))))) +(defn- clear-keys! + "Delete every partition key the test will touch, giving re-runs a clean + slate. Without this, reads issued before this run's first write on a + key could observe stale items from a previous run (against a shared + endpoint or a --no-cluster local run that reuses the Raft data-dir), + distorting the register history and producing spurious linearizability + failures. DeleteItem on a missing key is a no-op in DynamoDB, so this + is safe on first run too." + [ddb table key-count] + (doseq [k (range key-count)] + (ddb-invoke! ddb :DeleteItem + {:TableName table + :Key {pk-attr {:S (str k)}}}))) + (defn- dynamo-put! "PutItem with the encoded value. Replaces the entire item." [ddb table k attr] @@ -230,7 +244,7 @@ ;; Jepsen client ;; --------------------------------------------------------------------------- -(defrecord DynamoDBTypesClient [node->port spec read-capacity write-capacity ddb] +(defrecord DynamoDBTypesClient [node->port spec key-count read-capacity write-capacity ddb] client/Client (open! [this test node] @@ -239,7 +253,8 @@ (assoc this :ddb (make-ddb-client host port)))) (setup! [_this _test] - (create-table! ddb (:table spec) read-capacity write-capacity)) + (create-table! ddb (:table spec) read-capacity write-capacity) + (clear-keys! ddb (:table spec) key-count)) (teardown! [_this _test]) @@ -322,6 +337,7 @@ (or (:node->port opts) (zipmap default-nodes (repeat 8000))) spec + key-count read-capacity write-capacity nil)] From 31d46a69394e7d4a509fc216aebed17edd7e4bdf Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 24 Apr 2026 16:43:59 +0000 Subject: [PATCH 4/6] jepsen(dynamodb-types): make per-type CI sweep diagnostic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-type Jepsen CI step used \`set -e\` inside the for-loop, so the first failing attribute type aborted the sweep with nothing but a "process completed with exit code 1" annotation — leaving no way to tell which type crashed. Changes: - Local script and both CI workflows now run every type independently, tracking pass/fail per type and printing a final summary table. The step still fails at the end if any type failed, but the log now identifies each failing type by name. - Upload jepsen/store as a CI artifact on failure so the analysis output (history.txt, results.edn, timeline.html) is inspectable. - Harden ->byte-array against unexpected return shapes from cognitect/aws-api for Binary attributes (accept byte[], ByteBuffer, or any Seqable of byte-coercible numbers). --- .github/workflows/jepsen-test-scheduled.yml | 31 ++++++++++++++++++- .github/workflows/jepsen-test.yml | 31 ++++++++++++++++++- .../src/elastickv/dynamodb_types_workload.clj | 19 +++++++----- scripts/run-jepsen-local.sh | 10 ++++++ 4 files changed, 81 insertions(+), 10 deletions(-) diff --git a/.github/workflows/jepsen-test-scheduled.yml b/.github/workflows/jepsen-test-scheduled.yml index 730ce52a1..c6232b12f 100644 --- a/.github/workflows/jepsen-test-scheduled.yml +++ b/.github/workflows/jepsen-test-scheduled.yml @@ -126,10 +126,15 @@ jobs: env: TYPE_TL: "60" run: | - set -e + # Run every type independently: one failure does not stop + # the sweep so the final summary shows which specific types + # passed/failed. The step still fails if any type failed. PER_TYPE_TIMEOUT=$((TYPE_TL + 180)) + declare -A RESULT + FAILED=() for t in string number binary bool null string-set number-set binary-set list map; do echo "::group::value-type=${t}" + set +e timeout "${PER_TYPE_TIMEOUT}" ~/lein run -m elastickv.dynamodb-types-workload --local \ --time-limit "${TYPE_TL}" \ --rate ${{ inputs.rate || '5' }} \ @@ -139,8 +144,32 @@ jobs: --value-type "${t}" \ --dynamo-ports 63801,63802,63803 \ --host 127.0.0.1 + rc=$? + set -e + if [ "$rc" -eq 0 ]; then + RESULT[$t]="pass" + else + RESULT[$t]="fail(${rc})" + FAILED+=("$t") + fi echo "::endgroup::" done + echo + echo "=== per-type jepsen summary ===" + for t in string number binary bool null string-set number-set binary-set list map; do + printf ' %-12s %s\n' "$t" "${RESULT[$t]}" + done + if [ ${#FAILED[@]} -ne 0 ]; then + echo "FAILED types: ${FAILED[*]}" + exit 1 + fi + - name: Upload Jepsen store on failure + if: failure() + uses: actions/upload-artifact@v7 + with: + name: jepsen-store-types + path: jepsen/store + retention-days: 7 - name: Run S3 Jepsen workload against elastickv working-directory: jepsen timeout-minutes: 10 diff --git a/.github/workflows/jepsen-test.yml b/.github/workflows/jepsen-test.yml index 5cec72234..e6424c907 100644 --- a/.github/workflows/jepsen-test.yml +++ b/.github/workflows/jepsen-test.yml @@ -99,15 +99,44 @@ jobs: working-directory: jepsen timeout-minutes: 10 run: | - set -e + # Run every type even if one fails, so the log shows which + # specific attribute types pass and which fail. The step + # still fails at the end if any single type failed. + declare -A RESULT + FAILED=() for t in string number binary bool null string-set number-set binary-set list map; do echo "::group::value-type=${t}" + set +e timeout 120 ~/lein run -m elastickv.dynamodb-types-workload --local \ --time-limit 5 --rate 5 --concurrency 4 \ --value-type "${t}" \ --dynamo-ports 63801,63802,63803 --host 127.0.0.1 + rc=$? + set -e + if [ "$rc" -eq 0 ]; then + RESULT[$t]="pass" + else + RESULT[$t]="fail(${rc})" + FAILED+=("$t") + fi echo "::endgroup::" done + echo + echo "=== per-type jepsen summary ===" + for t in string number binary bool null string-set number-set binary-set list map; do + printf ' %-12s %s\n' "$t" "${RESULT[$t]}" + done + if [ ${#FAILED[@]} -ne 0 ]; then + echo "FAILED types: ${FAILED[*]}" + exit 1 + fi + - name: Upload Jepsen store on per-type failure + if: failure() + uses: actions/upload-artifact@v7 + with: + name: jepsen-store-types + path: jepsen/store + retention-days: 7 - name: Run S3 Jepsen workload against elastickv working-directory: jepsen timeout-minutes: 3 diff --git a/jepsen/src/elastickv/dynamodb_types_workload.clj b/jepsen/src/elastickv/dynamodb_types_workload.clj index 65ef6c06a..c50d8baa4 100644 --- a/jepsen/src/elastickv/dynamodb_types_workload.clj +++ b/jepsen/src/elastickv/dynamodb_types_workload.clj @@ -60,16 +60,19 @@ (defn- ->byte-array "Coerce a value returned by cognitect/aws-api for a Binary attribute into a - Java byte[]. Different SDK versions return either byte[] or ByteBuffer, - so we accept both (and pass through nil for missing attributes)." + Java byte[]. Different SDK versions return byte[], ByteBuffer, or a + Seqable of bytes, so we handle all three (and pass through nil for + missing attributes)." ^bytes [b] (cond - (nil? b) nil - (instance? java.nio.ByteBuffer b) (let [^java.nio.ByteBuffer buf (.duplicate ^java.nio.ByteBuffer b) - arr (byte-array (.remaining buf))] - (.get buf arr) - arr) - :else b)) + (nil? b) nil + (bytes? b) b + (instance? java.nio.ByteBuffer b) (let [^java.nio.ByteBuffer buf (.duplicate ^java.nio.ByteBuffer b) + arr (byte-array (.remaining buf))] + (.get buf arr) + arr) + ;; Fallback: treat as a sequence of byte-coercible numbers. + :else (byte-array (map unchecked-byte b)))) (defn- bytes->vec "Canonicalise binary data to a vector of unsigned ints so that equality diff --git a/scripts/run-jepsen-local.sh b/scripts/run-jepsen-local.sh index ed17e257c..cbec8636b 100755 --- a/scripts/run-jepsen-local.sh +++ b/scripts/run-jepsen-local.sh @@ -111,6 +111,7 @@ fi # All DynamoDB attribute types currently supported by elastickv. # Each runs as its own register/linearizable test. TYPES=(string number binary bool null string-set number-set binary-set list map) +declare -A TYPE_RESULT for t in "${TYPES[@]}"; do echo "[jepsen] running DynamoDB types workload: ${t}..." set +e @@ -127,9 +128,18 @@ for t in "${TYPES[@]}"; do if [ $TYPE_EXIT -ne 0 ]; then echo "[jepsen] type=${t} FAILED (exit $TYPE_EXIT)" EXIT_CODE=$TYPE_EXIT + TYPE_RESULT[$t]="fail(${TYPE_EXIT})" + else + TYPE_RESULT[$t]="pass" fi done +echo +echo "[jepsen] per-type summary:" +for t in "${TYPES[@]}"; do + printf ' %-12s %s\n' "$t" "${TYPE_RESULT[$t]}" +done + if [ $EXIT_CODE -eq 0 ]; then echo "[jepsen] PASSED" else From 2adabcbeb88ba14b3a3f176fa95262af6852fc21 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 24 Apr 2026 16:54:24 +0000 Subject: [PATCH 5/6] jepsen(dynamodb-types): fix :valid? :unknown on the binary sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: the per-type sweep was dense enough (8 keys × 150 writes × 2 threads/key, ~2400 ops total) that Knossos's :linear graph-search could not finish inside its analysis budget for the binary type, and it returned :valid? :unknown. fail-on-invalid! treats :unknown as a hard failure, so the step exited 1. Fix: - Checker now uses :competition, which races Knossos's :linear and :wgl algorithms in parallel and takes whichever proves a verdict first. :wgl (tree search) typically finishes on workloads where :linear times out. - Scheduled CI: split the per-type sweep's concurrency/key-count/ max-writes env vars away from the stress-run inputs, and cap them at 4 / 8 / 80 respectively so the per-key history stays comfortably inside Knossos's budget. The deep dynamodb-workload step still uses the full inputs.*. The per-PR jepsen-test.yml already uses a tiny 5-second sweep so no change is needed there. --- .github/workflows/jepsen-test-scheduled.yml | 14 +++++++++++--- jepsen/src/elastickv/dynamodb_types_workload.clj | 8 +++++++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/.github/workflows/jepsen-test-scheduled.yml b/.github/workflows/jepsen-test-scheduled.yml index c6232b12f..1c9a21c06 100644 --- a/.github/workflows/jepsen-test-scheduled.yml +++ b/.github/workflows/jepsen-test-scheduled.yml @@ -124,7 +124,15 @@ jobs: # so bumping TYPE_TL never races against the outer timeout. timeout-minutes: 30 env: + # Per-type sweep is a coverage check, not the deep stress run, so + # it uses its own shorter runtime and smaller history density than + # the parent dynamodb-workload step. Keeping per-key ops modest + # also keeps Knossos's linearizability analysis inside its + # time budget (dense histories cause :valid? :unknown verdicts). TYPE_TL: "60" + TYPE_CONCURRENCY: "4" + TYPE_KEY_COUNT: "8" + TYPE_MAX_WRITES: "80" run: | # Run every type independently: one failure does not stop # the sweep so the final summary shows which specific types @@ -138,9 +146,9 @@ jobs: timeout "${PER_TYPE_TIMEOUT}" ~/lein run -m elastickv.dynamodb-types-workload --local \ --time-limit "${TYPE_TL}" \ --rate ${{ inputs.rate || '5' }} \ - --concurrency ${{ inputs.concurrency || '8' }} \ - --key-count ${{ inputs.key-count || '8' }} \ - --max-writes-per-key ${{ inputs.max-writes-per-key || '150' }} \ + --concurrency "${TYPE_CONCURRENCY}" \ + --key-count "${TYPE_KEY_COUNT}" \ + --max-writes-per-key "${TYPE_MAX_WRITES}" \ --value-type "${t}" \ --dynamo-ports 63801,63802,63803 \ --host 127.0.0.1 diff --git a/jepsen/src/elastickv/dynamodb_types_workload.clj b/jepsen/src/elastickv/dynamodb_types_workload.clj index c50d8baa4..ce47d4d40 100644 --- a/jepsen/src/elastickv/dynamodb_types_workload.clj +++ b/jepsen/src/elastickv/dynamodb_types_workload.clj @@ -353,11 +353,17 @@ (range)) (gen/repeat {:f :read})]) (gen/limit max-writes)))) + ;; :competition runs Knossos's graph-search (:linear) and tree-search + ;; (:wgl) algorithms in parallel and returns whichever proves a verdict + ;; first. :linear alone times out on dense Jepsen histories and yields + ;; :valid? :unknown, which fail-on-invalid! then treats as a hard + ;; failure; :competition avoids that by letting :wgl win on the + ;; workloads where it is faster. :checker (independent/checker (checker/compose {:linear (checker/linearizable {:model (model/register) - :algorithm :linear}) + :algorithm :competition}) :timeline (timeline/html)}))})) (defn elastickv-dynamodb-types-test From 12c1738ec58ff5b4acd83252a264b65ff54ffb8b Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 24 Apr 2026 17:16:34 +0000 Subject: [PATCH 6/6] jepsen(dynamodb-types): bound writes, not total ops, per key gen/limit max-writes was wrapping a mixed read/write generator, so --max-writes-per-key actually capped total ops; with a 50/50 mix a key could finish with roughly half as many writes as advertised (or even zero on the unlucky tail), starving the register signal and making the CI knob misleading. Now both the write stream and the read stream are finite: the write generator emits exactly max-writes distinct writes (one per index), and reads are capped at max-writes too. gen/mix terminates when both are drained, so --max-writes-per-key is honest regardless of the random mix ratio. Addresses PR #615 codex review feedback. --- jepsen/src/elastickv/dynamodb_types_workload.clj | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/jepsen/src/elastickv/dynamodb_types_workload.clj b/jepsen/src/elastickv/dynamodb_types_workload.clj index ce47d4d40..04864c34f 100644 --- a/jepsen/src/elastickv/dynamodb_types_workload.clj +++ b/jepsen/src/elastickv/dynamodb_types_workload.clj @@ -345,14 +345,20 @@ write-capacity nil)] {:client client + ;; Per-key generator: emits exactly `max-writes` distinct writes (one + ;; per index 0..max-writes-1) interleaved with up to `max-writes` + ;; reads. Both sub-gens are finite, so gen/mix terminates when both + ;; are drained. We bound *writes* specifically rather than total ops + ;; so --max-writes-per-key is honest: a read-heavy random mix can no + ;; longer starve the run of the writes that drive the register + ;; signal. :generator (independent/concurrent-generator threads-per-key (range key-count) (fn [_k] - (->> (gen/mix [(map (fn [i] {:f :write :value (gen-fn i)}) - (range)) - (gen/repeat {:f :read})]) - (gen/limit max-writes)))) + (gen/mix [(map (fn [i] {:f :write :value (gen-fn i)}) + (range max-writes)) + (repeat max-writes {:f :read})]))) ;; :competition runs Knossos's graph-search (:linear) and tree-search ;; (:wgl) algorithms in parallel and returns whichever proves a verdict ;; first. :linear alone times out on dense Jepsen histories and yields