From 8fc2dd3ad82491dbd6d21dea62546354149622a2 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sun, 22 Mar 2026 18:56:00 +0900 Subject: [PATCH 01/27] adapter: harden s3 routing and consistency --- adapter/dynamodb_migration_test.go | 59 +- adapter/s3.go | 1437 ++++++++++++++++++++++++++++ adapter/s3_auth.go | 248 +++++ adapter/s3_test.go | 564 +++++++++++ cmd/server/demo.go | 148 ++- cmd/server/demo_test.go | 34 + internal/s3keys/keys.go | 307 ++++++ internal/s3keys/keys_test.go | 82 ++ kv/coordinator.go | 14 +- kv/coordinator_txn_test.go | 29 +- kv/shard_key.go | 7 + kv/shard_key_test.go | 29 + kv/shard_store.go | 4 + kv/shard_store_test.go | 58 ++ kv/sharded_coordinator.go | 13 +- kv/sharded_coordinator_txn_test.go | 35 + kv/transcoder.go | 3 + 17 files changed, 3040 insertions(+), 31 deletions(-) create mode 100644 adapter/s3.go create mode 100644 adapter/s3_auth.go create mode 100644 adapter/s3_test.go create mode 100644 internal/s3keys/keys.go create mode 100644 internal/s3keys/keys_test.go create mode 100644 kv/shard_key_test.go diff --git a/adapter/dynamodb_migration_test.go b/adapter/dynamodb_migration_test.go index 7412ad1b..e2838ded 100644 --- a/adapter/dynamodb_migration_test.go +++ b/adapter/dynamodb_migration_test.go @@ -28,24 +28,55 @@ func (c *localAdapterCoordinator) Dispatch(ctx context.Context, req *kv.Operatio if req == nil { return &kv.CoordinateResponse{}, nil } - commitTS := c.Clock().Next() - if req.IsTxn && commitTS <= req.StartTS { - c.Clock().Observe(req.StartTS) + commitTS, err := c.commitTSForRequest(req) + if err != nil { + return nil, err + } + if err := c.applyElems(ctx, req.Elems, commitTS); err != nil { + return nil, err + } + return &kv.CoordinateResponse{}, nil +} + +func (c *localAdapterCoordinator) commitTSForRequest(req *kv.OperationGroup[kv.OP]) (uint64, error) { + if req == nil { + return 0, nil + } + commitTS := req.CommitTS + if commitTS == 0 { commitTS = c.Clock().Next() + if req.IsTxn && commitTS <= 
req.StartTS { + c.Clock().Observe(req.StartTS) + commitTS = c.Clock().Next() + } } - for _, elem := range req.Elems { - switch elem.Op { - case kv.Put: - if err := c.store.PutAt(ctx, elem.Key, elem.Value, commitTS, 0); err != nil { - return nil, err - } - case kv.Del: - if err := c.store.DeleteAt(ctx, elem.Key, commitTS); err != nil { - return nil, err - } + if req.IsTxn && commitTS <= req.StartTS { + return 0, kv.ErrTxnCommitTSRequired + } + return commitTS, nil +} + +func (c *localAdapterCoordinator) applyElems(ctx context.Context, elems []*kv.Elem[kv.OP], commitTS uint64) error { + for _, elem := range elems { + if err := c.applyElem(ctx, elem, commitTS); err != nil { + return err } } - return &kv.CoordinateResponse{}, nil + return nil +} + +func (c *localAdapterCoordinator) applyElem(ctx context.Context, elem *kv.Elem[kv.OP], commitTS uint64) error { + if elem == nil { + return nil + } + switch elem.Op { + case kv.Put: + return c.store.PutAt(ctx, elem.Key, elem.Value, commitTS, 0) + case kv.Del: + return c.store.DeleteAt(ctx, elem.Key, commitTS) + default: + return nil + } } func newLegacyMigrationTestServer( diff --git a/adapter/s3.go b/adapter/s3.go new file mode 100644 index 00000000..f3b31030 --- /dev/null +++ b/adapter/s3.go @@ -0,0 +1,1437 @@ +package adapter + +import ( + "context" + "crypto/md5" //nolint:gosec // S3 ETag compatibility requires MD5. 
+ "crypto/rand" + "crypto/sha256" + "encoding/base64" + "encoding/binary" + "encoding/hex" + "encoding/xml" + "fmt" + "io" + "math" + "net" + "net/http" + "net/http/httputil" + "net/url" + "strconv" + "strings" + "time" + + "github.com/bootjp/elastickv/internal/s3keys" + "github.com/bootjp/elastickv/kv" + "github.com/bootjp/elastickv/store" + "github.com/cockroachdb/errors" + json "github.com/goccy/go-json" + "github.com/hashicorp/raft" +) + +const ( + s3HealthPath = "/healthz" + s3ChunkSize = 1 << 20 + s3ChunkBatchOps = 16 + s3XMLNamespace = "http://s3.amazonaws.com/doc/2006-03-01/" + s3DefaultRegion = "us-east-1" + s3MaxKeys = 1000 + s3ListPageSize = 256 + + s3TxnRetryInitialBackoff = 2 * time.Millisecond + s3TxnRetryMaxBackoff = 32 * time.Millisecond + s3TxnRetryBackoffFactor = 2 + s3TxnRetryMaxAttempts = 8 + + s3PathSplitParts = 2 + s3GenerationBytes = 8 + s3HLCPhysicalShift = 16 +) + +type S3Server struct { + listen net.Listener + s3Addr string + region string + store store.MVCCStore + coordinator kv.Coordinator + leaderS3 map[raft.ServerAddress]string + staticCreds map[string]string + readTracker *kv.ActiveTimestampTracker + httpServer *http.Server +} + +type s3BucketMeta struct { + BucketName string `json:"bucket_name"` + Generation uint64 `json:"generation"` + CreatedAtHLC uint64 `json:"created_at_hlc"` + Owner string `json:"owner,omitempty"` + Region string `json:"region,omitempty"` +} + +type s3ObjectManifest struct { + UploadID string `json:"upload_id"` + ETag string `json:"etag"` + SizeBytes int64 `json:"size_bytes"` + LastModifiedHLC uint64 `json:"last_modified_hlc"` + ContentType string `json:"content_type,omitempty"` + ContentEncoding string `json:"content_encoding,omitempty"` + CacheControl string `json:"cache_control,omitempty"` + ContentDisposition string `json:"content_disposition,omitempty"` + UserMetadata map[string]string `json:"user_metadata,omitempty"` + Parts []s3ObjectPart `json:"parts,omitempty"` +} + +type s3ObjectPart struct { + PartNo 
uint64 `json:"part_no"` + ETag string `json:"etag"` + SizeBytes int64 `json:"size_bytes"` + ChunkCount uint64 `json:"chunk_count"` + ChunkSizes []uint64 `json:"chunk_sizes,omitempty"` +} + +type s3ContinuationToken struct { + Bucket string `json:"bucket"` + Generation uint64 `json:"generation"` + Prefix string `json:"prefix,omitempty"` + Delimiter string `json:"delimiter,omitempty"` + LastKey string `json:"last_key,omitempty"` + LastCommonPrefix string `json:"last_common_prefix,omitempty"` + ReadTS uint64 `json:"read_ts,omitempty"` +} + +type s3ResponseError struct { + Status int + Code string + Message string + Bucket string + Key string +} + +func (e *s3ResponseError) Error() string { + if e == nil { + return "" + } + return e.Message +} + +type s3ErrorResponse struct { + XMLName xml.Name `xml:"Error"` + XMLNS string `xml:"xmlns,attr,omitempty"` + Code string `xml:"Code"` + Message string `xml:"Message"` + Bucket string `xml:"BucketName,omitempty"` + Key string `xml:"Key,omitempty"` +} + +type s3ListBucketsResult struct { + XMLName xml.Name `xml:"ListAllMyBucketsResult"` + XMLNS string `xml:"xmlns,attr,omitempty"` + Buckets s3BucketsCollection `xml:"Buckets"` +} + +type s3BucketsCollection struct { + Buckets []s3BucketEntry `xml:"Bucket"` +} + +type s3BucketEntry struct { + Name string `xml:"Name"` + CreationDate string `xml:"CreationDate"` +} + +type s3ListBucketResult struct { + XMLName xml.Name `xml:"ListBucketResult"` + XMLNS string `xml:"xmlns,attr,omitempty"` + Name string `xml:"Name"` + Prefix string `xml:"Prefix,omitempty"` + Delimiter string `xml:"Delimiter,omitempty"` + MaxKeys int `xml:"MaxKeys"` + KeyCount int `xml:"KeyCount"` + IsTruncated bool `xml:"IsTruncated"` + ContinuationToken string `xml:"ContinuationToken,omitempty"` + NextContinuationToken string `xml:"NextContinuationToken,omitempty"` + Contents []s3ListObjectContent `xml:"Contents"` + CommonPrefixes []s3CommonPrefix `xml:"CommonPrefixes"` +} + +type s3ListObjectContent struct { + Key 
string `xml:"Key"` + LastModified string `xml:"LastModified"` + ETag string `xml:"ETag"` + Size int64 `xml:"Size"` + StorageClass string `xml:"StorageClass"` +} + +type s3CommonPrefix struct { + Prefix string `xml:"Prefix"` +} + +func NewS3Server(listen net.Listener, s3Addr string, st store.MVCCStore, coordinate kv.Coordinator, leaderS3 map[raft.ServerAddress]string, opts ...S3ServerOption) *S3Server { + s := &S3Server{ + listen: listen, + s3Addr: s3Addr, + region: s3DefaultRegion, + store: st, + coordinator: coordinate, + leaderS3: cloneLeaderAddrMap(leaderS3), + } + for _, opt := range opts { + if opt != nil { + opt(s) + } + } + if s.s3Addr != "" { + if s.leaderS3 == nil { + s.leaderS3 = map[raft.ServerAddress]string{} + } + if leader := s.coordinatorLeaderAddress(); leader != "" { + s.leaderS3[leader] = s.s3Addr + } + } + mux := http.NewServeMux() + mux.HandleFunc("/", s.handle) + s.httpServer = &http.Server{Handler: mux, ReadHeaderTimeout: time.Second} + return s +} + +func (s *S3Server) Run() error { + if err := s.httpServer.Serve(s.listen); err != nil && !errors.Is(err, http.ErrServerClosed) { + return errors.WithStack(err) + } + return nil +} + +func (s *S3Server) Stop() { + if s != nil && s.httpServer != nil { + _ = s.httpServer.Shutdown(context.Background()) + } +} + +func (s *S3Server) handle(w http.ResponseWriter, r *http.Request) { + if serveS3Healthz(w, r) { + return + } + if s.maybeProxyToLeader(w, r) { + return + } + if authErr := s.authorizeRequest(r); authErr != nil { + writeS3Error(w, authErr.Status, authErr.Code, authErr.Message, "", "") + return + } + + bucket, objectKey, hasObject, err := parseS3Path(r) + if err != nil { + writeS3Error(w, http.StatusBadRequest, "InvalidURI", err.Error(), bucket, objectKey) + return + } + + switch { + case bucket == "": + s.handleRoot(w, r) + case !hasObject: + s.handleBucket(w, r, bucket) + default: + s.handleObject(w, r, bucket, objectKey) + } +} + +func (s *S3Server) handleRoot(w http.ResponseWriter, r 
*http.Request) { + if r.Method != http.MethodGet { + writeS3Error(w, http.StatusMethodNotAllowed, "MethodNotAllowed", "unsupported method", "", "") + return + } + s.listBuckets(w, r) +} + +func (s *S3Server) handleBucket(w http.ResponseWriter, r *http.Request, bucket string) { + switch r.Method { + case http.MethodPut: + s.createBucket(w, r, bucket) + case http.MethodHead: + s.headBucket(w, r, bucket) + case http.MethodDelete: + s.deleteBucket(w, r, bucket) + case http.MethodGet: + if r.URL.Query().Get("list-type") == "2" { + s.listObjectsV2(w, r, bucket) + return + } + writeS3Error(w, http.StatusNotImplemented, "NotImplemented", "only ListObjectsV2 is supported", bucket, "") + default: + writeS3Error(w, http.StatusMethodNotAllowed, "MethodNotAllowed", "unsupported method", bucket, "") + } +} + +func (s *S3Server) handleObject(w http.ResponseWriter, r *http.Request, bucket string, objectKey string) { + switch r.Method { + case http.MethodPut: + s.putObject(w, r, bucket, objectKey) + case http.MethodGet: + s.getObject(w, r, bucket, objectKey, false) + case http.MethodHead: + s.getObject(w, r, bucket, objectKey, true) + case http.MethodDelete: + s.deleteObject(w, r, bucket, objectKey) + default: + writeS3Error(w, http.StatusMethodNotAllowed, "MethodNotAllowed", "unsupported method", bucket, objectKey) + } +} + +func (s *S3Server) listBuckets(w http.ResponseWriter, r *http.Request) { + readTS := s.readTS() + readPin := s.pinReadTS(readTS) + defer readPin.Release() + kvs, err := s.store.ScanAt(r.Context(), []byte(s3keys.BucketMetaPrefix), prefixScanEnd([]byte(s3keys.BucketMetaPrefix)), s3MaxKeys, readTS) + if err != nil { + writeS3InternalError(w, err) + return + } + + out := s3ListBucketsResult{XMLNS: s3XMLNamespace} + for _, kvp := range kvs { + bucket, ok := s3keys.ParseBucketMetaKey(kvp.Key) + if !ok { + continue + } + meta, err := decodeS3BucketMeta(kvp.Value) + if err != nil { + writeS3InternalError(w, err) + return + } + if meta == nil { + continue + } + 
out.Buckets.Buckets = append(out.Buckets.Buckets, s3BucketEntry{ + Name: bucket, + CreationDate: formatS3ISOTime(meta.CreatedAtHLC), + }) + } + writeS3XML(w, http.StatusOK, out) +} + +func (s *S3Server) createBucket(w http.ResponseWriter, r *http.Request, bucket string) { + if err := validateS3BucketName(bucket); err != nil { + writeS3Error(w, http.StatusBadRequest, "InvalidBucketName", err.Error(), bucket, "") + return + } + err := s.retryS3Mutation(r.Context(), func() error { + readTS := s.readTS() + startTS := s.txnStartTS(readTS) + readPin := s.pinReadTS(readTS) + defer readPin.Release() + + existing, exists, err := s.loadBucketMetaAt(r.Context(), bucket, readTS) + if err != nil { + return errors.WithStack(err) + } + if exists && existing != nil { + return &s3ResponseError{ + Status: http.StatusConflict, + Code: "BucketAlreadyOwnedByYou", + Message: "bucket already exists", + Bucket: bucket, + } + } + + nextGeneration, err := s.nextBucketGenerationAt(r.Context(), bucket, readTS) + if err != nil { + return errors.WithStack(err) + } + commitTS, err := s.nextTxnCommitTS(startTS) + if err != nil { + return errors.WithStack(err) + } + meta := &s3BucketMeta{ + BucketName: bucket, + Generation: nextGeneration, + CreatedAtHLC: commitTS, + Region: s.effectiveRegion(), + } + body, err := encodeS3BucketMeta(meta) + if err != nil { + return errors.WithStack(err) + } + req := &kv.OperationGroup[kv.OP]{ + IsTxn: true, + StartTS: startTS, + CommitTS: commitTS, + Elems: []*kv.Elem[kv.OP]{ + {Op: kv.Put, Key: s3keys.BucketMetaKey(bucket), Value: body}, + {Op: kv.Put, Key: s3keys.BucketGenerationKey(bucket), Value: encodeS3Generation(nextGeneration)}, + }, + } + _, err = s.coordinator.Dispatch(r.Context(), req) + return errors.WithStack(err) + }) + if err != nil { + writeS3MutationError(w, err, bucket, "") + return + } + w.WriteHeader(http.StatusOK) +} + +func (s *S3Server) headBucket(w http.ResponseWriter, r *http.Request, bucket string) { + readTS := s.readTS() + readPin := 
s.pinReadTS(readTS) + defer readPin.Release() + + _, exists, err := s.loadBucketMetaAt(r.Context(), bucket, readTS) + if err != nil { + writeS3InternalError(w, err) + return + } + if !exists { + writeS3Error(w, http.StatusNotFound, "NoSuchBucket", "bucket not found", bucket, "") + return + } + w.WriteHeader(http.StatusOK) +} + +func (s *S3Server) deleteBucket(w http.ResponseWriter, r *http.Request, bucket string) { + err := s.retryS3Mutation(r.Context(), func() error { + readTS := s.readTS() + startTS := s.txnStartTS(readTS) + readPin := s.pinReadTS(readTS) + defer readPin.Release() + + meta, exists, err := s.loadBucketMetaAt(r.Context(), bucket, readTS) + if err != nil { + return errors.WithStack(err) + } + if !exists || meta == nil { + return &s3ResponseError{ + Status: http.StatusNotFound, + Code: "NoSuchBucket", + Message: "bucket not found", + Bucket: bucket, + } + } + + start := s3keys.ObjectManifestPrefixForBucket(bucket, meta.Generation) + kvs, err := s.store.ScanAt(r.Context(), start, prefixScanEnd(start), 1, readTS) + if err != nil { + return errors.WithStack(err) + } + if len(kvs) > 0 { + return &s3ResponseError{ + Status: http.StatusConflict, + Code: "BucketNotEmpty", + Message: "bucket is not empty", + Bucket: bucket, + } + } + + _, err = s.coordinator.Dispatch(r.Context(), &kv.OperationGroup[kv.OP]{ + IsTxn: true, + StartTS: startTS, + Elems: []*kv.Elem[kv.OP]{ + {Op: kv.Del, Key: s3keys.BucketMetaKey(bucket)}, + }, + }) + return errors.WithStack(err) + }) + if err != nil { + writeS3MutationError(w, err, bucket, "") + return + } + w.WriteHeader(http.StatusNoContent) +} + +//nolint:cyclop,gocognit,nestif // The S3 PUT flow is intentionally linear and maps directly to protocol steps. 
+func (s *S3Server) putObject(w http.ResponseWriter, r *http.Request, bucket string, objectKey string) { + readTS := s.readTS() + startTS := s.txnStartTS(readTS) + readPin := s.pinReadTS(readTS) + defer readPin.Release() + + meta, exists, err := s.loadBucketMetaAt(r.Context(), bucket, readTS) + if err != nil { + writeS3InternalError(w, err) + return + } + if !exists || meta == nil { + writeS3Error(w, http.StatusNotFound, "NoSuchBucket", "bucket not found", bucket, objectKey) + return + } + + headKey := s3keys.ObjectManifestKey(bucket, meta.Generation, objectKey) + previous, _, err := s.loadObjectManifestAt(r.Context(), headKey, readTS) + if err != nil { + writeS3InternalError(w, err) + return + } + if err := validateS3PutPreconditions(r, previous); err != nil { + writeS3Error(w, http.StatusPreconditionFailed, "PreconditionFailed", err.Error(), bucket, objectKey) + return + } + + uploadID := newS3UploadID(s.clock()) + hasher := md5.New() //nolint:gosec // S3 ETag compatibility requires MD5. + sha256Hasher := sha256.New() + expectedPayloadSHA := normalizeS3PayloadHash(r.Header.Get("X-Amz-Content-Sha256")) + validatePayloadSHA := expectedPayloadSHA != "" && !strings.EqualFold(expectedPayloadSHA, s3UnsignedPayload) + part := s3ObjectPart{PartNo: 1} + sizeBytes := int64(0) + chunkNo := uint64(0) + buf := make([]byte, s3ChunkSize) + pendingBatch := make([]*kv.Elem[kv.OP], 0, s3ChunkBatchOps) + pendingChunkSizes := make([]uint64, 0, s3ChunkBatchOps) + uploadedManifest := func() *s3ObjectManifest { + if len(part.ChunkSizes) == 0 { + return &s3ObjectManifest{UploadID: uploadID} + } + clonedPart := part + clonedPart.ChunkSizes = append([]uint64(nil), part.ChunkSizes...) 
+ clonedPart.ChunkCount = uint64(len(clonedPart.ChunkSizes)) + return &s3ObjectManifest{ + UploadID: uploadID, + Parts: []s3ObjectPart{clonedPart}, + } + } + flushBatch := func() error { + if len(pendingBatch) == 0 { + return nil + } + _, err := s.coordinator.Dispatch(r.Context(), &kv.OperationGroup[kv.OP]{Elems: pendingBatch}) + if err != nil { + return errors.WithStack(err) + } + part.ChunkSizes = append(part.ChunkSizes, pendingChunkSizes...) + part.ChunkCount = uint64(len(part.ChunkSizes)) + pendingBatch = pendingBatch[:0] + pendingChunkSizes = pendingChunkSizes[:0] + return nil + } + for { + n, readErr := r.Body.Read(buf) + if n > 0 { + chunk := append([]byte(nil), buf[:n]...) + if _, err := hasher.Write(chunk); err != nil { + writeS3InternalError(w, err) + return + } + if validatePayloadSHA { + if _, err := sha256Hasher.Write(chunk); err != nil { + writeS3InternalError(w, err) + return + } + } + chunkKey := s3keys.BlobKey(bucket, meta.Generation, objectKey, uploadID, part.PartNo, chunkNo) + pendingBatch = append(pendingBatch, &kv.Elem[kv.OP]{Op: kv.Put, Key: chunkKey, Value: chunk}) + chunkSize, err := uint64FromInt(n) + if err != nil { + s.cleanupManifestBlobs(r.Context(), bucket, meta.Generation, objectKey, uploadedManifest()) + writeS3InternalError(w, err) + return + } + pendingChunkSizes = append(pendingChunkSizes, chunkSize) + if len(pendingBatch) >= s3ChunkBatchOps { + if err := flushBatch(); err != nil { + s.cleanupManifestBlobs(r.Context(), bucket, meta.Generation, objectKey, uploadedManifest()) + writeS3InternalError(w, err) + return + } + } + sizeBytes += int64(n) + chunkNo++ + } + if errors.Is(readErr, io.EOF) { + break + } + if readErr != nil { + s.cleanupManifestBlobs(r.Context(), bucket, meta.Generation, objectKey, uploadedManifest()) + writeS3InternalError(w, readErr) + return + } + } + if err := flushBatch(); err != nil { + s.cleanupManifestBlobs(r.Context(), bucket, meta.Generation, objectKey, uploadedManifest()) + writeS3InternalError(w, err) 
+ return + } + if validatePayloadSHA { + actualPayloadSHA := hex.EncodeToString(sha256Hasher.Sum(nil)) + if !strings.EqualFold(actualPayloadSHA, expectedPayloadSHA) { + s.cleanupManifestBlobs(r.Context(), bucket, meta.Generation, objectKey, uploadedManifest()) + writeS3Error(w, http.StatusBadRequest, "XAmzContentSHA256Mismatch", "payload SHA256 does not match x-amz-content-sha256", bucket, objectKey) + return + } + } + + etag := hex.EncodeToString(hasher.Sum(nil)) + part.ETag = etag + part.SizeBytes = sizeBytes + part.ChunkCount = uint64(len(part.ChunkSizes)) + commitTS, err := s.nextTxnCommitTS(startTS) + if err != nil { + s.cleanupManifestBlobs(r.Context(), bucket, meta.Generation, objectKey, uploadedManifest()) + writeS3InternalError(w, err) + return + } + manifest := &s3ObjectManifest{ + UploadID: uploadID, + ETag: etag, + SizeBytes: sizeBytes, + LastModifiedHLC: commitTS, + ContentType: headerOrDefault(r.Header.Get("Content-Type"), "application/octet-stream"), + ContentEncoding: r.Header.Get("Content-Encoding"), + CacheControl: r.Header.Get("Cache-Control"), + ContentDisposition: r.Header.Get("Content-Disposition"), + UserMetadata: collectS3UserMetadata(r.Header), + } + if sizeBytes > 0 { + manifest.Parts = []s3ObjectPart{part} + } + body, err := encodeS3ObjectManifest(manifest) + if err != nil { + s.cleanupManifestBlobs(r.Context(), bucket, meta.Generation, objectKey, uploadedManifest()) + writeS3InternalError(w, err) + return + } + bucketFence, err := encodeS3BucketMeta(meta) + if err != nil { + s.cleanupManifestBlobs(r.Context(), bucket, meta.Generation, objectKey, uploadedManifest()) + writeS3InternalError(w, err) + return + } + if _, err := s.coordinator.Dispatch(r.Context(), &kv.OperationGroup[kv.OP]{ + IsTxn: true, + StartTS: startTS, + CommitTS: commitTS, + Elems: []*kv.Elem[kv.OP]{ + {Op: kv.Put, Key: s3keys.BucketMetaKey(bucket), Value: bucketFence}, + {Op: kv.Put, Key: headKey, Value: body}, + }, + }); err != nil { + 
s.cleanupManifestBlobs(r.Context(), bucket, meta.Generation, objectKey, uploadedManifest()) + writeS3MutationError(w, err, bucket, objectKey) + return + } + + if previous != nil { + s.cleanupManifestBlobsAsync(bucket, meta.Generation, objectKey, previous) + } + w.Header().Set("ETag", quoteS3ETag(etag)) + w.WriteHeader(http.StatusOK) +} + +//nolint:cyclop // The read handler branches on bucket/object existence and HEAD vs GET response flow. +func (s *S3Server) getObject(w http.ResponseWriter, r *http.Request, bucket string, objectKey string, headOnly bool) { + if !headOnly && strings.TrimSpace(r.Header.Get("Range")) != "" { + writeS3Error(w, http.StatusNotImplemented, "NotImplemented", "range reads are not implemented yet", bucket, objectKey) + return + } + readTS := s.readTS() + readPin := s.pinReadTS(readTS) + defer readPin.Release() + meta, exists, err := s.loadBucketMetaAt(r.Context(), bucket, readTS) + if err != nil { + writeS3InternalError(w, err) + return + } + if !exists || meta == nil { + writeS3Error(w, http.StatusNotFound, "NoSuchBucket", "bucket not found", bucket, objectKey) + return + } + + headKey := s3keys.ObjectManifestKey(bucket, meta.Generation, objectKey) + manifest, found, err := s.loadObjectManifestAt(r.Context(), headKey, readTS) + if err != nil { + writeS3InternalError(w, err) + return + } + if !found || manifest == nil { + writeS3Error(w, http.StatusNotFound, "NoSuchKey", "object not found", bucket, objectKey) + return + } + + writeS3ObjectHeaders(w.Header(), manifest) + w.WriteHeader(http.StatusOK) + if headOnly { + return + } + for _, part := range manifest.Parts { + for chunkNo := range part.ChunkSizes { + chunkIndex, err := uint64FromInt(chunkNo) + if err != nil { + writeS3InternalError(w, err) + return + } + chunkKey := s3keys.BlobKey(bucket, meta.Generation, objectKey, manifest.UploadID, part.PartNo, chunkIndex) + chunk, err := s.store.GetAt(r.Context(), chunkKey, readTS) + if err != nil { + writeS3InternalError(w, err) + return + } + 
if _, err := w.Write(chunk); err != nil { + return + } + } + } +} + +func (s *S3Server) deleteObject(w http.ResponseWriter, r *http.Request, bucket string, objectKey string) { + var cleanupManifest *s3ObjectManifest + var generation uint64 + err := s.retryS3Mutation(r.Context(), func() error { + readTS := s.readTS() + startTS := s.txnStartTS(readTS) + readPin := s.pinReadTS(readTS) + defer readPin.Release() + + meta, exists, err := s.loadBucketMetaAt(r.Context(), bucket, readTS) + if err != nil { + return errors.WithStack(err) + } + if !exists || meta == nil { + return &s3ResponseError{ + Status: http.StatusNotFound, + Code: "NoSuchBucket", + Message: "bucket not found", + Bucket: bucket, + Key: objectKey, + } + } + + headKey := s3keys.ObjectManifestKey(bucket, meta.Generation, objectKey) + manifest, found, err := s.loadObjectManifestAt(r.Context(), headKey, readTS) + if err != nil { + return errors.WithStack(err) + } + if !found { + cleanupManifest = nil + return nil + } + _, err = s.coordinator.Dispatch(r.Context(), &kv.OperationGroup[kv.OP]{ + IsTxn: true, + StartTS: startTS, + Elems: []*kv.Elem[kv.OP]{ + {Op: kv.Del, Key: headKey}, + }, + }) + if err != nil { + return errors.WithStack(err) + } + cleanupManifest = manifest + generation = meta.Generation + return nil + }) + if err != nil { + writeS3MutationError(w, err, bucket, objectKey) + return + } + if cleanupManifest != nil { + s.cleanupManifestBlobsAsync(bucket, generation, objectKey, cleanupManifest) + } + w.WriteHeader(http.StatusNoContent) +} + +//nolint:cyclop,gocognit,gocyclo,nestif // ListObjectsV2 combines token validation, shard-stable snapshotting, and delimiter pagination rules. 
+func (s *S3Server) listObjectsV2(w http.ResponseWriter, r *http.Request, bucket string) { + readTS := s.readTS() + meta, exists, err := s.loadBucketMetaAt(r.Context(), bucket, readTS) + if err != nil { + writeS3InternalError(w, err) + return + } + if !exists || meta == nil { + writeS3Error(w, http.StatusNotFound, "NoSuchBucket", "bucket not found", bucket, "") + return + } + + query := r.URL.Query() + prefix := query.Get("prefix") + delimiter := query.Get("delimiter") + maxKeys := parseS3MaxKeys(query.Get("max-keys")) + token, err := decodeS3ContinuationToken(query.Get("continuation-token")) + if err != nil { + writeS3Error(w, http.StatusBadRequest, "InvalidArgument", "invalid continuation token", bucket, "") + return + } + if token != nil { + if token.Bucket != bucket || token.Generation != meta.Generation || token.Prefix != prefix || token.Delimiter != delimiter { + writeS3Error(w, http.StatusBadRequest, "InvalidArgument", "continuation token does not match request", bucket, "") + return + } + if token.ReadTS != 0 { + readTS = token.ReadTS + } + } + readPin := s.pinReadTS(readTS) + defer readPin.Release() + + start := s3keys.ObjectManifestScanStart(bucket, meta.Generation, prefix) + if token != nil { + start = nextScanCursor(s3keys.ObjectManifestKey(bucket, token.Generation, token.LastKey)) + } + if token == nil { + startAfter := query.Get("start-after") + if startAfter != "" { + start = nextScanCursor(s3keys.ObjectManifestKey(bucket, meta.Generation, startAfter)) + } + } + end := prefixScanEnd(s3keys.ObjectManifestScanStart(bucket, meta.Generation, prefix)) + + result := s3ListBucketResult{ + XMLNS: s3XMLNamespace, + Name: bucket, + Prefix: prefix, + Delimiter: delimiter, + MaxKeys: maxKeys, + ContinuationToken: query.Get("continuation-token"), + } + commonPrefixSeen := map[string]struct{}{} + if token != nil && token.LastCommonPrefix != "" { + commonPrefixSeen[token.LastCommonPrefix] = struct{}{} + } + cursor := start + truncated := false + nextToken := "" + + 
for result.KeyCount < maxKeys { + pageLimit := s3ListPageSize + if remaining := maxKeys - result.KeyCount; remaining > pageLimit { + pageLimit = remaining + } + page, err := s.store.ScanAt(r.Context(), cursor, end, pageLimit, readTS) + if err != nil { + if token != nil && errors.Is(err, store.ErrReadTSCompacted) { + writeS3Error(w, http.StatusBadRequest, "InvalidArgument", "continuation token has expired", bucket, "") + return + } + writeS3InternalError(w, err) + return + } + if len(page) == 0 { + break + } + for _, kvp := range page { + _, generation, objectKey, ok := s3keys.ParseObjectManifestKey(kvp.Key) + if !ok || generation != meta.Generation || !strings.HasPrefix(objectKey, prefix) { + continue + } + if delimiter != "" { + suffix := strings.TrimPrefix(objectKey, prefix) + if idx := strings.Index(suffix, delimiter); idx >= 0 { + commonPrefix := prefix + suffix[:idx+len(delimiter)] + if _, exists := commonPrefixSeen[commonPrefix]; !exists { + commonPrefixSeen[commonPrefix] = struct{}{} + result.CommonPrefixes = append(result.CommonPrefixes, s3CommonPrefix{Prefix: commonPrefix}) + result.KeyCount++ + } + if result.KeyCount >= maxKeys { + truncated = true + nextToken = encodeS3ContinuationToken(bucket, meta.Generation, prefix, delimiter, objectKey, commonPrefix, readTS) + break + } + continue + } + } + manifest, err := decodeS3ObjectManifest(kvp.Value) + if err != nil { + writeS3InternalError(w, err) + return + } + result.Contents = append(result.Contents, s3ListObjectContent{ + Key: objectKey, + LastModified: formatS3ISOTime(manifest.LastModifiedHLC), + ETag: quoteS3ETag(manifest.ETag), + Size: manifest.SizeBytes, + StorageClass: "STANDARD", + }) + result.KeyCount++ + if result.KeyCount >= maxKeys { + truncated = true + nextToken = encodeS3ContinuationToken(bucket, meta.Generation, prefix, delimiter, objectKey, "", readTS) + break + } + } + if truncated { + break + } + cursor = nextScanCursor(page[len(page)-1].Key) + if len(page) < pageLimit { + break + } + } + 
+ result.IsTruncated = truncated + result.NextContinuationToken = nextToken + writeS3XML(w, http.StatusOK, result) +} + +func (s *S3Server) loadBucketMetaAt(ctx context.Context, bucket string, readTS uint64) (*s3BucketMeta, bool, error) { + raw, err := s.store.GetAt(ctx, s3keys.BucketMetaKey(bucket), readTS) + if err != nil { + if errors.Is(err, store.ErrKeyNotFound) { + return nil, false, nil + } + return nil, false, errors.WithStack(err) + } + meta, err := decodeS3BucketMeta(raw) + if err != nil { + return nil, false, err + } + return meta, true, nil +} + +func (s *S3Server) nextBucketGenerationAt(ctx context.Context, bucket string, readTS uint64) (uint64, error) { + raw, err := s.store.GetAt(ctx, s3keys.BucketGenerationKey(bucket), readTS) + if err != nil { + if errors.Is(err, store.ErrKeyNotFound) { + return 1, nil + } + return 0, errors.WithStack(err) + } + v, err := decodeS3Generation(raw) + if err != nil { + return 0, err + } + return v + 1, nil +} + +func (s *S3Server) loadObjectManifestAt(ctx context.Context, key []byte, readTS uint64) (*s3ObjectManifest, bool, error) { + raw, err := s.store.GetAt(ctx, key, readTS) + if err != nil { + if errors.Is(err, store.ErrKeyNotFound) { + return nil, false, nil + } + return nil, false, errors.WithStack(err) + } + manifest, err := decodeS3ObjectManifest(raw) + if err != nil { + return nil, false, err + } + return manifest, true, nil +} + +func (s *S3Server) cleanupManifestBlobsAsync(bucket string, generation uint64, objectKey string, manifest *s3ObjectManifest) { + if manifest == nil { + return + } + go s.cleanupManifestBlobs(context.Background(), bucket, generation, objectKey, manifest) +} + +func (s *S3Server) cleanupManifestBlobs(ctx context.Context, bucket string, generation uint64, objectKey string, manifest *s3ObjectManifest) { + if s == nil || manifest == nil || manifest.UploadID == "" || s.coordinator == nil { + return + } + pending := make([]*kv.Elem[kv.OP], 0, s3ChunkBatchOps) + flush := func() { + if 
len(pending) == 0 { + return + } + _, _ = s.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{Elems: pending}) + pending = pending[:0] + } + for _, part := range manifest.Parts { + for chunkNo := range part.ChunkSizes { + chunkIndex, err := uint64FromInt(chunkNo) + if err != nil { + return + } + pending = append(pending, &kv.Elem[kv.OP]{ + Op: kv.Del, + Key: s3keys.BlobKey(bucket, generation, objectKey, manifest.UploadID, part.PartNo, chunkIndex), + }) + if len(pending) >= s3ChunkBatchOps { + flush() + } + } + } + flush() +} + +//nolint:cyclop // Proxying depends on root, bucket, and object-level leadership decisions. +func (s *S3Server) maybeProxyToLeader(w http.ResponseWriter, r *http.Request) bool { + if s == nil || s.coordinator == nil { + return false + } + key, err := s.proxyRouteKey(r) + if err != nil { + return false + } + var leader raft.ServerAddress + if len(key) > 0 { + if s.coordinator.IsLeaderForKey(key) && s.coordinator.VerifyLeaderForKey(key) == nil { + return false + } + leader = s.coordinator.RaftLeaderForKey(key) + } else { + if s.coordinator.IsLeader() && s.coordinator.VerifyLeader() == nil { + return false + } + leader = s.coordinator.RaftLeader() + } + if leader == "" { + writeS3Error(w, http.StatusServiceUnavailable, "ServiceUnavailable", "leader not found", "", "") + return true + } + targetHost, ok := s.leaderS3[leader] + if !ok || strings.TrimSpace(targetHost) == "" { + writeS3Error(w, http.StatusServiceUnavailable, "ServiceUnavailable", "leader endpoint not found", "", "") + return true + } + target := &url.URL{Scheme: "http", Host: targetHost} + originalHost := r.Host + proxy := httputil.NewSingleHostReverseProxy(target) + origDirector := proxy.Director + proxy.Director = func(req *http.Request) { + origDirector(req) + req.Host = originalHost + } + proxy.ErrorHandler = func(rw http.ResponseWriter, _ *http.Request, err error) { + writeS3InternalError(rw, err) + } + proxy.ServeHTTP(w, r) + return true +} + +func (s *S3Server) 
proxyRouteKey(r *http.Request) ([]byte, error) { + bucket, objectKey, hasObject, err := parseS3Path(r) + if err != nil || bucket == "" { + return nil, errors.WithStack(err) + } + if !hasObject { + return s3keys.BucketMetaKey(bucket), nil + } + + meta, exists, err := s.loadBucketMetaAt(r.Context(), bucket, s.readTS()) + if err != nil { + return nil, errors.WithStack(err) + } + if !exists || meta == nil { + return s3keys.BucketMetaKey(bucket), nil + } + return s3keys.RouteKey(bucket, meta.Generation, objectKey), nil +} + +func (s *S3Server) clock() *kv.HLC { + if s == nil || s.coordinator == nil { + return nil + } + return s.coordinator.Clock() +} + +func (s *S3Server) coordinatorLeaderAddress() raft.ServerAddress { + if s == nil || s.coordinator == nil { + return "" + } + return s.coordinator.RaftLeader() +} + +func parseS3Path(r *http.Request) (string, string, bool, error) { + escaped := "/" + if r != nil && r.URL != nil { + escaped = r.URL.EscapedPath() + } + trimmed := strings.TrimPrefix(escaped, "/") + if trimmed == "" { + return "", "", false, nil + } + parts := strings.SplitN(trimmed, "/", s3PathSplitParts) + bucket, err := url.PathUnescape(parts[0]) + if err != nil { + return "", "", false, errors.WithStack(err) + } + if len(parts) == 1 { + return bucket, "", false, nil + } + objectKey, err := url.PathUnescape(parts[1]) + if err != nil { + return "", "", false, errors.WithStack(err) + } + return bucket, objectKey, true, nil +} + +func serveS3Healthz(w http.ResponseWriter, r *http.Request) bool { + if r == nil || r.URL == nil || r.URL.Path != s3HealthPath { + return false + } + if r.Method != http.MethodGet && r.Method != http.MethodHead { + w.WriteHeader(http.StatusMethodNotAllowed) + return true + } + w.WriteHeader(http.StatusOK) + if r.Method != http.MethodHead { + _, _ = io.WriteString(w, "ok") + } + return true +} + +func encodeS3BucketMeta(meta *s3BucketMeta) ([]byte, error) { + if meta == nil { + return nil, errors.New("s3 bucket metadata is required") 
+ } + body, err := json.Marshal(meta) + if err != nil { + return nil, errors.WithStack(err) + } + return body, nil +} + +func decodeS3BucketMeta(raw []byte) (*s3BucketMeta, error) { + if len(raw) == 0 { + return nil, nil + } + meta := &s3BucketMeta{} + if err := json.Unmarshal(raw, meta); err != nil { + return nil, errors.WithStack(err) + } + return meta, nil +} + +func encodeS3ObjectManifest(manifest *s3ObjectManifest) ([]byte, error) { + if manifest == nil { + return nil, errors.New("s3 manifest is required") + } + body, err := json.Marshal(manifest) + if err != nil { + return nil, errors.WithStack(err) + } + return body, nil +} + +func decodeS3ObjectManifest(raw []byte) (*s3ObjectManifest, error) { + if len(raw) == 0 { + return nil, nil + } + manifest := &s3ObjectManifest{} + if err := json.Unmarshal(raw, manifest); err != nil { + return nil, errors.WithStack(err) + } + return manifest, nil +} + +func encodeS3Generation(generation uint64) []byte { + var raw [8]byte + binary.BigEndian.PutUint64(raw[:], generation) + return raw[:] +} + +func decodeS3Generation(raw []byte) (uint64, error) { + if len(raw) == s3GenerationBytes { + return binary.BigEndian.Uint64(raw), nil + } + v, err := strconv.ParseUint(strings.TrimSpace(string(raw)), 10, 64) + if err != nil { + return 0, errors.WithStack(err) + } + return v, nil +} + +func writeS3XML(w http.ResponseWriter, status int, body any) { + payload, err := xml.Marshal(body) + if err != nil { + writeS3InternalError(w, err) + return + } + w.Header().Set("Content-Type", "application/xml") + w.WriteHeader(status) + _, _ = w.Write([]byte(xml.Header)) + _, _ = w.Write(payload) +} + +func writeS3Error(w http.ResponseWriter, status int, code string, message string, bucket string, key string) { + writeS3XML(w, status, s3ErrorResponse{ + XMLNS: s3XMLNamespace, + Code: code, + Message: message, + Bucket: bucket, + Key: key, + }) +} + +func writeS3InternalError(w http.ResponseWriter, err error) { + message := "internal error" + if err 
!= nil { + message = err.Error() + } + writeS3Error(w, http.StatusInternalServerError, "InternalError", message, "", "") +} + +func writeS3ObjectHeaders(h http.Header, manifest *s3ObjectManifest) { + if h == nil || manifest == nil { + return + } + h.Set("ETag", quoteS3ETag(manifest.ETag)) + h.Set("Content-Length", strconv.FormatInt(manifest.SizeBytes, 10)) + h.Set("Last-Modified", formatS3HTTPTime(manifest.LastModifiedHLC)) + h.Set("Content-Type", headerOrDefault(manifest.ContentType, "application/octet-stream")) + if manifest.ContentEncoding != "" { + h.Set("Content-Encoding", manifest.ContentEncoding) + } + if manifest.CacheControl != "" { + h.Set("Cache-Control", manifest.CacheControl) + } + if manifest.ContentDisposition != "" { + h.Set("Content-Disposition", manifest.ContentDisposition) + } + for key, value := range manifest.UserMetadata { + h.Set("x-amz-meta-"+key, value) + } +} + +func collectS3UserMetadata(h http.Header) map[string]string { + if h == nil { + return nil + } + out := map[string]string{} + for key, values := range h { + lower := strings.ToLower(key) + if !strings.HasPrefix(lower, "x-amz-meta-") || len(values) == 0 { + continue + } + out[strings.TrimPrefix(lower, "x-amz-meta-")] = values[0] + } + if len(out) == 0 { + return nil + } + return out +} + +func decodeS3ContinuationToken(raw string) (*s3ContinuationToken, error) { + if strings.TrimSpace(raw) == "" { + return nil, nil + } + data, err := base64.StdEncoding.DecodeString(raw) + if err != nil { + return nil, errors.WithStack(err) + } + token := &s3ContinuationToken{} + if err := json.Unmarshal(data, token); err != nil { + return nil, errors.WithStack(err) + } + return token, nil +} + +func encodeS3ContinuationToken(bucket string, generation uint64, prefix string, delimiter string, lastKey string, lastCommonPrefix string, readTS uint64) string { + data, err := json.Marshal(&s3ContinuationToken{ + Bucket: bucket, + Generation: generation, + Prefix: prefix, + Delimiter: delimiter, + LastKey: 
lastKey, + LastCommonPrefix: lastCommonPrefix, + ReadTS: readTS, + }) + if err != nil { + return "" + } + return base64.StdEncoding.EncodeToString(data) +} + +func parseS3MaxKeys(raw string) int { + if strings.TrimSpace(raw) == "" { + return s3MaxKeys + } + v, err := strconv.Atoi(raw) + if err != nil || v <= 0 { + return s3MaxKeys + } + if v > s3MaxKeys { + return s3MaxKeys + } + return v +} + +func quoteS3ETag(etag string) string { + if etag == "" { + return `""` + } + return fmt.Sprintf("%q", etag) +} + +func formatS3HTTPTime(ts uint64) string { + return hlcToTime(ts).UTC().Format(http.TimeFormat) +} + +func formatS3ISOTime(ts uint64) string { + return hlcToTime(ts).UTC().Format("2006-01-02T15:04:05.000Z") +} + +func hlcToTime(ts uint64) time.Time { + if ts == 0 || ts == ^uint64(0) { + return time.Unix(0, 0).UTC() + } + millis := ts >> s3HLCPhysicalShift + if millis > math.MaxInt64 { + millis = math.MaxInt64 + } + return time.UnixMilli(int64(millis)).UTC() //nolint:gosec // millis is clamped to MaxInt64 above. 
+} + +func (s *S3Server) readTS() uint64 { + return snapshotTS(s.clock(), s.store) +} + +func (s *S3Server) pinReadTS(ts uint64) *kv.ActiveTimestampToken { + if s == nil || s.readTracker == nil { + return nil + } + return s.readTracker.Pin(ts) +} + +func (s *S3Server) txnStartTS(readTS uint64) uint64 { + if readTS == ^uint64(0) { + if clock := s.clock(); clock != nil { + return clock.Next() + } + return 1 + } + return readTS +} + +func newS3UploadID(clock *kv.HLC) string { + var raw [16]byte + if _, err := rand.Read(raw[:]); err == nil { + return hex.EncodeToString(raw[:]) + } + if clock != nil { + return fmt.Sprintf("%016x", clock.Next()) + } + return strconv.FormatInt(time.Now().UnixNano(), 16) +} + +func (s *S3Server) effectiveRegion() string { + if s == nil || strings.TrimSpace(s.region) == "" { + return s3DefaultRegion + } + return s.region +} + +func (s *S3Server) nextTxnCommitTS(startTS uint64) (uint64, error) { + clock := s.clock() + if clock == nil { + if startTS == ^uint64(0) { + return 0, errors.WithStack(kv.ErrTxnCommitTSRequired) + } + return startTS + 1, nil + } + clock.Observe(startTS) + commitTS := clock.Next() + if commitTS <= startTS { + return 0, errors.WithStack(kv.ErrTxnCommitTSRequired) + } + return commitTS, nil +} + +func isRetryableS3MutationErr(err error) bool { + return errors.Is(err, store.ErrWriteConflict) || errors.Is(err, kv.ErrTxnLocked) +} + +func waitS3RetryBackoff(ctx context.Context, delay time.Duration) bool { + timer := time.NewTimer(delay) + defer timer.Stop() + + select { + case <-ctx.Done(): + return false + case <-timer.C: + return true + } +} + +func nextS3RetryBackoff(current time.Duration) time.Duration { + next := current * s3TxnRetryBackoffFactor + if next > s3TxnRetryMaxBackoff { + return s3TxnRetryMaxBackoff + } + return next +} + +func (s *S3Server) retryS3Mutation(ctx context.Context, fn func() error) error { + backoff := s3TxnRetryInitialBackoff + for attempt := 0; ; attempt++ { + err := fn() + if err == nil { + 
return nil + } + if !isRetryableS3MutationErr(err) { + return errors.WithStack(err) + } + if attempt >= s3TxnRetryMaxAttempts { + return errors.WithStack(err) + } + if !waitS3RetryBackoff(ctx, backoff) { + return errors.WithStack(err) + } + backoff = nextS3RetryBackoff(backoff) + } +} + +func writeS3MutationError(w http.ResponseWriter, err error, bucket string, key string) { + var responseErr *s3ResponseError + if errors.As(err, &responseErr) { + writeS3Error(w, responseErr.Status, responseErr.Code, responseErr.Message, responseErr.Bucket, responseErr.Key) + return + } + if isRetryableS3MutationErr(err) { + writeS3Error(w, http.StatusConflict, "OperationAborted", "conflicting conditional operation in progress", bucket, key) + return + } + writeS3InternalError(w, err) +} + +func validateS3BucketName(bucket string) error { + if len(bucket) < 3 || len(bucket) > 63 { + return errors.New("bucket name must be 3-63 characters") + } + for _, r := range bucket { + if !isValidS3BucketRune(r) { + return errors.New("bucket name contains unsupported characters") + } + } + return nil +} + +func isValidS3BucketRune(r rune) bool { + switch { + case r >= 'a' && r <= 'z': + return true + case r >= '0' && r <= '9': + return true + case r == '.' 
|| r == '-': + return true + default: + return false + } +} + +func uint64FromInt(v int) (uint64, error) { + if v < 0 { + return 0, errors.New("negative integer cannot be converted to uint64") + } + return uint64(v), nil +} + +func validateS3PutPreconditions(r *http.Request, previous *s3ObjectManifest) error { + if r == nil { + return nil + } + if ifNoneMatch := strings.TrimSpace(r.Header.Get("If-None-Match")); ifNoneMatch == "*" && previous != nil { + return errors.New("object already exists") + } + if ifMatch := strings.TrimSpace(r.Header.Get("If-Match")); ifMatch != "" { + if previous == nil || strings.Trim(ifMatch, `"`) != previous.ETag { + return errors.New("etag precondition failed") + } + } + return nil +} + +func cloneLeaderAddrMap(src map[raft.ServerAddress]string) map[raft.ServerAddress]string { + if len(src) == 0 { + return nil + } + out := make(map[raft.ServerAddress]string, len(src)) + for key, value := range src { + out[key] = value + } + return out +} + +func headerOrDefault(value string, fallback string) string { + if strings.TrimSpace(value) == "" { + return fallback + } + return value +} diff --git a/adapter/s3_auth.go b/adapter/s3_auth.go new file mode 100644 index 00000000..6d0f0fe9 --- /dev/null +++ b/adapter/s3_auth.go @@ -0,0 +1,248 @@ +package adapter + +import ( + "context" + "crypto/subtle" + "net/http" + "strings" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + v4 "github.com/aws/aws-sdk-go-v2/aws/signer/v4" + "github.com/bootjp/elastickv/kv" + "github.com/cockroachdb/errors" +) + +const ( + s3SigV4Algorithm = "AWS4-HMAC-SHA256" + s3UnsignedPayload = "UNSIGNED-PAYLOAD" + s3EmptyPayloadHash = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + s3DateHeaderFormat = "20060102T150405Z" +) + +type S3ServerOption func(*S3Server) + +type s3AuthError struct { + Status int + Code string + Message string +} + +type s3AuthorizationHeader struct { + Algorithm string + AccessKeyID string + Date string + Region string + Service 
string +} + +func WithS3Region(region string) S3ServerOption { + return func(server *S3Server) { + if server == nil || strings.TrimSpace(region) == "" { + return + } + server.region = strings.TrimSpace(region) + } +} + +func WithS3StaticCredentials(creds map[string]string) S3ServerOption { + return func(server *S3Server) { + if server == nil || len(creds) == 0 { + return + } + server.staticCreds = make(map[string]string, len(creds)) + for accessKeyID, secretAccessKey := range creds { + accessKeyID = strings.TrimSpace(accessKeyID) + secretAccessKey = strings.TrimSpace(secretAccessKey) + if accessKeyID == "" || secretAccessKey == "" { + continue + } + server.staticCreds[accessKeyID] = secretAccessKey + } + if len(server.staticCreds) == 0 { + server.staticCreds = nil + } + } +} + +func WithS3ActiveTimestampTracker(tracker *kv.ActiveTimestampTracker) S3ServerOption { + return func(server *S3Server) { + if server == nil { + return + } + server.readTracker = tracker + } +} + +//nolint:cyclop // SigV4 validation must preserve AWS-compatible error ordering. 
+func (s *S3Server) authorizeRequest(r *http.Request) *s3AuthError { + if s == nil || r == nil || len(s.staticCreds) == 0 { + return nil + } + if strings.TrimSpace(r.URL.Query().Get("X-Amz-Algorithm")) != "" { + return &s3AuthError{ + Status: http.StatusNotImplemented, + Code: "NotImplemented", + Message: "presigned URLs are not supported yet", + } + } + + authHeader := strings.TrimSpace(r.Header.Get("Authorization")) + if authHeader == "" { + return &s3AuthError{ + Status: http.StatusForbidden, + Code: "AccessDenied", + Message: "missing Authorization header", + } + } + payloadHash := normalizeS3PayloadHash(r.Header.Get("X-Amz-Content-Sha256")) + if payloadHash == "" { + return &s3AuthError{ + Status: http.StatusForbidden, + Code: "AccessDenied", + Message: "missing x-amz-content-sha256 header", + } + } + + parsed, err := parseS3AuthorizationHeader(authHeader) + if err != nil { + return &s3AuthError{ + Status: http.StatusForbidden, + Code: "AccessDenied", + Message: "invalid Authorization header", + } + } + if parsed.Algorithm != s3SigV4Algorithm { + return &s3AuthError{ + Status: http.StatusForbidden, + Code: "AccessDenied", + Message: "unsupported signature algorithm", + } + } + secretAccessKey, ok := s.staticCreds[parsed.AccessKeyID] + if !ok { + return &s3AuthError{ + Status: http.StatusForbidden, + Code: "InvalidAccessKeyId", + Message: "unknown access key", + } + } + if parsed.Service != "s3" { + return &s3AuthError{ + Status: http.StatusForbidden, + Code: "AuthorizationHeaderMalformed", + Message: "credential scope service must be s3", + } + } + if parsed.Region != s.effectiveRegion() { + return &s3AuthError{ + Status: http.StatusForbidden, + Code: "AuthorizationHeaderMalformed", + Message: "credential scope region does not match server region", + } + } + + amzDate := strings.TrimSpace(r.Header.Get("X-Amz-Date")) + if amzDate == "" { + return &s3AuthError{ + Status: http.StatusForbidden, + Code: "AccessDenied", + Message: "missing x-amz-date header", + } + 
} + signingTime, err := time.Parse(s3DateHeaderFormat, amzDate) + if err != nil { + return &s3AuthError{ + Status: http.StatusForbidden, + Code: "AccessDenied", + Message: "invalid x-amz-date header", + } + } + if parsed.Date != signingTime.UTC().Format("20060102") { + return &s3AuthError{ + Status: http.StatusForbidden, + Code: "AuthorizationHeaderMalformed", + Message: "credential scope date does not match x-amz-date", + } + } + + expectedAuth, err := buildS3AuthorizationHeader(r, parsed.AccessKeyID, secretAccessKey, s.effectiveRegion(), signingTime, payloadHash) + if err != nil { + return &s3AuthError{ + Status: http.StatusForbidden, + Code: "AccessDenied", + Message: "failed to verify request signature", + } + } + if subtle.ConstantTimeCompare([]byte(authHeader), []byte(expectedAuth)) != 1 { + return &s3AuthError{ + Status: http.StatusForbidden, + Code: "SignatureDoesNotMatch", + Message: "request signature does not match", + } + } + return nil +} + +func buildS3AuthorizationHeader(r *http.Request, accessKeyID string, secretAccessKey string, region string, signingTime time.Time, payloadHash string) (string, error) { + if r == nil { + return "", errors.New("request is required") + } + clone := r.Clone(context.Background()) + clone.Host = r.Host + clone.Header = clone.Header.Clone() + clone.Header.Del("Authorization") + + signer := v4.NewSigner(func(opts *v4.SignerOptions) { + opts.DisableURIPathEscaping = true + }) + creds := aws.Credentials{ + AccessKeyID: accessKeyID, + SecretAccessKey: secretAccessKey, + Source: "elastickv-s3", + } + if err := signer.SignHTTP(context.Background(), creds, clone, payloadHash, "s3", region, signingTime.UTC()); err != nil { + return "", errors.WithStack(err) + } + return strings.TrimSpace(clone.Header.Get("Authorization")), nil +} + +//nolint:cyclop // AWS authorization parsing is branchy because malformed scopes must be rejected precisely. 
+func parseS3AuthorizationHeader(raw string) (s3AuthorizationHeader, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return s3AuthorizationHeader{}, errors.New("authorization header is required") + } + algorithm, rest, ok := strings.Cut(raw, " ") + if !ok { + return s3AuthorizationHeader{}, errors.New("authorization header is malformed") + } + out := s3AuthorizationHeader{Algorithm: strings.TrimSpace(algorithm)} + params := strings.Split(rest, ",") + for _, param := range params { + key, value, ok := strings.Cut(strings.TrimSpace(param), "=") + if !ok { + continue + } + if key != "Credential" { + continue + } + scope := strings.Split(value, "/") + if len(scope) != 5 || scope[4] != "aws4_request" { + return s3AuthorizationHeader{}, errors.New("credential scope is malformed") + } + out.AccessKeyID = scope[0] + out.Date = scope[1] + out.Region = scope[2] + out.Service = scope[3] + break + } + if out.AccessKeyID == "" || out.Date == "" || out.Region == "" || out.Service == "" { + return s3AuthorizationHeader{}, errors.New("credential scope is required") + } + return out, nil +} + +func normalizeS3PayloadHash(raw string) string { + return strings.TrimSpace(raw) +} diff --git a/adapter/s3_test.go b/adapter/s3_test.go new file mode 100644 index 00000000..e36f062c --- /dev/null +++ b/adapter/s3_test.go @@ -0,0 +1,564 @@ +package adapter + +import ( + "bytes" + "context" + "crypto/md5" //nolint:gosec // S3 ETag compatibility requires MD5. 
+ "crypto/sha256" + "encoding/hex" + "encoding/xml" + "io" + "net/http" + "net/http/httptest" + "strings" + "sync" + "testing" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + v4 "github.com/aws/aws-sdk-go-v2/aws/signer/v4" + "github.com/bootjp/elastickv/distribution" + "github.com/bootjp/elastickv/internal/s3keys" + "github.com/bootjp/elastickv/kv" + "github.com/bootjp/elastickv/store" + "github.com/hashicorp/raft" + "github.com/stretchr/testify/require" +) + +const ( + testS3AccessKey = "test-access" + testS3SecretKey = "test-secret" + testS3Region = "us-east-1" +) + +func TestS3Server_BucketAndObjectLifecycle(t *testing.T) { + t.Parallel() + + st := store.NewMVCCStore() + server := NewS3Server(nil, "", st, newLocalAdapterCoordinator(st), nil) + + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPut, "/bucket-a", nil) + server.handle(rec, req) + require.Equal(t, http.StatusOK, rec.Code) + + payload := "hello world" + rec = httptest.NewRecorder() + req = httptest.NewRequest(http.MethodPut, "/bucket-a/dir/file.txt", strings.NewReader(payload)) + req.Header.Set("Content-Type", "text/plain") + server.handle(rec, req) + require.Equal(t, http.StatusOK, rec.Code) + require.Equal(t, `"`+md5Hex(payload)+`"`, rec.Header().Get("ETag")) + + rec = httptest.NewRecorder() + req = httptest.NewRequest(http.MethodHead, "/bucket-a", nil) + server.handle(rec, req) + require.Equal(t, http.StatusOK, rec.Code) + + rec = httptest.NewRecorder() + req = httptest.NewRequest(http.MethodHead, "/bucket-a/dir/file.txt", nil) + server.handle(rec, req) + require.Equal(t, http.StatusOK, rec.Code) + require.Equal(t, "11", rec.Header().Get("Content-Length")) + require.Equal(t, `"`+md5Hex(payload)+`"`, rec.Header().Get("ETag")) + + rec = httptest.NewRecorder() + req = httptest.NewRequest(http.MethodGet, "/bucket-a/dir/file.txt", nil) + server.handle(rec, req) + require.Equal(t, http.StatusOK, rec.Code) + require.Equal(t, payload, rec.Body.String()) + + rec = 
httptest.NewRecorder() + req = httptest.NewRequest(http.MethodGet, "/bucket-a?list-type=2", nil) + server.handle(rec, req) + require.Equal(t, http.StatusOK, rec.Code) + require.Contains(t, rec.Body.String(), "dir/file.txt") + + rec = httptest.NewRecorder() + req = httptest.NewRequest(http.MethodDelete, "/bucket-a/dir/file.txt", nil) + server.handle(rec, req) + require.Equal(t, http.StatusNoContent, rec.Code) + + rec = httptest.NewRecorder() + req = httptest.NewRequest(http.MethodDelete, "/bucket-a", nil) + server.handle(rec, req) + require.Equal(t, http.StatusNoContent, rec.Code) +} + +func TestS3Server_ProxiesFollowerRequests(t *testing.T) { + t.Parallel() + + var proxied bool + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + proxied = true + _, _ = io.WriteString(w, "proxied") + })) + defer upstream.Close() + + targetHost := strings.TrimPrefix(upstream.URL, "http://") + server := NewS3Server(nil, "", store.NewMVCCStore(), &followerS3Coordinator{}, map[raft.ServerAddress]string{ + raft.ServerAddress("leader"): targetHost, + }) + + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/", nil) + server.handle(rec, req) + + require.True(t, proxied) + require.Equal(t, http.StatusOK, rec.Code) + require.Equal(t, "proxied", rec.Body.String()) +} + +func TestS3Server_RejectsUnsignedRequestWhenCredentialsConfigured(t *testing.T) { + t.Parallel() + + st := store.NewMVCCStore() + server := NewS3Server( + nil, + "", + st, + newLocalAdapterCoordinator(st), + nil, + WithS3StaticCredentials(map[string]string{testS3AccessKey: testS3SecretKey}), + ) + + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/", nil) + server.handle(rec, req) + + require.Equal(t, http.StatusForbidden, rec.Code) + require.Contains(t, rec.Body.String(), "AccessDenied") +} + +func TestS3Server_AcceptsSigV4SignedRequests(t *testing.T) { + t.Parallel() + + st := store.NewMVCCStore() + server := NewS3Server( + 
nil, + "", + st, + newLocalAdapterCoordinator(st), + nil, + WithS3Region(testS3Region), + WithS3StaticCredentials(map[string]string{testS3AccessKey: testS3SecretKey}), + ) + signingTime := time.Date(2026, 3, 22, 12, 0, 0, 0, time.UTC) + + rec := httptest.NewRecorder() + req := newSignedS3Request(t, "/bucket-a", "", signingTime) + server.handle(rec, req) + require.Equal(t, http.StatusOK, rec.Code) + + rec = httptest.NewRecorder() + req = newSignedS3Request(t, "/bucket-a/dir/file.txt", "hello world", signingTime) + server.handle(rec, req) + require.Equal(t, http.StatusOK, rec.Code) + require.Equal(t, `"`+md5Hex("hello world")+`"`, rec.Header().Get("ETag")) +} + +func TestS3Server_RejectsPayloadHashMismatch(t *testing.T) { + t.Parallel() + + st := store.NewMVCCStore() + server := NewS3Server( + nil, + "", + st, + newLocalAdapterCoordinator(st), + nil, + WithS3Region(testS3Region), + WithS3StaticCredentials(map[string]string{testS3AccessKey: testS3SecretKey}), + ) + signingTime := time.Date(2026, 3, 22, 12, 0, 0, 0, time.UTC) + + rec := httptest.NewRecorder() + req := newSignedS3Request(t, "/bucket-a", "", signingTime) + server.handle(rec, req) + require.Equal(t, http.StatusOK, rec.Code) + + rec = httptest.NewRecorder() + req = newSignedS3Request(t, "/bucket-a/object.txt", "hello", signingTime) + req.Body = io.NopCloser(strings.NewReader("HELLO")) + req.ContentLength = int64(len("HELLO")) + server.handle(rec, req) + + require.Equal(t, http.StatusBadRequest, rec.Code) + require.Contains(t, rec.Body.String(), "XAmzContentSHA256Mismatch") +} + +func TestS3Server_ProxiesFollowerRequestsBeforeAuth(t *testing.T) { + t.Parallel() + + var proxied bool + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + proxied = true + _, _ = io.WriteString(w, "proxied") + })) + defer upstream.Close() + + targetHost := strings.TrimPrefix(upstream.URL, "http://") + server := NewS3Server( + nil, + "", + store.NewMVCCStore(), + 
&followerS3Coordinator{}, + map[raft.ServerAddress]string{ + raft.ServerAddress("leader"): targetHost, + }, + WithS3StaticCredentials(map[string]string{testS3AccessKey: testS3SecretKey}), + ) + + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/", nil) + server.handle(rec, req) + + require.True(t, proxied) + require.Equal(t, http.StatusOK, rec.Code) + require.Equal(t, "proxied", rec.Body.String()) +} + +func TestS3Server_ProxiesObjectRequestsUsingObjectRouteLeader(t *testing.T) { + t.Parallel() + + ctx := context.Background() + st := store.NewMVCCStore() + metaBody, err := encodeS3BucketMeta(&s3BucketMeta{ + BucketName: "bucket-a", + Generation: 7, + CreatedAtHLC: 1, + Region: "us-east-1", + }) + require.NoError(t, err) + require.NoError(t, st.PutAt(ctx, s3keys.BucketMetaKey("bucket-a"), metaBody, 1, 0)) + + var proxied bool + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + proxied = true + _, _ = io.WriteString(w, "proxied-object") + })) + defer upstream.Close() + + coord := &routeAwareS3Coordinator{ + localForKey: func(key []byte) bool { + return !bytes.HasPrefix(key, []byte(s3keys.RoutePrefix)) + }, + leaderForKey: func(key []byte) raft.ServerAddress { + if bytes.HasPrefix(key, []byte(s3keys.RoutePrefix)) { + return raft.ServerAddress("object-leader") + } + return "" + }, + } + server := NewS3Server(nil, "", st, coord, map[raft.ServerAddress]string{ + raft.ServerAddress("object-leader"): strings.TrimPrefix(upstream.URL, "http://"), + }) + + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/bucket-a/dir/file.txt", nil) + server.handle(rec, req) + + require.True(t, proxied) + require.Equal(t, http.StatusOK, rec.Code) + require.Equal(t, "proxied-object", rec.Body.String()) +} + +func TestS3Server_ListObjectsV2DelimiterContinuationAvoidsDuplicateCommonPrefixes(t *testing.T) { + t.Parallel() + + st := store.NewMVCCStore() + server := NewS3Server(nil, "", st, 
newLocalAdapterCoordinator(st), nil) + + rec := httptest.NewRecorder() + server.handle(rec, httptest.NewRequest(http.MethodPut, "/bucket-a", nil)) + require.Equal(t, http.StatusOK, rec.Code) + + for _, objectKey := range []string{"dir-a/one.txt", "dir-a/two.txt", "dir-b/three.txt"} { + rec = httptest.NewRecorder() + server.handle(rec, httptest.NewRequest(http.MethodPut, "/bucket-a/"+objectKey, strings.NewReader(objectKey))) + require.Equal(t, http.StatusOK, rec.Code) + } + + rec = httptest.NewRecorder() + server.handle(rec, httptest.NewRequest(http.MethodGet, "/bucket-a?list-type=2&delimiter=%2F&max-keys=1", nil)) + require.Equal(t, http.StatusOK, rec.Code) + + firstPage := decodeListBucketResult(t, rec.Body.Bytes()) + require.Len(t, firstPage.CommonPrefixes, 1) + require.Equal(t, "dir-a/", firstPage.CommonPrefixes[0].Prefix) + require.NotEmpty(t, firstPage.NextContinuationToken) + + rec = httptest.NewRecorder() + server.handle(rec, httptest.NewRequest(http.MethodGet, "/bucket-a?list-type=2&delimiter=%2F&max-keys=1&continuation-token="+firstPage.NextContinuationToken, nil)) + require.Equal(t, http.StatusOK, rec.Code) + + secondPage := decodeListBucketResult(t, rec.Body.Bytes()) + require.Len(t, secondPage.CommonPrefixes, 1) + require.Equal(t, "dir-b/", secondPage.CommonPrefixes[0].Prefix) +} + +func TestS3Server_PutObjectConflictsWhenBucketDeletedDuringFinalize(t *testing.T) { + t.Parallel() + + st := store.NewMVCCStore() + coord := &bucketDeleteRaceCoordinator{ + localAdapterCoordinator: newLocalAdapterCoordinator(st), + bucket: "bucket-a", + } + server := NewS3Server(nil, "", st, coord, nil) + + rec := httptest.NewRecorder() + server.handle(rec, httptest.NewRequest(http.MethodPut, "/bucket-a", nil)) + require.Equal(t, http.StatusOK, rec.Code) + + rec = httptest.NewRecorder() + server.handle(rec, httptest.NewRequest(http.MethodPut, "/bucket-a/object.txt", strings.NewReader("payload"))) + require.Equal(t, http.StatusConflict, rec.Code) + require.Contains(t, 
rec.Body.String(), "OperationAborted") + + readTS := snapshotTS(coord.Clock(), st) + _, exists, err := server.loadBucketMetaAt(context.Background(), "bucket-a", readTS) + require.NoError(t, err) + require.False(t, exists) + + _, found, err := server.loadObjectManifestAt(context.Background(), s3keys.ObjectManifestKey("bucket-a", 1, "object.txt"), readTS) + require.NoError(t, err) + require.False(t, found) + + kvs, err := st.ScanAt(context.Background(), []byte(s3keys.BlobPrefix), prefixScanEnd([]byte(s3keys.BlobPrefix)), 10, readTS) + require.NoError(t, err) + require.Empty(t, kvs) +} + +func TestS3Server_ShardedStoreRoutesBucketAndObjectData(t *testing.T) { + t.Parallel() + + ctx := context.Background() + engine := distribution.NewEngine() + engine.UpdateRoute([]byte(s3keys.RoutePrefix), []byte("!s3|"), 2) + engine.UpdateRoute([]byte("!s3|"), nil, 1) + + store1 := store.NewMVCCStore() + raft1, stop1 := newSingleRaftForS3Test(t, "g1", kv.NewKvFSM(store1)) + defer stop1() + + store2 := store.NewMVCCStore() + raft2, stop2 := newSingleRaftForS3Test(t, "g2", kv.NewKvFSM(store2)) + defer stop2() + + groups := map[uint64]*kv.ShardGroup{ + 1: {Raft: raft1, Store: store1, Txn: kv.NewLeaderProxy(raft1)}, + 2: {Raft: raft2, Store: store2, Txn: kv.NewLeaderProxy(raft2)}, + } + shardStore := kv.NewShardStore(engine, groups) + coord := kv.NewShardedCoordinator(engine, groups, 1, kv.NewHLC(), shardStore) + server := NewS3Server(nil, "", shardStore, coord, nil) + + rec := httptest.NewRecorder() + server.handle(rec, httptest.NewRequest(http.MethodPut, "/bucket-a", nil)) + require.Equal(t, http.StatusOK, rec.Code, rec.Body.String()) + + rec = httptest.NewRecorder() + server.handle(rec, httptest.NewRequest(http.MethodPut, "/bucket-a/dir/file.txt", strings.NewReader("payload"))) + require.Equal(t, http.StatusOK, rec.Code, rec.Body.String()) + + rec = httptest.NewRecorder() + server.handle(rec, httptest.NewRequest(http.MethodGet, "/bucket-a/dir/file.txt", nil)) + require.Equal(t, 
http.StatusOK, rec.Code, rec.Body.String()) + require.Equal(t, "payload", rec.Body.String()) + + rec = httptest.NewRecorder() + server.handle(rec, httptest.NewRequest(http.MethodGet, "/bucket-a?list-type=2", nil)) + require.Equal(t, http.StatusOK, rec.Code, rec.Body.String()) + require.Contains(t, rec.Body.String(), "dir/file.txt") + + readTS := shardStore.LastCommitTS() + var err error + _, err = store1.GetAt(ctx, s3keys.BucketMetaKey("bucket-a"), readTS) + require.NoError(t, err) + + _, err = store1.GetAt(ctx, s3keys.ObjectManifestKey("bucket-a", 1, "dir/file.txt"), readTS) + require.ErrorIs(t, err, store.ErrKeyNotFound) + + _, err = store2.GetAt(ctx, s3keys.ObjectManifestKey("bucket-a", 1, "dir/file.txt"), readTS) + require.NoError(t, err) +} + +type followerS3Coordinator struct { + stubAdapterCoordinator +} + +func (c *followerS3Coordinator) Dispatch(context.Context, *kv.OperationGroup[kv.OP]) (*kv.CoordinateResponse, error) { + return &kv.CoordinateResponse{}, nil +} + +func (c *followerS3Coordinator) IsLeader() bool { + return false +} + +func (c *followerS3Coordinator) VerifyLeader() error { + return kv.ErrLeaderNotFound +} + +func (c *followerS3Coordinator) RaftLeader() raft.ServerAddress { + return raft.ServerAddress("leader") +} + +type routeAwareS3Coordinator struct { + stubAdapterCoordinator + localForKey func([]byte) bool + leaderForKey func([]byte) raft.ServerAddress +} + +func (c *routeAwareS3Coordinator) IsLeaderForKey(key []byte) bool { + if c.localForKey == nil { + return true + } + return c.localForKey(key) +} + +func (c *routeAwareS3Coordinator) VerifyLeaderForKey(key []byte) error { + if c.IsLeaderForKey(key) { + return nil + } + return kv.ErrLeaderNotFound +} + +func (c *routeAwareS3Coordinator) RaftLeaderForKey(key []byte) raft.ServerAddress { + if c.leaderForKey == nil { + return "" + } + return c.leaderForKey(key) +} + +type bucketDeleteRaceCoordinator struct { + *localAdapterCoordinator + bucket string + once sync.Once +} + 
+//nolint:cyclop // This test coordinator intentionally injects a specific race and conflict path. +func (c *bucketDeleteRaceCoordinator) Dispatch(ctx context.Context, req *kv.OperationGroup[kv.OP]) (*kv.CoordinateResponse, error) { + if req != nil && req.IsTxn && containsObjectManifestMutation(req.Elems) { + c.once.Do(func() { + deleteTS := req.CommitTS + 1 + if deleteTS == 0 { + deleteTS = req.StartTS + 1 + } + _ = c.store.DeleteAt(ctx, s3keys.BucketMetaKey(c.bucket), deleteTS) + }) + } + if req != nil && req.IsTxn { + for _, elem := range req.Elems { + if elem == nil { + continue + } + latestTS, exists, err := c.store.LatestCommitTS(ctx, elem.Key) + if err != nil { + return nil, err + } + if exists && latestTS > req.StartTS { + return nil, store.NewWriteConflictError(elem.Key) + } + } + } + return c.localAdapterCoordinator.Dispatch(ctx, req) +} + +func containsObjectManifestMutation(elems []*kv.Elem[kv.OP]) bool { + for _, elem := range elems { + if elem == nil { + continue + } + if _, _, _, ok := s3keys.ParseObjectManifestKey(elem.Key); ok { + return true + } + } + return false +} + +func decodeListBucketResult(t *testing.T, body []byte) s3ListBucketResult { + t.Helper() + + out := s3ListBucketResult{} + require.NoError(t, xml.Unmarshal(body, &out)) + return out +} + +func newSingleRaftForS3Test(t *testing.T, id string, fsm raft.FSM) (*raft.Raft, func()) { + t.Helper() + + addr, trans := raft.NewInmemTransport(raft.ServerAddress(id)) + cfg := raft.DefaultConfig() + cfg.LocalID = raft.ServerID(id) + cfg.HeartbeatTimeout = 50 * time.Millisecond + cfg.ElectionTimeout = 100 * time.Millisecond + cfg.LeaderLeaseTimeout = 50 * time.Millisecond + + ldb := raft.NewInmemStore() + sdb := raft.NewInmemStore() + fss := raft.NewInmemSnapshotStore() + r, err := raft.NewRaft(cfg, fsm, ldb, sdb, fss, trans) + require.NoError(t, err) + require.NoError(t, r.BootstrapCluster(raft.Configuration{ + Servers: []raft.Server{{ + Suffrage: raft.Voter, + ID: raft.ServerID(id), + Address: 
addr, + }}, + }).Error()) + + for range 100 { + if r.State() == raft.Leader { + break + } + time.Sleep(10 * time.Millisecond) + } + require.Equal(t, raft.Leader, r.State()) + return r, func() { _ = r.Shutdown().Error() } +} + +func md5Hex(v string) string { + sum := md5.Sum([]byte(v)) //nolint:gosec // S3 ETag compatibility requires MD5. + return hex.EncodeToString(sum[:]) +} + +func sha256Hex(v string) string { + sum := sha256.Sum256([]byte(v)) + return hex.EncodeToString(sum[:]) +} + +func newSignedS3Request( + t *testing.T, + target string, + body string, + signingTime time.Time, +) *http.Request { + t.Helper() + + req := httptest.NewRequest(http.MethodPut, target, strings.NewReader(body)) + payloadHash := sha256Hex(body) + req.Header.Set("X-Amz-Content-Sha256", payloadHash) + + signer := v4.NewSigner(func(opts *v4.SignerOptions) { + opts.DisableURIPathEscaping = true + }) + err := signer.SignHTTP( + context.Background(), + aws.Credentials{ + AccessKeyID: testS3AccessKey, + SecretAccessKey: testS3SecretKey, + Source: "test", + }, + req, + payloadHash, + "s3", + testS3Region, + signingTime, + ) + require.NoError(t, err) + expectedAuth, err := buildS3AuthorizationHeader(req, testS3AccessKey, testS3SecretKey, testS3Region, signingTime, payloadHash) + require.NoError(t, err) + require.Equal(t, strings.TrimSpace(req.Header.Get("Authorization")), expectedAuth) + return req +} diff --git a/cmd/server/demo.go b/cmd/server/demo.go index 0f622a7c..54bcfd06 100644 --- a/cmd/server/demo.go +++ b/cmd/server/demo.go @@ -2,6 +2,7 @@ package main import ( "context" + "encoding/json" "flag" "fmt" "io" @@ -36,6 +37,10 @@ var ( address = flag.String("address", ":50051", "gRPC/Raft address") redisAddress = flag.String("redisAddress", ":6379", "Redis address") dynamoAddress = flag.String("dynamoAddress", ":8000", "DynamoDB-compatible API address") + s3Address = flag.String("s3Address", ":9000", "S3-compatible API address") + s3Region = flag.String("s3Region", "us-east-1", "S3 
signing region") + s3CredsFile = flag.String("s3CredentialsFile", "", "Path to a JSON file containing static S3 credentials") + s3PathStyle = flag.Bool("s3PathStyleOnly", true, "Only accept path-style S3 requests") metricsAddress = flag.String("metricsAddress", "127.0.0.1:9090", "Prometheus metrics address") metricsToken = flag.String("metricsToken", "", "Bearer token for Prometheus metrics; required for non-loopback metricsAddress") pprofAddress = flag.String("pprofAddress", "localhost:6060", "TCP host+port for pprof debug endpoints; empty to disable") @@ -44,6 +49,7 @@ var ( raftDataDir = flag.String("raftDataDir", "/var/lib/elastickv", "Raft data directory") raftBootstrap = flag.Bool("raftBootstrap", false, "Bootstrap cluster") raftRedisMap = flag.String("raftRedisMap", "", "Map of Raft address to Redis address (raftAddr=redisAddr,...)") + raftS3Map = flag.String("raftS3Map", "", "Map of Raft address to S3 address (raftAddr=s3Addr,...)") ) const ( @@ -67,6 +73,10 @@ type config struct { address string redisAddress string dynamoAddress string + s3Address string + s3Region string + s3CredsFile string + s3PathStyle bool metricsAddress string metricsToken string pprofAddress string @@ -75,6 +85,7 @@ type config struct { raftDataDir string raftBootstrap bool raftRedisMap string + raftS3Map string } func main() { @@ -88,6 +99,10 @@ func main() { address: *address, redisAddress: *redisAddress, dynamoAddress: *dynamoAddress, + s3Address: *s3Address, + s3Region: *s3Region, + s3CredsFile: *s3CredsFile, + s3PathStyle: *s3PathStyle, metricsAddress: *metricsAddress, metricsToken: *metricsToken, pprofAddress: *pprofAddress, @@ -96,6 +111,7 @@ func main() { raftDataDir: *raftDataDir, raftBootstrap: *raftBootstrap, raftRedisMap: *raftRedisMap, + raftS3Map: *raftS3Map, } if err := run(runCtx, eg, cfg); err != nil { slog.Error(err.Error()) @@ -110,6 +126,9 @@ func main() { address: "127.0.0.1:50051", redisAddress: "127.0.0.1:63791", dynamoAddress: "127.0.0.1:63801", + s3Address: 
"127.0.0.1:63901", + s3Region: "us-east-1", + s3PathStyle: true, metricsAddress: "0.0.0.0:9091", metricsToken: demoMetricsToken, pprofAddress: "127.0.0.1:6061", @@ -121,6 +140,9 @@ func main() { address: "127.0.0.1:50052", redisAddress: "127.0.0.1:63792", dynamoAddress: "127.0.0.1:63802", + s3Address: "127.0.0.1:63902", + s3Region: "us-east-1", + s3PathStyle: true, metricsAddress: "0.0.0.0:9092", metricsToken: demoMetricsToken, pprofAddress: "127.0.0.1:6062", @@ -132,6 +154,9 @@ func main() { address: "127.0.0.1:50053", redisAddress: "127.0.0.1:63793", dynamoAddress: "127.0.0.1:63803", + s3Address: "127.0.0.1:63903", + s3Region: "us-east-1", + s3PathStyle: true, metricsAddress: "0.0.0.0:9093", metricsToken: demoMetricsToken, pprofAddress: "127.0.0.1:6063", @@ -141,15 +166,19 @@ func main() { }, } - // Build raftRedisMap string - var mapParts []string + // Build raftRedisMap/raftS3Map strings. + var redisMapParts []string + var s3MapParts []string for _, n := range nodes { - mapParts = append(mapParts, n.address+"="+n.redisAddress) + redisMapParts = append(redisMapParts, n.address+"="+n.redisAddress) + s3MapParts = append(s3MapParts, n.address+"="+n.s3Address) } - raftRedisMapStr := strings.Join(mapParts, ",") + raftRedisMapStr := strings.Join(redisMapParts, ",") + raftS3MapStr := strings.Join(s3MapParts, ",") for _, n := range nodes { n.raftRedisMap = raftRedisMapStr + n.raftS3Map = raftS3MapStr cfg := n // capture loop variable if err := run(runCtx, eg, cfg); err != nil { slog.Error(err.Error()) @@ -366,6 +395,56 @@ func setupRedis(ctx context.Context, lc net.ListenConfig, st store.MVCCStore, co return adapter.NewRedisServer(l, redisAddr, routedStore, coordinator, leaderRedis, relay, adapter.WithRedisActiveTimestampTracker(readTracker)), nil } +func setupS3( + ctx context.Context, + lc net.ListenConfig, + st store.MVCCStore, + coordinator *kv.Coordinate, + addr string, + s3Addr string, + raftS3MapStr string, + region string, + credentialsFile string, + 
pathStyleOnly bool, + readTracker *kv.ActiveTimestampTracker, +) (*adapter.S3Server, error) { + if !pathStyleOnly { + return nil, errors.New("virtual-hosted style S3 requests are not implemented") + } + leaderS3 := make(map[raft.ServerAddress]string) + if raftS3MapStr != "" { + parts := strings.SplitSeq(raftS3MapStr, ",") + for part := range parts { + kv := strings.Split(part, "=") + if len(kv) == kvParts { + leaderS3[raft.ServerAddress(kv[0])] = kv[1] + } + } + } + leaderS3[raft.ServerAddress(addr)] = s3Addr + + l, err := lc.Listen(ctx, "tcp", s3Addr) + if err != nil { + return nil, errors.WithStack(err) + } + staticCreds, err := loadS3StaticCredentials(credentialsFile) + if err != nil { + _ = l.Close() + return nil, err + } + routedStore := kv.NewLeaderRoutedStore(st, coordinator) + return adapter.NewS3Server( + l, + s3Addr, + routedStore, + coordinator, + leaderS3, + adapter.WithS3Region(region), + adapter.WithS3StaticCredentials(staticCreds), + adapter.WithS3ActiveTimestampTracker(readTracker), + ), nil +} + func run(ctx context.Context, eg *errgroup.Group, cfg config) error { var lc net.ListenConfig cleanup := internalutil.CleanupStack{} @@ -441,6 +520,11 @@ func run(ctx context.Context, eg *errgroup.Group, cfg config) error { return err } cleanup.Add(rd.Stop) + s3s, err := setupS3(ctx, lc, st, coordinator, cfg.address, cfg.s3Address, cfg.raftS3Map, cfg.s3Region, cfg.s3CredsFile, cfg.s3PathStyle, readTracker) + if err != nil { + return err + } + cleanup.Add(s3s.Stop) dynamoL, err := lc.Listen(ctx, "tcp", cfg.dynamoAddress) if err != nil { return errors.WithStack(err) @@ -463,6 +547,8 @@ func run(ctx context.Context, eg *errgroup.Group, cfg config) error { eg.Go(grpcServeTask(s, grpcSock, cfg.address)) eg.Go(redisShutdownTask(ctx, rd, cfg.redisAddress)) eg.Go(redisServeTask(rd, cfg.redisAddress)) + eg.Go(s3ShutdownTask(ctx, s3s, cfg.s3Address)) + eg.Go(s3ServeTask(s3s, cfg.s3Address)) eg.Go(dynamoShutdownTask(ctx, ds, cfg.dynamoAddress)) 
eg.Go(dynamoServeTask(ds, cfg.dynamoAddress)) eg.Go(monitoring.MetricsShutdownTask(ctx, ms, cfg.metricsAddress)) @@ -625,3 +711,57 @@ func dynamoServeTask(dynamoServer *adapter.DynamoDBServer, address string) func( return errors.WithStack(err) } } + +func s3ShutdownTask(ctx context.Context, s3Server *adapter.S3Server, address string) func() error { + return func() error { + <-ctx.Done() + slog.Info("Shutting down S3 server", "address", address, "reason", ctx.Err()) + s3Server.Stop() + return nil + } +} + +func s3ServeTask(s3Server *adapter.S3Server, address string) func() error { + return func() error { + slog.Info("Starting S3 server", "address", address) + err := s3Server.Run() + if err == nil || errors.Is(err, net.ErrClosed) { + return nil + } + return errors.WithStack(err) + } +} + +type s3CredentialFile struct { + Credentials []s3CredentialEntry `json:"credentials"` +} + +type s3CredentialEntry struct { + AccessKeyID string `json:"access_key_id"` + SecretAccessKey string `json:"secret_access_key"` +} + +func loadS3StaticCredentials(path string) (map[string]string, error) { + path = strings.TrimSpace(path) + if path == "" { + return nil, nil + } + body, err := os.ReadFile(path) + if err != nil { + return nil, errors.WithStack(err) + } + file := s3CredentialFile{} + if err := json.Unmarshal(body, &file); err != nil { + return nil, errors.WithStack(err) + } + out := make(map[string]string, len(file.Credentials)) + for _, cred := range file.Credentials { + accessKeyID := strings.TrimSpace(cred.AccessKeyID) + secretAccessKey := strings.TrimSpace(cred.SecretAccessKey) + if accessKeyID == "" || secretAccessKey == "" { + return nil, errors.New("s3 credentials file contains an empty access key or secret key") + } + out[accessKeyID] = secretAccessKey + } + return out, nil +} diff --git a/cmd/server/demo_test.go b/cmd/server/demo_test.go index 9d2ca551..bb31b34d 100644 --- a/cmd/server/demo_test.go +++ b/cmd/server/demo_test.go @@ -4,8 +4,11 @@ import ( "context" "net" 
"net/http" + "os" + "path/filepath" "testing" + "github.com/bootjp/elastickv/store" "github.com/stretchr/testify/require" ) @@ -41,3 +44,34 @@ func TestSetupPprofHTTPServerRejectsInvalidAddressBeforeTokenCheck(t *testing.T) require.Nil(t, listener) require.Nil(t, server) } + +func TestLoadS3StaticCredentials(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "s3-creds.json") + require.NoError(t, os.WriteFile(path, []byte(`{"credentials":[{"access_key_id":"akid","secret_access_key":"secret"}]}`), 0o600)) + + creds, err := loadS3StaticCredentials(path) + require.NoError(t, err) + require.Equal(t, map[string]string{"akid": "secret"}, creds) +} + +func TestLoadS3StaticCredentialsAllowsEmptyPath(t *testing.T) { + creds, err := loadS3StaticCredentials("") + require.NoError(t, err) + require.Nil(t, creds) +} + +func TestLoadS3StaticCredentialsRejectsEmptyFields(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "s3-creds.json") + require.NoError(t, os.WriteFile(path, []byte(`{"credentials":[{"access_key_id":"akid","secret_access_key":""}]}`), 0o600)) + + _, err := loadS3StaticCredentials(path) + require.ErrorContains(t, err, "empty access key or secret key") +} + +func TestSetupS3RejectsVirtualHostedStyle(t *testing.T) { + s3s, err := setupS3(context.Background(), net.ListenConfig{}, store.NewMVCCStore(), nil, "127.0.0.1:50051", "127.0.0.1:9000", "", "us-east-1", "", false, nil) + require.ErrorContains(t, err, "virtual-hosted style S3 requests are not implemented") + require.Nil(t, s3s) +} diff --git a/internal/s3keys/keys.go b/internal/s3keys/keys.go new file mode 100644 index 00000000..513a0625 --- /dev/null +++ b/internal/s3keys/keys.go @@ -0,0 +1,307 @@ +package s3keys + +import ( + "bytes" + "encoding/binary" +) + +const ( + BucketMetaPrefix = "!s3|bucket|meta|" + BucketGenerationPrefix = "!s3|bucket|gen|" + ObjectManifestPrefix = "!s3|obj|head|" + UploadMetaPrefix = "!s3|upload|meta|" + UploadPartPrefix = "!s3|upload|part|" + BlobPrefix 
// Key-space prefixes and encoding constants for the S3 adapter's KV layout.
// Segments are escaped so that 0x00 never appears unescaped inside a segment
// (0x00 -> 0x00 0xFF) and every segment ends with the terminator 0x00 0x01,
// which keeps keys order-preserving and unambiguous to parse.
const (
	BucketMetaPrefix       = "!s3|bucket|meta|"
	BucketGenerationPrefix = "!s3|bucket|gen|"
	ObjectManifestPrefix   = "!s3|obj|head|"
	UploadMetaPrefix       = "!s3|upload|meta|"
	UploadPartPrefix       = "!s3|upload|part|"
	BlobPrefix             = "!s3|blob|"
	GCUploadPrefix         = "!s3|gc|upload|"
	RoutePrefix            = "!s3route|"

	escapeByte      = byte(0x00)
	terminatorByte  = byte(0x01)
	escapedZeroByte = byte(0xFF)
	u64Bytes        = 8

	segmentEscapeOverhead   = 2
	routeKeyExtraBytes      = 6
	buildObjectExtraBytes   = 8
	segmentTerminatorBytes  = 2
)

// Byte views of the prefixes, precomputed to avoid per-call conversions.
var (
	bucketMetaPrefixBytes       = []byte(BucketMetaPrefix)
	bucketGenerationPrefixBytes = []byte(BucketGenerationPrefix)
	objectManifestPrefixBytes   = []byte(ObjectManifestPrefix)
	uploadMetaPrefixBytes       = []byte(UploadMetaPrefix)
	uploadPartPrefixBytes       = []byte(UploadPartPrefix)
	blobPrefixBytes             = []byte(BlobPrefix)
	gcUploadPrefixBytes         = []byte(GCUploadPrefix)
	routePrefixBytes            = []byte(RoutePrefix)
)

// EncodeSegment escapes raw and appends the segment terminator (0x00 0x01).
func EncodeSegment(raw []byte) []byte {
	return append(EncodeSegmentPrefix(raw), escapeByte, terminatorByte)
}

// EncodeSegmentPrefix escapes every 0x00 byte in raw as 0x00 0xFF without a
// terminator, so the result can serve as a prefix for range scans.
func EncodeSegmentPrefix(raw []byte) []byte {
	out := make([]byte, 0, len(raw)+segmentEscapeOverhead)
	for _, c := range raw {
		if c == escapeByte {
			out = append(out, escapeByte, escapedZeroByte)
		} else {
			out = append(out, c)
		}
	}
	return out
}

// BucketMetaKey builds the metadata key for a bucket.
func BucketMetaKey(bucket string) []byte {
	out := make([]byte, 0, len(BucketMetaPrefix)+len(bucket)+segmentEscapeOverhead)
	out = append(out, bucketMetaPrefixBytes...)
	return append(out, EncodeSegment([]byte(bucket))...)
}

// BucketGenerationKey builds the generation-counter key for a bucket.
func BucketGenerationKey(bucket string) []byte {
	out := make([]byte, 0, len(BucketGenerationPrefix)+len(bucket)+segmentEscapeOverhead)
	out = append(out, bucketGenerationPrefixBytes...)
	return append(out, EncodeSegment([]byte(bucket))...)
}

// ParseBucketMetaKey decodes the bucket name from a bucket-meta key; ok is
// false for keys with the wrong prefix, bad escaping, or trailing bytes.
func ParseBucketMetaKey(key []byte) (string, bool) {
	if !bytes.HasPrefix(key, bucketMetaPrefixBytes) {
		return "", false
	}
	seg, next, ok := decodeSegment(key, len(bucketMetaPrefixBytes))
	if !ok || next != len(key) {
		return "", false
	}
	return string(seg), true
}

// ObjectManifestKey builds the manifest (head) key for an object version.
func ObjectManifestKey(bucket string, generation uint64, object string) []byte {
	return buildObjectKey(objectManifestPrefixBytes, bucket, generation, object, "", 0, 0)
}

// UploadMetaKey builds the metadata key for a multipart upload.
func UploadMetaKey(bucket string, generation uint64, object string, uploadID string) []byte {
	return buildObjectKey(uploadMetaPrefixBytes, bucket, generation, object, uploadID, 0, 0)
}

// UploadPartKey builds the key for one part of a multipart upload.
func UploadPartKey(bucket string, generation uint64, object string, uploadID string, partNo uint64) []byte {
	return buildObjectKey(uploadPartPrefixBytes, bucket, generation, object, uploadID, partNo, 0)
}

// BlobKey builds the key for one chunk of object payload data.
func BlobKey(bucket string, generation uint64, object string, uploadID string, partNo uint64, chunkNo uint64) []byte {
	return buildObjectKey(blobPrefixBytes, bucket, generation, object, uploadID, partNo, chunkNo)
}

// GCUploadKey builds the garbage-collection tombstone key for an upload.
func GCUploadKey(bucket string, generation uint64, object string, uploadID string) []byte {
	return buildObjectKey(gcUploadPrefixBytes, bucket, generation, object, uploadID, 0, 0)
}

// RouteKey builds the shard-routing key shared by every object-scoped key of
// the same (bucket, generation, object) triple.
func RouteKey(bucket string, generation uint64, object string) []byte {
	out := make([]byte, 0, len(RoutePrefix)+len(bucket)+len(object)+2*u64Bytes+routeKeyExtraBytes)
	out = append(out, routePrefixBytes...)
	out = append(out, EncodeSegment([]byte(bucket))...)
	out = appendU64(out, generation)
	return append(out, EncodeSegment([]byte(object))...)
}

// ObjectManifestPrefixForBucket returns the manifest-key prefix covering all
// objects of a bucket generation.
func ObjectManifestPrefixForBucket(bucket string, generation uint64) []byte {
	out := make([]byte, 0, len(ObjectManifestPrefix)+len(bucket)+u64Bytes+segmentEscapeOverhead)
	out = append(out, objectManifestPrefixBytes...)
	out = append(out, EncodeSegment([]byte(bucket))...)
	return appendU64(out, generation)
}

// ObjectManifestScanStart returns the scan start key for manifests whose
// object name begins with objectPrefix (unterminated, for prefix scans).
func ObjectManifestScanStart(bucket string, generation uint64, objectPrefix string) []byte {
	out := ObjectManifestPrefixForBucket(bucket, generation)
	return append(out, EncodeSegmentPrefix([]byte(objectPrefix))...)
}

// ParseObjectManifestKey decodes bucket, generation, and object from a
// manifest key; ok is false on any structural mismatch or trailing bytes.
func ParseObjectManifestKey(key []byte) (bucket string, generation uint64, object string, ok bool) {
	if !bytes.HasPrefix(key, objectManifestPrefixBytes) {
		return "", 0, "", false
	}
	rawBucket, pos, ok := decodeSegment(key, len(objectManifestPrefixBytes))
	if !ok {
		return "", 0, "", false
	}
	generation, pos, ok = readU64(key, pos)
	if !ok {
		return "", 0, "", false
	}
	rawObject, pos, ok := decodeSegment(key, pos)
	if !ok || pos != len(key) {
		return "", 0, "", false
	}
	return string(rawBucket), generation, string(rawObject), true
}

// ExtractRouteKey maps any object-scoped key (manifest, upload meta/part,
// blob, GC) to its RouteKey; it returns nil for non-object-scoped keys.
func ExtractRouteKey(key []byte) []byte {
	prefix := objectScopedPrefix(key)
	if prefix == nil {
		return nil
	}

	bucketEnd, ok := segmentEnd(key, len(prefix))
	if !ok {
		return nil
	}
	_, afterGen, ok := readU64(key, bucketEnd)
	if !ok {
		return nil
	}
	objectEnd, ok := segmentEnd(key, afterGen)
	if !ok {
		return nil
	}

	out := make([]byte, 0, len(RoutePrefix)+objectEnd-len(prefix))
	out = append(out, routePrefixBytes...)
	return append(out, key[len(prefix):objectEnd]...)
}

// ManifestScanRouteBounds rewrites manifest-scan bounds into route-key space
// so a scan can be mapped to intersecting shard routes. A nil end stays nil.
func ManifestScanRouteBounds(start []byte, end []byte) ([]byte, []byte, bool) {
	routeStart, ok := manifestScanBound(start)
	if !ok {
		return nil, nil, false
	}
	if end == nil {
		return routeStart, nil, true
	}
	routeEnd, ok := manifestScanBound(end)
	if !ok {
		return nil, nil, false
	}
	return routeStart, routeEnd, true
}

// manifestScanBound swaps the manifest prefix for the route prefix after
// validating the key begins with an intact bucket segment and generation.
func manifestScanBound(key []byte) ([]byte, bool) {
	if !bytes.HasPrefix(key, objectManifestPrefixBytes) {
		return nil, false
	}

	bucketEnd, ok := segmentEnd(key, len(objectManifestPrefixBytes))
	if !ok {
		return nil, false
	}
	if _, _, ok := readU64(key, bucketEnd); !ok {
		return nil, false
	}

	out := make([]byte, 0, len(RoutePrefix)+len(key)-len(objectManifestPrefixBytes))
	out = append(out, routePrefixBytes...)
	return append(out, key[len(objectManifestPrefixBytes):]...), true
}

// buildObjectKey assembles prefix|bucket|generation|object and, depending on
// the prefix, the uploadID, part number, and chunk number suffixes.
func buildObjectKey(prefix []byte, bucket string, generation uint64, object string, uploadID string, partNo uint64, chunkNo uint64) []byte {
	out := make([]byte, 0, len(prefix)+len(bucket)+len(object)+len(uploadID)+4*u64Bytes+buildObjectExtraBytes)
	out = append(out, prefix...)
	out = append(out, EncodeSegment([]byte(bucket))...)
	out = appendU64(out, generation)
	out = append(out, EncodeSegment([]byte(object))...)
	// Manifest keys stop after the object segment.
	if uploadID == "" && bytes.Equal(prefix, objectManifestPrefixBytes) {
		return out
	}
	out = append(out, EncodeSegment([]byte(uploadID))...)
	// Upload-meta and GC keys stop after the upload ID.
	if bytes.Equal(prefix, uploadMetaPrefixBytes) || bytes.Equal(prefix, gcUploadPrefixBytes) {
		return out
	}
	out = appendU64(out, partNo)
	// Part keys stop after the part number; blob keys also carry a chunk.
	if bytes.Equal(prefix, uploadPartPrefixBytes) {
		return out
	}
	return appendU64(out, chunkNo)
}

// objectScopedPrefix returns the matching object-scoped prefix, or nil.
func objectScopedPrefix(key []byte) []byte {
	for _, p := range [][]byte{
		objectManifestPrefixBytes,
		uploadMetaPrefixBytes,
		uploadPartPrefixBytes,
		blobPrefixBytes,
		gcUploadPrefixBytes,
	} {
		if bytes.HasPrefix(key, p) {
			return p
		}
	}
	return nil
}

// appendU64 appends v to dst as 8 big-endian bytes (order-preserving).
func appendU64(dst []byte, v uint64) []byte {
	return binary.BigEndian.AppendUint64(dst, v)
}

// readU64 reads 8 big-endian bytes at offset, returning the value and the
// offset just past them; ok is false when fewer than 8 bytes remain.
func readU64(src []byte, offset int) (uint64, int, bool) {
	if offset < 0 || len(src)-offset < u64Bytes {
		return 0, 0, false
	}
	return binary.BigEndian.Uint64(src[offset : offset+u64Bytes]), offset + u64Bytes, true
}

// decodeSegment unescapes one segment starting at offset and returns the raw
// bytes plus the offset just past the terminator; ok is false on malformed
// escapes or a missing terminator.
func decodeSegment(src []byte, offset int) ([]byte, int, bool) {
	if offset < 0 || offset > len(src) {
		return nil, 0, false
	}
	out := make([]byte, 0, len(src)-offset)
	i := offset
	for i < len(src) {
		if src[i] != escapeByte {
			out = append(out, src[i])
			i++
			continue
		}
		if i+1 >= len(src) {
			return nil, 0, false
		}
		switch src[i+1] {
		case escapedZeroByte:
			out = append(out, escapeByte)
			i += segmentTerminatorBytes
		case terminatorByte:
			return out, i + segmentTerminatorBytes, true
		default:
			return nil, 0, false
		}
	}
	return nil, 0, false
}

// segmentEnd scans past one escaped segment starting at offset and returns
// the offset just past its terminator, without allocating the decoded bytes.
func segmentEnd(src []byte, offset int) (int, bool) {
	if offset < 0 || offset > len(src) {
		return 0, false
	}
	i := offset
	for i < len(src) {
		if src[i] != escapeByte {
			i++
			continue
		}
		if i+1 >= len(src) {
			return 0, false
		}
		switch src[i+1] {
		case escapedZeroByte:
			i += segmentTerminatorBytes
		case terminatorByte:
			return i + segmentTerminatorBytes, true
		default:
			return 0, false
		}
	}
	return 0, false
}
end) + require.True(t, ok) + require.Equal(t, append([]byte(RoutePrefix), start[len(ObjectManifestPrefix):]...), routeStart) + require.Equal(t, append([]byte(RoutePrefix), end[len(ObjectManifestPrefix):]...), routeEnd) +} + +func TestManifestScanRouteBoundsRejectsNonManifestKeys(t *testing.T) { + t.Parallel() + + routeStart, routeEnd, ok := ManifestScanRouteBounds(BucketMetaKey("bucket-a"), nil) + require.False(t, ok) + require.Nil(t, routeStart) + require.Nil(t, routeEnd) +} + +func TestEncodeSegmentPrefix_EscapesZeroBytes(t *testing.T) { + t.Parallel() + + encoded := EncodeSegmentPrefix([]byte{0x00, 'a', 0x00}) + require.Equal(t, []byte{0x00, 0xFF, 'a', 0x00, 0xFF}, encoded) + require.False(t, bytes.Contains(encoded, []byte{0x00, 0x00})) +} diff --git a/kv/coordinator.go b/kv/coordinator.go index d9ad5f7d..858cca1f 100644 --- a/kv/coordinator.go +++ b/kv/coordinator.go @@ -61,7 +61,7 @@ func (c *Coordinate) Dispatch(ctx context.Context, reqs *OperationGroup[OP]) (*C } if reqs.IsTxn { - return c.dispatchTxn(reqs.Elems, reqs.StartTS) + return c.dispatchTxn(reqs.Elems, reqs.StartTS, reqs.CommitTS) } return c.dispatchRaw(reqs.Elems) @@ -101,16 +101,18 @@ func (c *Coordinate) nextStartTS() uint64 { return c.clock.Next() } -func (c *Coordinate) dispatchTxn(reqs []*Elem[OP], startTS uint64) (*CoordinateResponse, error) { +func (c *Coordinate) dispatchTxn(reqs []*Elem[OP], startTS uint64, commitTS uint64) (*CoordinateResponse, error) { primary := primaryKeyForElems(reqs) if len(primary) == 0 { return nil, errors.WithStack(ErrTxnPrimaryKeyRequired) } - commitTS := c.clock.Next() - if commitTS <= startTS { - c.clock.Observe(startTS) + if commitTS == 0 { commitTS = c.clock.Next() + if commitTS <= startTS { + c.clock.Observe(startTS) + commitTS = c.clock.Next() + } } if commitTS <= startTS { return nil, errors.WithStack(ErrTxnCommitTSRequired) @@ -210,7 +212,7 @@ func (c *Coordinate) redirect(ctx context.Context, reqs *OperationGroup[OP]) (*C return nil, 
errors.WithStack(ErrTxnPrimaryKeyRequired) } requests = []*pb.Request{ - onePhaseTxnRequest(reqs.StartTS, 0, primary, reqs.Elems), + onePhaseTxnRequest(reqs.StartTS, reqs.CommitTS, primary, reqs.Elems), } } else { for _, req := range reqs.Elems { diff --git a/kv/coordinator_txn_test.go b/kv/coordinator_txn_test.go index 9c992935..6ef319fd 100644 --- a/kv/coordinator_txn_test.go +++ b/kv/coordinator_txn_test.go @@ -36,7 +36,7 @@ func TestCoordinateDispatchTxn_RejectsNonMonotonicCommitTS(t *testing.T) { _, err := c.dispatchTxn([]*Elem[OP]{ {Op: Put, Key: []byte("k"), Value: []byte("v")}, - }, startTS) + }, startTS, 0) require.ErrorIs(t, err, ErrTxnCommitTSRequired) require.Equal(t, 0, tx.commits) } @@ -52,7 +52,7 @@ func TestCoordinateDispatchTxn_RejectsMissingPrimaryKey(t *testing.T) { _, err := c.dispatchTxn([]*Elem[OP]{ {Op: Put, Key: nil, Value: []byte("v")}, - }, 1) + }, 1, 0) require.ErrorIs(t, err, ErrTxnPrimaryKeyRequired) require.Equal(t, 0, tx.commits) } @@ -70,7 +70,7 @@ func TestCoordinateDispatchTxn_UsesOnePhaseRequest(t *testing.T) { _, err := c.dispatchTxn([]*Elem[OP]{ {Op: Put, Key: []byte("b"), Value: []byte("v1")}, {Op: Del, Key: []byte("x")}, - }, startTS) + }, startTS, 0) require.NoError(t, err) require.Equal(t, 1, tx.commits) require.Len(t, tx.reqs, 1) @@ -93,3 +93,26 @@ func TestCoordinateDispatchTxn_UsesOnePhaseRequest(t *testing.T) { require.Equal(t, []byte("b"), meta.PrimaryKey) require.Greater(t, meta.CommitTS, startTS) } + +func TestCoordinateDispatchTxn_UsesProvidedCommitTS(t *testing.T) { + t.Parallel() + + tx := &stubTransactional{} + c := &Coordinate{ + transactionManager: tx, + clock: NewHLC(), + } + + startTS := uint64(10) + commitTS := uint64(25) + _, err := c.dispatchTxn([]*Elem[OP]{ + {Op: Put, Key: []byte("k"), Value: []byte("v")}, + }, startTS, commitTS) + require.NoError(t, err) + require.Len(t, tx.reqs, 1) + require.Len(t, tx.reqs[0], 1) + + meta, err := DecodeTxnMeta(tx.reqs[0][0].Mutations[0].Value) + require.NoError(t, err) 
+ require.Equal(t, commitTS, meta.CommitTS) +} diff --git a/kv/shard_key.go b/kv/shard_key.go index d45c277d..f7b3e34d 100644 --- a/kv/shard_key.go +++ b/kv/shard_key.go @@ -3,6 +3,7 @@ package kv import ( "bytes" + "github.com/bootjp/elastickv/internal/s3keys" "github.com/bootjp/elastickv/store" ) @@ -20,12 +21,18 @@ func routeKey(key []byte) []byte { return user } if embedded, ok := txnRouteKey(key); ok { + if user := s3keys.ExtractRouteKey(embedded); user != nil { + return user + } // Transaction internal keys embed the logical key after the prefix. if user := store.ExtractListUserKey(embedded); user != nil { return user } return embedded } + if user := s3keys.ExtractRouteKey(key); user != nil { + return user + } if user := store.ExtractListUserKey(key); user != nil { return user } diff --git a/kv/shard_key_test.go b/kv/shard_key_test.go new file mode 100644 index 00000000..1d353b84 --- /dev/null +++ b/kv/shard_key_test.go @@ -0,0 +1,29 @@ +package kv + +import ( + "testing" + + "github.com/bootjp/elastickv/internal/s3keys" + "github.com/stretchr/testify/require" +) + +func TestRouteKey_NormalizesS3ManifestKey(t *testing.T) { + t.Parallel() + + key := s3keys.ObjectManifestKey("bucket-a", 7, "path/to/object") + require.Equal(t, s3keys.RouteKey("bucket-a", 7, "path/to/object"), routeKey(key)) +} + +func TestRouteKey_NormalizesS3BlobKey(t *testing.T) { + t.Parallel() + + key := s3keys.BlobKey("bucket-a", 7, "path/to/object", "upload-1", 1, 2) + require.Equal(t, s3keys.RouteKey("bucket-a", 7, "path/to/object"), routeKey(key)) +} + +func TestRouteKey_NormalizesTxnWrappedS3Key(t *testing.T) { + t.Parallel() + + embedded := s3keys.UploadPartKey("bucket-a", 7, "path/to/object", "upload-1", 3) + require.Equal(t, s3keys.RouteKey("bucket-a", 7, "path/to/object"), routeKey(txnLockKey(embedded))) +} diff --git a/kv/shard_store.go b/kv/shard_store.go index e8c5df16..1dbb25a4 100644 --- a/kv/shard_store.go +++ b/kv/shard_store.go @@ -7,6 +7,7 @@ import ( "sort" 
"github.com/bootjp/elastickv/distribution" + "github.com/bootjp/elastickv/internal/s3keys" pb "github.com/bootjp/elastickv/proto" "github.com/bootjp/elastickv/store" "github.com/cockroachdb/errors" @@ -122,6 +123,9 @@ func (s *ShardStore) ReverseScanAt(ctx context.Context, start []byte, end []byte } func (s *ShardStore) routesForScan(start []byte, end []byte) ([]distribution.Route, bool) { + if routeStart, routeEnd, ok := s3keys.ManifestScanRouteBounds(start, end); ok { + return s.engine.GetIntersectingRoutes(routeStart, routeEnd), false + } // For internal list keys, shard routing is based on the logical user key // rather than the raw key prefix. if userKey := store.ExtractListUserKey(start); userKey != nil { diff --git a/kv/shard_store_test.go b/kv/shard_store_test.go index ebeba691..87feaccb 100644 --- a/kv/shard_store_test.go +++ b/kv/shard_store_test.go @@ -5,6 +5,7 @@ import ( "testing" "github.com/bootjp/elastickv/distribution" + "github.com/bootjp/elastickv/internal/s3keys" "github.com/bootjp/elastickv/store" "github.com/stretchr/testify/require" ) @@ -70,6 +71,63 @@ func TestShardStoreScanAt_RoutesListItemScansByUserKey(t *testing.T) { require.Equal(t, k2, kvs[2].Key) } +func TestShardStoreScanAt_IncludesS3ManifestKeysAcrossShards(t *testing.T) { + t.Parallel() + + ctx := context.Background() + + engine := distribution.NewEngine() + engine.UpdateRoute([]byte(""), []byte("m"), 1) + engine.UpdateRoute([]byte("m"), nil, 2) + + groups := map[uint64]*ShardGroup{ + 1: {Store: store.NewMVCCStore()}, + 2: {Store: store.NewMVCCStore()}, + } + st := NewShardStore(engine, groups) + + k1 := s3keys.ObjectManifestKey("bucket-a", 1, "alpha") + k2 := s3keys.ObjectManifestKey("bucket-a", 1, "zeta") + require.NoError(t, st.PutAt(ctx, k1, []byte("m1"), 1, 0)) + require.NoError(t, st.PutAt(ctx, k2, []byte("m2"), 2, 0)) + + start := s3keys.ObjectManifestPrefixForBucket("bucket-a", 1) + kvs, err := st.ScanAt(ctx, start, prefixScanEnd(start), 10, ^uint64(0)) + 
require.NoError(t, err) + require.Len(t, kvs, 2) + require.Equal(t, k1, kvs[0].Key) + require.Equal(t, k2, kvs[1].Key) +} + +func TestShardStoreScanAt_RoutesS3ManifestScansByLogicalObjectKey(t *testing.T) { + t.Parallel() + + ctx := context.Background() + + engine := distribution.NewEngine() + engine.UpdateRoute([]byte(""), []byte("m"), 1) + engine.UpdateRoute([]byte("m"), nil, 2) + + groups := map[uint64]*ShardGroup{ + 1: {Store: store.NewMVCCStore()}, + 2: {Store: store.NewMVCCStore()}, + } + st := NewShardStore(engine, groups) + + k0 := s3keys.ObjectManifestKey("bucket-a", 1, "z/object-0") + k1 := s3keys.ObjectManifestKey("bucket-a", 1, "z/object-1") + require.NoError(t, st.PutAt(ctx, k0, []byte("m0"), 1, 0)) + require.NoError(t, st.PutAt(ctx, k1, []byte("m1"), 2, 0)) + + start := s3keys.ObjectManifestScanStart("bucket-a", 1, "z/") + end := prefixScanEnd(start) + kvs, err := st.ScanAt(ctx, start, end, 10, ^uint64(0)) + require.NoError(t, err) + require.Len(t, kvs, 2) + require.Equal(t, k0, kvs[0].Key) + require.Equal(t, k1, kvs[1].Key) +} + func TestScanLockBoundsForKVs_ReverseOrder(t *testing.T) { t.Parallel() diff --git a/kv/sharded_coordinator.go b/kv/sharded_coordinator.go index 537216bb..e2718a88 100644 --- a/kv/sharded_coordinator.go +++ b/kv/sharded_coordinator.go @@ -73,7 +73,7 @@ func (c *ShardedCoordinator) Dispatch(ctx context.Context, reqs *OperationGroup[ } if reqs.IsTxn { - return c.dispatchTxn(reqs.StartTS, reqs.Elems) + return c.dispatchTxn(reqs.StartTS, reqs.CommitTS, reqs.Elems) } logs, err := c.requestLogs(reqs) @@ -88,7 +88,7 @@ func (c *ShardedCoordinator) Dispatch(ctx context.Context, reqs *OperationGroup[ return &CoordinateResponse{CommitIndex: r.CommitIndex}, nil } -func (c *ShardedCoordinator) dispatchTxn(startTS uint64, elems []*Elem[OP]) (*CoordinateResponse, error) { +func (c *ShardedCoordinator) dispatchTxn(startTS uint64, commitTS uint64, elems []*Elem[OP]) (*CoordinateResponse, error) { grouped, gids, err := c.groupMutations(elems) 
if err != nil { return nil, err @@ -98,7 +98,9 @@ func (c *ShardedCoordinator) dispatchTxn(startTS uint64, elems []*Elem[OP]) (*Co return nil, errors.WithStack(ErrTxnPrimaryKeyRequired) } - commitTS := c.nextTxnTSAfter(startTS) + if commitTS == 0 { + commitTS = c.nextTxnTSAfter(startTS) + } if commitTS == 0 || commitTS <= startTS { return nil, errors.WithStack(ErrTxnCommitTSRequired) } @@ -463,7 +465,10 @@ func (c *ShardedCoordinator) txnLogs(reqs *OperationGroup[OP]) ([]*pb.Request, e if len(gids) != 1 { return nil, errors.WithStack(ErrInvalidRequest) } - commitTS := c.nextTxnTSAfter(reqs.StartTS) + commitTS := reqs.CommitTS + if commitTS == 0 { + commitTS = c.nextTxnTSAfter(reqs.StartTS) + } if commitTS == 0 || commitTS <= reqs.StartTS { return nil, errors.WithStack(ErrTxnCommitTSRequired) } diff --git a/kv/sharded_coordinator_txn_test.go b/kv/sharded_coordinator_txn_test.go index fb5080c3..7b300c4f 100644 --- a/kv/sharded_coordinator_txn_test.go +++ b/kv/sharded_coordinator_txn_test.go @@ -166,6 +166,41 @@ func TestShardedCoordinatorDispatchTxn_CrossShardPhasesAndCommitIndex(t *testing require.Equal(t, commitMeta1.CommitTS, commitMeta2.CommitTS) } +func TestShardedCoordinatorDispatchTxn_UsesProvidedCommitTS(t *testing.T) { + t.Parallel() + + engine := distribution.NewEngine() + engine.UpdateRoute([]byte("a"), []byte("m"), 1) + engine.UpdateRoute([]byte("m"), nil, 2) + + g1Txn := &recordingTransactional{} + g2Txn := &recordingTransactional{} + coord := NewShardedCoordinator(engine, map[uint64]*ShardGroup{ + 1: {Txn: g1Txn}, + 2: {Txn: g2Txn}, + }, 1, NewHLC(), nil) + + startTS := uint64(10) + commitTS := uint64(25) + _, err := coord.Dispatch(context.Background(), &OperationGroup[OP]{ + IsTxn: true, + StartTS: startTS, + CommitTS: commitTS, + Elems: []*Elem[OP]{ + {Op: Put, Key: []byte("b"), Value: []byte("v1")}, + {Op: Put, Key: []byte("x"), Value: []byte("v2")}, + }, + }) + require.NoError(t, err) + require.Len(t, g1Txn.requests, 2) + require.Len(t, 
g2Txn.requests, 2) + + commitMeta1 := requestTxnMeta(t, g1Txn.requests[1]) + commitMeta2 := requestTxnMeta(t, g2Txn.requests[1]) + require.Equal(t, commitTS, commitMeta1.CommitTS) + require.Equal(t, commitTS, commitMeta2.CommitTS) +} + func TestCommitSecondaryWithRetry_RetriesAndSucceeds(t *testing.T) { t.Parallel() diff --git a/kv/transcoder.go b/kv/transcoder.go index 837232ac..f4e613f0 100644 --- a/kv/transcoder.go +++ b/kv/transcoder.go @@ -23,4 +23,7 @@ type OperationGroup[T OP] struct { // StartTS is a logical timestamp captured at transaction begin. // It is ignored for non-transactional groups. StartTS uint64 + // CommitTS optionally pins the transaction commit timestamp. + // Coordinators choose one automatically when this is zero. + CommitTS uint64 } From bf5f05932004b270a4c7de4131aa5b129210e377 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sun, 22 Mar 2026 19:17:33 +0900 Subject: [PATCH 02/27] adapter: address s3 review feedback --- adapter/s3.go | 26 +++-- adapter/s3_auth.go | 20 +++- adapter/s3_test.go | 80 +++++++++++----- .../elastickv-dynamodb-summary.json | 95 ++++--------------- 4 files changed, 106 insertions(+), 115 deletions(-) diff --git a/adapter/s3.go b/adapter/s3.go index f3b31030..e0ca40d3 100644 --- a/adapter/s3.go +++ b/adapter/s3.go @@ -29,13 +29,14 @@ import ( ) const ( - s3HealthPath = "/healthz" - s3ChunkSize = 1 << 20 - s3ChunkBatchOps = 16 - s3XMLNamespace = "http://s3.amazonaws.com/doc/2006-03-01/" - s3DefaultRegion = "us-east-1" - s3MaxKeys = 1000 - s3ListPageSize = 256 + s3HealthPath = "/healthz" + s3ChunkSize = 1 << 20 + s3ChunkBatchOps = 16 + s3XMLNamespace = "http://s3.amazonaws.com/doc/2006-03-01/" + s3DefaultRegion = "us-east-1" + s3MaxKeys = 1000 + s3ListPageSize = 256 + s3ManifestCleanupTimeout = 2 * time.Minute s3TxnRetryInitialBackoff = 2 * time.Millisecond s3TxnRetryMaxBackoff = 32 * time.Millisecond @@ -661,6 +662,7 @@ func (s *S3Server) getObject(w http.ResponseWriter, r *http.Request, bucket stri 
writeS3InternalError(w, err) return } + //nolint:gosec // G705: S3 serves stored object bytes verbatim by design. if _, err := w.Write(chunk); err != nil { return } @@ -788,7 +790,7 @@ func (s *S3Server) listObjectsV2(w http.ResponseWriter, r *http.Request, bucket for result.KeyCount < maxKeys { pageLimit := s3ListPageSize - if remaining := maxKeys - result.KeyCount; remaining > pageLimit { + if remaining := maxKeys - result.KeyCount; remaining < pageLimit { pageLimit = remaining } page, err := s.store.ScanAt(r.Context(), cursor, end, pageLimit, readTS) @@ -907,7 +909,11 @@ func (s *S3Server) cleanupManifestBlobsAsync(bucket string, generation uint64, o if manifest == nil { return } - go s.cleanupManifestBlobs(context.Background(), bucket, generation, objectKey, manifest) + go func() { + ctx, cancel := context.WithTimeout(context.Background(), s3ManifestCleanupTimeout) + defer cancel() + s.cleanupManifestBlobs(ctx, bucket, generation, objectKey, manifest) + }() } func (s *S3Server) cleanupManifestBlobs(ctx context.Context, bucket string, generation uint64, objectKey string, manifest *s3ObjectManifest) { @@ -1256,7 +1262,7 @@ func hlcToTime(ts uint64) time.Time { if millis > math.MaxInt64 { millis = math.MaxInt64 } - return time.UnixMilli(int64(millis)).UTC() //nolint:gosec // millis is clamped to MaxInt64 above. + return time.UnixMilli(int64(millis)).UTC() //nolint:gosec // G115: millis is clamped to MaxInt64 above. 
} func (s *S3Server) readTS() uint64 { diff --git a/adapter/s3_auth.go b/adapter/s3_auth.go index 6d0f0fe9..936ff017 100644 --- a/adapter/s3_auth.go +++ b/adapter/s3_auth.go @@ -14,10 +14,11 @@ import ( ) const ( - s3SigV4Algorithm = "AWS4-HMAC-SHA256" - s3UnsignedPayload = "UNSIGNED-PAYLOAD" - s3EmptyPayloadHash = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" - s3DateHeaderFormat = "20060102T150405Z" + s3SigV4Algorithm = "AWS4-HMAC-SHA256" + s3UnsignedPayload = "UNSIGNED-PAYLOAD" + s3EmptyPayloadHash = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + s3DateHeaderFormat = "20060102T150405Z" + s3RequestTimeMaxSkew = 15 * time.Minute ) type S3ServerOption func(*S3Server) @@ -165,6 +166,17 @@ func (s *S3Server) authorizeRequest(r *http.Request) *s3AuthError { Message: "credential scope date does not match x-amz-date", } } + skew := time.Now().UTC().Sub(signingTime.UTC()) + if skew < 0 { + skew = -skew + } + if skew > s3RequestTimeMaxSkew { + return &s3AuthError{ + Status: http.StatusForbidden, + Code: "RequestTimeTooSkewed", + Message: "The difference between the request time and the server's time is too large", + } + } expectedAuth, err := buildS3AuthorizationHeader(r, parsed.AccessKeyID, secretAccessKey, s.effectiveRegion(), signingTime, payloadHash) if err != nil { diff --git a/adapter/s3_test.go b/adapter/s3_test.go index e36f062c..ebf0dbae 100644 --- a/adapter/s3_test.go +++ b/adapter/s3_test.go @@ -38,49 +38,49 @@ func TestS3Server_BucketAndObjectLifecycle(t *testing.T) { server := NewS3Server(nil, "", st, newLocalAdapterCoordinator(st), nil) rec := httptest.NewRecorder() - req := httptest.NewRequest(http.MethodPut, "/bucket-a", nil) + req := newS3TestRequest(http.MethodPut, "/bucket-a", nil) server.handle(rec, req) require.Equal(t, http.StatusOK, rec.Code) payload := "hello world" rec = httptest.NewRecorder() - req = httptest.NewRequest(http.MethodPut, "/bucket-a/dir/file.txt", strings.NewReader(payload)) + req = 
newS3TestRequest(http.MethodPut, "/bucket-a/dir/file.txt", strings.NewReader(payload)) req.Header.Set("Content-Type", "text/plain") server.handle(rec, req) require.Equal(t, http.StatusOK, rec.Code) require.Equal(t, `"`+md5Hex(payload)+`"`, rec.Header().Get("ETag")) rec = httptest.NewRecorder() - req = httptest.NewRequest(http.MethodHead, "/bucket-a", nil) + req = newS3TestRequest(http.MethodHead, "/bucket-a", nil) server.handle(rec, req) require.Equal(t, http.StatusOK, rec.Code) rec = httptest.NewRecorder() - req = httptest.NewRequest(http.MethodHead, "/bucket-a/dir/file.txt", nil) + req = newS3TestRequest(http.MethodHead, "/bucket-a/dir/file.txt", nil) server.handle(rec, req) require.Equal(t, http.StatusOK, rec.Code) require.Equal(t, "11", rec.Header().Get("Content-Length")) require.Equal(t, `"`+md5Hex(payload)+`"`, rec.Header().Get("ETag")) rec = httptest.NewRecorder() - req = httptest.NewRequest(http.MethodGet, "/bucket-a/dir/file.txt", nil) + req = newS3TestRequest(http.MethodGet, "/bucket-a/dir/file.txt", nil) server.handle(rec, req) require.Equal(t, http.StatusOK, rec.Code) require.Equal(t, payload, rec.Body.String()) rec = httptest.NewRecorder() - req = httptest.NewRequest(http.MethodGet, "/bucket-a?list-type=2", nil) + req = newS3TestRequest(http.MethodGet, "/bucket-a?list-type=2", nil) server.handle(rec, req) require.Equal(t, http.StatusOK, rec.Code) require.Contains(t, rec.Body.String(), "dir/file.txt") rec = httptest.NewRecorder() - req = httptest.NewRequest(http.MethodDelete, "/bucket-a/dir/file.txt", nil) + req = newS3TestRequest(http.MethodDelete, "/bucket-a/dir/file.txt", nil) server.handle(rec, req) require.Equal(t, http.StatusNoContent, rec.Code) rec = httptest.NewRecorder() - req = httptest.NewRequest(http.MethodDelete, "/bucket-a", nil) + req = newS3TestRequest(http.MethodDelete, "/bucket-a", nil) server.handle(rec, req) require.Equal(t, http.StatusNoContent, rec.Code) } @@ -101,7 +101,7 @@ func TestS3Server_ProxiesFollowerRequests(t *testing.T) 
{ }) rec := httptest.NewRecorder() - req := httptest.NewRequest(http.MethodGet, "/", nil) + req := newS3TestRequest(http.MethodGet, "/", nil) server.handle(rec, req) require.True(t, proxied) @@ -123,7 +123,7 @@ func TestS3Server_RejectsUnsignedRequestWhenCredentialsConfigured(t *testing.T) ) rec := httptest.NewRecorder() - req := httptest.NewRequest(http.MethodGet, "/", nil) + req := newS3TestRequest(http.MethodGet, "/", nil) server.handle(rec, req) require.Equal(t, http.StatusForbidden, rec.Code) @@ -143,7 +143,7 @@ func TestS3Server_AcceptsSigV4SignedRequests(t *testing.T) { WithS3Region(testS3Region), WithS3StaticCredentials(map[string]string{testS3AccessKey: testS3SecretKey}), ) - signingTime := time.Date(2026, 3, 22, 12, 0, 0, 0, time.UTC) + signingTime := currentS3SigningTime() rec := httptest.NewRecorder() req := newSignedS3Request(t, "/bucket-a", "", signingTime) @@ -170,7 +170,7 @@ func TestS3Server_RejectsPayloadHashMismatch(t *testing.T) { WithS3Region(testS3Region), WithS3StaticCredentials(map[string]string{testS3AccessKey: testS3SecretKey}), ) - signingTime := time.Date(2026, 3, 22, 12, 0, 0, 0, time.UTC) + signingTime := currentS3SigningTime() rec := httptest.NewRecorder() req := newSignedS3Request(t, "/bucket-a", "", signingTime) @@ -187,6 +187,28 @@ func TestS3Server_RejectsPayloadHashMismatch(t *testing.T) { require.Contains(t, rec.Body.String(), "XAmzContentSHA256Mismatch") } +func TestS3Server_RejectsSigV4RequestWithExcessiveClockSkew(t *testing.T) { + t.Parallel() + + st := store.NewMVCCStore() + server := NewS3Server( + nil, + "", + st, + newLocalAdapterCoordinator(st), + nil, + WithS3Region(testS3Region), + WithS3StaticCredentials(map[string]string{testS3AccessKey: testS3SecretKey}), + ) + + rec := httptest.NewRecorder() + req := newSignedS3Request(t, "/bucket-a", "", time.Now().UTC().Add(-s3RequestTimeMaxSkew-time.Hour)) + server.handle(rec, req) + + require.Equal(t, http.StatusForbidden, rec.Code) + require.Contains(t, rec.Body.String(), 
"RequestTimeTooSkewed") +} + func TestS3Server_ProxiesFollowerRequestsBeforeAuth(t *testing.T) { t.Parallel() @@ -210,7 +232,7 @@ func TestS3Server_ProxiesFollowerRequestsBeforeAuth(t *testing.T) { ) rec := httptest.NewRecorder() - req := httptest.NewRequest(http.MethodGet, "/", nil) + req := newS3TestRequest(http.MethodGet, "/", nil) server.handle(rec, req) require.True(t, proxied) @@ -255,7 +277,7 @@ func TestS3Server_ProxiesObjectRequestsUsingObjectRouteLeader(t *testing.T) { }) rec := httptest.NewRecorder() - req := httptest.NewRequest(http.MethodGet, "/bucket-a/dir/file.txt", nil) + req := newS3TestRequest(http.MethodGet, "/bucket-a/dir/file.txt", nil) server.handle(rec, req) require.True(t, proxied) @@ -270,17 +292,17 @@ func TestS3Server_ListObjectsV2DelimiterContinuationAvoidsDuplicateCommonPrefixe server := NewS3Server(nil, "", st, newLocalAdapterCoordinator(st), nil) rec := httptest.NewRecorder() - server.handle(rec, httptest.NewRequest(http.MethodPut, "/bucket-a", nil)) + server.handle(rec, newS3TestRequest(http.MethodPut, "/bucket-a", nil)) require.Equal(t, http.StatusOK, rec.Code) for _, objectKey := range []string{"dir-a/one.txt", "dir-a/two.txt", "dir-b/three.txt"} { rec = httptest.NewRecorder() - server.handle(rec, httptest.NewRequest(http.MethodPut, "/bucket-a/"+objectKey, strings.NewReader(objectKey))) + server.handle(rec, newS3TestRequest(http.MethodPut, "/bucket-a/"+objectKey, strings.NewReader(objectKey))) require.Equal(t, http.StatusOK, rec.Code) } rec = httptest.NewRecorder() - server.handle(rec, httptest.NewRequest(http.MethodGet, "/bucket-a?list-type=2&delimiter=%2F&max-keys=1", nil)) + server.handle(rec, newS3TestRequest(http.MethodGet, "/bucket-a?list-type=2&delimiter=%2F&max-keys=1", nil)) require.Equal(t, http.StatusOK, rec.Code) firstPage := decodeListBucketResult(t, rec.Body.Bytes()) @@ -289,7 +311,7 @@ func TestS3Server_ListObjectsV2DelimiterContinuationAvoidsDuplicateCommonPrefixe require.NotEmpty(t, firstPage.NextContinuationToken) 
rec = httptest.NewRecorder() - server.handle(rec, httptest.NewRequest(http.MethodGet, "/bucket-a?list-type=2&delimiter=%2F&max-keys=1&continuation-token="+firstPage.NextContinuationToken, nil)) + server.handle(rec, newS3TestRequest(http.MethodGet, "/bucket-a?list-type=2&delimiter=%2F&max-keys=1&continuation-token="+firstPage.NextContinuationToken, nil)) require.Equal(t, http.StatusOK, rec.Code) secondPage := decodeListBucketResult(t, rec.Body.Bytes()) @@ -308,11 +330,11 @@ func TestS3Server_PutObjectConflictsWhenBucketDeletedDuringFinalize(t *testing.T server := NewS3Server(nil, "", st, coord, nil) rec := httptest.NewRecorder() - server.handle(rec, httptest.NewRequest(http.MethodPut, "/bucket-a", nil)) + server.handle(rec, newS3TestRequest(http.MethodPut, "/bucket-a", nil)) require.Equal(t, http.StatusOK, rec.Code) rec = httptest.NewRecorder() - server.handle(rec, httptest.NewRequest(http.MethodPut, "/bucket-a/object.txt", strings.NewReader("payload"))) + server.handle(rec, newS3TestRequest(http.MethodPut, "/bucket-a/object.txt", strings.NewReader("payload"))) require.Equal(t, http.StatusConflict, rec.Code) require.Contains(t, rec.Body.String(), "OperationAborted") @@ -355,20 +377,20 @@ func TestS3Server_ShardedStoreRoutesBucketAndObjectData(t *testing.T) { server := NewS3Server(nil, "", shardStore, coord, nil) rec := httptest.NewRecorder() - server.handle(rec, httptest.NewRequest(http.MethodPut, "/bucket-a", nil)) + server.handle(rec, newS3TestRequest(http.MethodPut, "/bucket-a", nil)) require.Equal(t, http.StatusOK, rec.Code, rec.Body.String()) rec = httptest.NewRecorder() - server.handle(rec, httptest.NewRequest(http.MethodPut, "/bucket-a/dir/file.txt", strings.NewReader("payload"))) + server.handle(rec, newS3TestRequest(http.MethodPut, "/bucket-a/dir/file.txt", strings.NewReader("payload"))) require.Equal(t, http.StatusOK, rec.Code, rec.Body.String()) rec = httptest.NewRecorder() - server.handle(rec, httptest.NewRequest(http.MethodGet, "/bucket-a/dir/file.txt", 
nil)) + server.handle(rec, newS3TestRequest(http.MethodGet, "/bucket-a/dir/file.txt", nil)) require.Equal(t, http.StatusOK, rec.Code, rec.Body.String()) require.Equal(t, "payload", rec.Body.String()) rec = httptest.NewRecorder() - server.handle(rec, httptest.NewRequest(http.MethodGet, "/bucket-a?list-type=2", nil)) + server.handle(rec, newS3TestRequest(http.MethodGet, "/bucket-a?list-type=2", nil)) require.Equal(t, http.StatusOK, rec.Code, rec.Body.String()) require.Contains(t, rec.Body.String(), "dir/file.txt") @@ -528,6 +550,14 @@ func sha256Hex(v string) string { return hex.EncodeToString(sum[:]) } +func newS3TestRequest(method string, target string, body io.Reader) *http.Request { + return httptest.NewRequestWithContext(context.Background(), method, target, body) +} + +func currentS3SigningTime() time.Time { + return time.Now().UTC().Add(-time.Minute).Truncate(time.Second) +} + func newSignedS3Request( t *testing.T, target string, @@ -536,7 +566,7 @@ func newSignedS3Request( ) *http.Request { t.Helper() - req := httptest.NewRequest(http.MethodPut, target, strings.NewReader(body)) + req := newS3TestRequest(http.MethodPut, target, strings.NewReader(body)) payloadHash := sha256Hex(body) req.Header.Set("X-Amz-Content-Sha256", payloadHash) diff --git a/monitoring/grafana/dashboards/elastickv-dynamodb-summary.json b/monitoring/grafana/dashboards/elastickv-dynamodb-summary.json index 83766600..4bcd399c 100644 --- a/monitoring/grafana/dashboards/elastickv-dynamodb-summary.json +++ b/monitoring/grafana/dashboards/elastickv-dynamodb-summary.json @@ -22,10 +22,7 @@ "links": [], "panels": [ { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "fieldConfig": { "defaults": { "color": { @@ -72,10 +69,7 @@ "pluginVersion": "12.1.1", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "editorMode": "code", "expr": "sum(rate(elastickv_dynamodb_requests_total{job=\"elastickv\"}[5m]))", "instant": false, 
@@ -88,10 +82,7 @@ "type": "stat" }, { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "fieldConfig": { "defaults": { "color": { @@ -146,10 +137,7 @@ "pluginVersion": "12.1.1", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "editorMode": "code", "expr": "sum(rate(elastickv_dynamodb_requests_total{job=\"elastickv\",outcome!=\"success\"}[5m]))", "instant": false, @@ -162,10 +150,7 @@ "type": "stat" }, { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "fieldConfig": { "defaults": { "color": { @@ -220,10 +205,7 @@ "pluginVersion": "12.1.1", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "editorMode": "code", "expr": "histogram_quantile(0.95, sum by (le) (rate(elastickv_dynamodb_request_duration_seconds_bucket{job=\"elastickv\"}[5m])))", "instant": false, @@ -236,10 +218,7 @@ "type": "stat" }, { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "fieldConfig": { "defaults": { "color": { @@ -286,10 +265,7 @@ "pluginVersion": "12.1.1", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "editorMode": "code", "expr": "sum(elastickv_dynamodb_inflight_requests{job=\"elastickv\"})", "instant": false, @@ -302,10 +278,7 @@ "type": "stat" }, { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "fieldConfig": { "defaults": { "color": { @@ -385,10 +358,7 @@ "pluginVersion": "12.1.1", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "editorMode": "code", "expr": "sum by (operation) (rate(elastickv_dynamodb_requests_total{job=\"elastickv\"}[5m]))", "legendFormat": "{{operation}}", @@ -400,10 +370,7 @@ "type": "timeseries" }, { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "fieldConfig": { "defaults": { "color": { @@ 
-483,10 +450,7 @@ "pluginVersion": "12.1.1", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "editorMode": "code", "expr": "sum by (outcome) (rate(elastickv_dynamodb_requests_total{job=\"elastickv\",outcome!=\"success\"}[5m]))", "legendFormat": "{{outcome}}", @@ -498,10 +462,7 @@ "type": "timeseries" }, { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "fieldConfig": { "defaults": { "color": { @@ -581,10 +542,7 @@ "pluginVersion": "12.1.1", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "editorMode": "code", "expr": "sum by (table) (rate(elastickv_dynamodb_returned_items_total{job=\"elastickv\"}[5m]))", "legendFormat": "returned {{table}}", @@ -592,10 +550,7 @@ "refId": "A" }, { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "editorMode": "code", "expr": "sum by (table) (rate(elastickv_dynamodb_scanned_items_total{job=\"elastickv\"}[5m]))", "legendFormat": "scanned {{table}}", @@ -603,10 +558,7 @@ "refId": "B" }, { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "editorMode": "code", "expr": "sum by (table) (rate(elastickv_dynamodb_written_items_total{job=\"elastickv\"}[5m]))", "legendFormat": "written {{table}}", @@ -618,10 +570,7 @@ "type": "timeseries" }, { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "fieldConfig": { "defaults": { "color": { @@ -701,10 +650,7 @@ "pluginVersion": "12.1.1", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "editorMode": "code", "expr": "max by (group, node_id) (elastickv_raft_commit_index{job=\"elastickv\"})", "legendFormat": "commit g{{group}} {{node_id}}", @@ -712,10 +658,7 @@ "refId": "A" }, { - "datasource": { - "type": "prometheus", - "uid": null - }, + "datasource": null, "editorMode": "code", "expr": "max by (group, node_id) 
(elastickv_raft_applied_index{job=\"elastickv\"})", "legendFormat": "applied g{{group}} {{node_id}}", From 1e9f817db8d98f3035be0f060a961b1a7a7c5d6e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 10:31:16 +0000 Subject: [PATCH 03/27] Initial plan From 8d240fb9e9f15f8764e53bca5a693c0614840af6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 10:41:54 +0000 Subject: [PATCH 04/27] address review comments: HLC monotonicity, SigV4 sig comparison, coordinator redirect Co-authored-by: bootjp <1306365+bootjp@users.noreply.github.com> Agent-Logs-Url: https://github.com/bootjp/elastickv/sessions/26a0122b-27bb-4ba7-8596-ae3af70b98dd --- adapter/s3_auth.go | 23 +++++++++++++++++- adapter/s3_test.go | 49 +++++++++++++++++++++++++++++++++++++++ kv/coordinator.go | 10 +++++++- kv/sharded_coordinator.go | 8 +++++++ 4 files changed, 88 insertions(+), 2 deletions(-) diff --git a/adapter/s3_auth.go b/adapter/s3_auth.go index 936ff017..4dbd1191 100644 --- a/adapter/s3_auth.go +++ b/adapter/s3_auth.go @@ -186,7 +186,12 @@ func (s *S3Server) authorizeRequest(r *http.Request) *s3AuthError { Message: "failed to verify request signature", } } - if subtle.ConstantTimeCompare([]byte(authHeader), []byte(expectedAuth)) != 1 { + // Compare only the Signature component to avoid false rejections caused by + // equivalent Authorization headers that differ in whitespace or parameter + // ordering (but carry the same cryptographic signature). 
+ gotSig := extractS3Signature(authHeader) + expectedSig := extractS3Signature(expectedAuth) + if gotSig == "" || subtle.ConstantTimeCompare([]byte(gotSig), []byte(expectedSig)) != 1 { return &s3AuthError{ Status: http.StatusForbidden, Code: "SignatureDoesNotMatch", @@ -258,3 +263,19 @@ func parseS3AuthorizationHeader(raw string) (s3AuthorizationHeader, error) { func normalizeS3PayloadHash(raw string) string { return strings.TrimSpace(raw) } + +// extractS3Signature returns the hex signature value from a SigV4 +// Authorization header (the "Signature=" component). +func extractS3Signature(auth string) string { + _, params, ok := strings.Cut(auth, " ") + if !ok { + return "" + } + for _, param := range strings.Split(params, ",") { + key, value, ok := strings.Cut(strings.TrimSpace(param), "=") + if ok && key == "Signature" { + return strings.TrimSpace(value) + } + } + return "" +} diff --git a/adapter/s3_test.go b/adapter/s3_test.go index ebf0dbae..7e55e12f 100644 --- a/adapter/s3_test.go +++ b/adapter/s3_test.go @@ -592,3 +592,52 @@ func newSignedS3Request( require.Equal(t, strings.TrimSpace(req.Header.Get("Authorization")), expectedAuth) return req } + +func TestExtractS3Signature(t *testing.T) { + t.Parallel() + cases := []struct { + name string + input string + want string + }{ + { + name: "standard SigV4 header", + input: "AWS4-HMAC-SHA256 Credential=AKID/20240101/us-east-1/s3/aws4_request, SignedHeaders=host;x-amz-date, Signature=abc123def456", + want: "abc123def456", + }, + { + name: "signature with extra whitespace", + input: "AWS4-HMAC-SHA256 Credential=AKID/20240101/us-east-1/s3/aws4_request, SignedHeaders=host, Signature= abc123 ", + want: "abc123", + }, + { + name: "different parameter order", + input: "AWS4-HMAC-SHA256 SignedHeaders=host;x-amz-date, Credential=AKID/20240101/us-east-1/s3/aws4_request, Signature=deadbeef", + want: "deadbeef", + }, + { + name: "missing signature", + input: "AWS4-HMAC-SHA256 
Credential=AKID/20240101/us-east-1/s3/aws4_request, SignedHeaders=host", + want: "", + }, + { + name: "empty header", + input: "", + want: "", + }, + { + name: "no space separator", + input: "AWS4-HMAC-SHA256Credential=foo", + want: "", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + got := extractS3Signature(tc.input) + if got != tc.want { + t.Errorf("extractS3Signature(%q) = %q, want %q", tc.input, got, tc.want) + } + }) + } +} diff --git a/kv/coordinator.go b/kv/coordinator.go index 858cca1f..f25f6a1d 100644 --- a/kv/coordinator.go +++ b/kv/coordinator.go @@ -211,8 +211,16 @@ func (c *Coordinate) redirect(ctx context.Context, reqs *OperationGroup[OP]) (*C if len(primary) == 0 { return nil, errors.WithStack(ErrTxnPrimaryKeyRequired) } + // When StartTS is absent (leader will assign it), also clear CommitTS + // so the leader assigns both timestamps consistently. A caller-provided + // CommitTS without a StartTS would produce an invalid txn where + // CommitTS <= StartTS (because StartTS=0 at the forwarding site). + commitTS := reqs.CommitTS + if reqs.StartTS == 0 { + commitTS = 0 + } requests = []*pb.Request{ - onePhaseTxnRequest(reqs.StartTS, reqs.CommitTS, primary, reqs.Elems), + onePhaseTxnRequest(reqs.StartTS, commitTS, primary, reqs.Elems), } } else { for _, req := range reqs.Elems { diff --git a/kv/sharded_coordinator.go b/kv/sharded_coordinator.go index e2718a88..7ecc58ca 100644 --- a/kv/sharded_coordinator.go +++ b/kv/sharded_coordinator.go @@ -100,6 +100,10 @@ func (c *ShardedCoordinator) dispatchTxn(startTS uint64, commitTS uint64, elems if commitTS == 0 { commitTS = c.nextTxnTSAfter(startTS) + } else { + // Observe caller-provided commitTS to keep the HLC monotonic; without + // this the clock could later issue timestamps smaller than commitTS. 
+ c.clock.Observe(commitTS) } if commitTS == 0 || commitTS <= startTS { return nil, errors.WithStack(ErrTxnCommitTSRequired) @@ -468,6 +472,10 @@ func (c *ShardedCoordinator) txnLogs(reqs *OperationGroup[OP]) ([]*pb.Request, e commitTS := reqs.CommitTS if commitTS == 0 { commitTS = c.nextTxnTSAfter(reqs.StartTS) + } else { + // Observe caller-provided commitTS to keep the HLC monotonic; without + // this the clock could later issue timestamps smaller than commitTS. + c.clock.Observe(commitTS) } if commitTS == 0 || commitTS <= reqs.StartTS { return nil, errors.WithStack(ErrTxnCommitTSRequired) From 8a833667c342c77a902845bf08c6acdeaa4772ee Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 12:10:47 +0000 Subject: [PATCH 05/27] Initial plan From 60ed2776f25e385a777f81e302f60ebab392be1e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 12:18:51 +0000 Subject: [PATCH 06/27] fix: HLC monotonicity, nil guards in Run(), and coordinator/config parsing hardening Co-authored-by: bootjp <1306365+bootjp@users.noreply.github.com> Agent-Logs-Url: https://github.com/bootjp/elastickv/sessions/82513c62-6b3f-4606-9c2f-4f1fbaa470ad --- adapter/s3.go | 6 ++++++ cmd/server/demo.go | 10 +++++++--- kv/coordinator.go | 4 ++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/adapter/s3.go b/adapter/s3.go index e0ca40d3..2576bbdd 100644 --- a/adapter/s3.go +++ b/adapter/s3.go @@ -194,6 +194,12 @@ func NewS3Server(listen net.Listener, s3Addr string, st store.MVCCStore, coordin } func (s *S3Server) Run() error { + if s.httpServer == nil { + return errors.New("s3 server httpServer is nil") + } + if s.listen == nil { + return errors.New("s3 server listener is nil") + } if err := s.httpServer.Serve(s.listen); err != nil && !errors.Is(err, http.ErrServerClosed) { return errors.WithStack(err) } diff --git a/cmd/server/demo.go 
b/cmd/server/demo.go index 54bcfd06..d49bd4c2 100644 --- a/cmd/server/demo.go +++ b/cmd/server/demo.go @@ -411,14 +411,18 @@ func setupS3( if !pathStyleOnly { return nil, errors.New("virtual-hosted style S3 requests are not implemented") } + if coordinator == nil { + return nil, errors.New("coordinator must not be nil") + } leaderS3 := make(map[raft.ServerAddress]string) if raftS3MapStr != "" { parts := strings.SplitSeq(raftS3MapStr, ",") for part := range parts { - kv := strings.Split(part, "=") - if len(kv) == kvParts { - leaderS3[raft.ServerAddress(kv[0])] = kv[1] + kv := strings.SplitN(strings.TrimSpace(part), "=", kvParts) + if len(kv) != kvParts { + return nil, fmt.Errorf("invalid raft-s3 map entry %q: expected format addr=s3addr", part) } + leaderS3[raft.ServerAddress(strings.TrimSpace(kv[0]))] = strings.TrimSpace(kv[1]) } } leaderS3[raft.ServerAddress(addr)] = s3Addr diff --git a/kv/coordinator.go b/kv/coordinator.go index f25f6a1d..15db4b0c 100644 --- a/kv/coordinator.go +++ b/kv/coordinator.go @@ -113,6 +113,10 @@ func (c *Coordinate) dispatchTxn(reqs []*Elem[OP], startTS uint64, commitTS uint c.clock.Observe(startTS) commitTS = c.clock.Next() } + } else { + // Observe the caller-provided commitTS so the HLC never issues + // a smaller timestamp in subsequent calls, preserving monotonicity. 
+ c.clock.Observe(commitTS) } if commitTS <= startTS { return nil, errors.WithStack(ErrTxnCommitTSRequired) From 5259dfc4c2af5da6331bccda2a0a84ec0e3b4e6c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 13:06:16 +0000 Subject: [PATCH 07/27] Initial plan From 41a7f2c48ca8f2615666017ee5c9026248033a5e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 13:15:30 +0000 Subject: [PATCH 08/27] fix: s3 error leakage, HLC observe, and PUT body size limit Co-authored-by: bootjp <1306365+bootjp@users.noreply.github.com> Agent-Logs-Url: https://github.com/bootjp/elastickv/sessions/bdcd88ec-08f1-4793-9105-60cc5e1c55e1 --- adapter/dynamodb_migration_test.go | 2 ++ adapter/s3.go | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/adapter/dynamodb_migration_test.go b/adapter/dynamodb_migration_test.go index e2838ded..909af053 100644 --- a/adapter/dynamodb_migration_test.go +++ b/adapter/dynamodb_migration_test.go @@ -49,6 +49,8 @@ func (c *localAdapterCoordinator) commitTSForRequest(req *kv.OperationGroup[kv.O c.Clock().Observe(req.StartTS) commitTS = c.Clock().Next() } + } else { + c.Clock().Observe(commitTS) } if req.IsTxn && commitTS <= req.StartTS { return 0, kv.ErrTxnCommitTSRequired diff --git a/adapter/s3.go b/adapter/s3.go index 2576bbdd..8a0119d6 100644 --- a/adapter/s3.go +++ b/adapter/s3.go @@ -11,6 +11,7 @@ import ( "encoding/xml" "fmt" "io" + "log/slog" "math" "net" "net/http" @@ -37,6 +38,7 @@ const ( s3MaxKeys = 1000 s3ListPageSize = 256 s3ManifestCleanupTimeout = 2 * time.Minute + s3MaxObjectSizeBytes = 5 * 1024 * 1024 * 1024 // 5 GiB, matching AWS S3 single PUT limit. 
s3TxnRetryInitialBackoff = 2 * time.Millisecond s3TxnRetryMaxBackoff = 32 * time.Millisecond @@ -475,6 +477,7 @@ func (s *S3Server) putObject(w http.ResponseWriter, r *http.Request, bucket stri sha256Hasher := sha256.New() expectedPayloadSHA := normalizeS3PayloadHash(r.Header.Get("X-Amz-Content-Sha256")) validatePayloadSHA := expectedPayloadSHA != "" && !strings.EqualFold(expectedPayloadSHA, s3UnsignedPayload) + r.Body = http.MaxBytesReader(w, r.Body, s3MaxObjectSizeBytes) part := s3ObjectPart{PartNo: 1} sizeBytes := int64(0) chunkNo := uint64(0) @@ -545,6 +548,11 @@ func (s *S3Server) putObject(w http.ResponseWriter, r *http.Request, bucket stri } if readErr != nil { s.cleanupManifestBlobs(r.Context(), bucket, meta.Generation, objectKey, uploadedManifest()) + var maxBytesErr *http.MaxBytesError + if errors.As(readErr, &maxBytesErr) { + writeS3Error(w, http.StatusRequestEntityTooLarge, "EntityTooLarge", "object exceeds maximum allowed size", bucket, objectKey) + return + } writeS3InternalError(w, readErr) return } @@ -1153,11 +1161,10 @@ func writeS3Error(w http.ResponseWriter, status int, code string, message string } func writeS3InternalError(w http.ResponseWriter, err error) { - message := "internal error" if err != nil { - message = err.Error() + slog.Error("s3 internal error", "error", err) } - writeS3Error(w, http.StatusInternalServerError, "InternalError", message, "", "") + writeS3Error(w, http.StatusInternalServerError, "InternalError", "internal error", "", "") } func writeS3ObjectHeaders(h http.Header, manifest *s3ObjectManifest) { From e8085eabdc2819aeadf5de67f239278438fed91e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 15:55:05 +0000 Subject: [PATCH 09/27] Initial plan From 2edcd2a29e5fb1d72bc6557895e74abadd2a8ea4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 16:03:22 +0000 Subject: [PATCH 10/27] 
fix: bound cleanup goroutines, fix retry off-by-one, log dispatch errors Co-authored-by: bootjp <1306365+bootjp@users.noreply.github.com> Agent-Logs-Url: https://github.com/bootjp/elastickv/sessions/05b29480-e03a-463f-bf8b-bc6bddb738bd --- adapter/s3.go | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/adapter/s3.go b/adapter/s3.go index 8a0119d6..1998b61b 100644 --- a/adapter/s3.go +++ b/adapter/s3.go @@ -45,6 +45,8 @@ const ( s3TxnRetryBackoffFactor = 2 s3TxnRetryMaxAttempts = 8 + s3ManifestCleanupWorkers = 16 + s3PathSplitParts = 2 s3GenerationBytes = 8 s3HLCPhysicalShift = 16 @@ -60,6 +62,7 @@ type S3Server struct { staticCreds map[string]string readTracker *kv.ActiveTimestampTracker httpServer *http.Server + cleanupSem chan struct{} } type s3BucketMeta struct { @@ -175,6 +178,7 @@ func NewS3Server(listen net.Listener, s3Addr string, st store.MVCCStore, coordin store: st, coordinator: coordinate, leaderS3: cloneLeaderAddrMap(leaderS3), + cleanupSem: make(chan struct{}, s3ManifestCleanupWorkers), } for _, opt := range opts { if opt != nil { @@ -923,7 +927,15 @@ func (s *S3Server) cleanupManifestBlobsAsync(bucket string, generation uint64, o if manifest == nil { return } + select { + case s.cleanupSem <- struct{}{}: + default: + // All cleanup workers are busy; skip this cleanup to avoid unbounded goroutine growth. + // Orphaned blobs from skipped cleanups may persist until explicitly overwritten or deleted. 
+ return + } go func() { + defer func() { <-s.cleanupSem }() ctx, cancel := context.WithTimeout(context.Background(), s3ManifestCleanupTimeout) defer cancel() s.cleanupManifestBlobs(ctx, bucket, generation, objectKey, manifest) @@ -939,7 +951,15 @@ func (s *S3Server) cleanupManifestBlobs(ctx context.Context, bucket string, gene if len(pending) == 0 { return } - _, _ = s.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{Elems: pending}) + if _, err := s.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{Elems: pending}); err != nil { + slog.ErrorContext(ctx, "cleanupManifestBlobs: coordinator dispatch failed", + "bucket", bucket, + "generation", generation, + "object_key", objectKey, + "upload_id", manifest.UploadID, + "err", err, + ) + } pending = pending[:0] } for _, part := range manifest.Parts { @@ -1359,22 +1379,24 @@ func nextS3RetryBackoff(current time.Duration) time.Duration { func (s *S3Server) retryS3Mutation(ctx context.Context, fn func() error) error { backoff := s3TxnRetryInitialBackoff - for attempt := 0; ; attempt++ { - err := fn() - if err == nil { + var lastErr error + for attempt := range s3TxnRetryMaxAttempts { + lastErr = fn() + if lastErr == nil { return nil } - if !isRetryableS3MutationErr(err) { - return errors.WithStack(err) + if !isRetryableS3MutationErr(lastErr) { + return errors.WithStack(lastErr) } - if attempt >= s3TxnRetryMaxAttempts { - return errors.WithStack(err) + if attempt == s3TxnRetryMaxAttempts-1 { + break } if !waitS3RetryBackoff(ctx, backoff) { - return errors.WithStack(err) + break } backoff = nextS3RetryBackoff(backoff) } + return errors.WithStack(lastErr) } func writeS3MutationError(w http.ResponseWriter, err error, bucket string, key string) { From 1fe34902a191e7d61380cdcdf1789d228e5b586b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 16:03:34 +0000 Subject: [PATCH 11/27] Initial plan From e951182e2939b70d0435cd0e79bb7dbb3f3af9ad Mon Sep 17 
00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 16:07:33 +0000 Subject: [PATCH 12/27] fix: add missing setupFSMStore function lost in merge Co-authored-by: bootjp <1306365+bootjp@users.noreply.github.com> Agent-Logs-Url: https://github.com/bootjp/elastickv/sessions/fae5663b-beb5-497e-8c4f-42fdcfd3cd60 --- cmd/server/demo.go | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/cmd/server/demo.go b/cmd/server/demo.go index cc3dbf1c..30b3eb0f 100644 --- a/cmd/server/demo.go +++ b/cmd/server/demo.go @@ -358,6 +358,32 @@ func setupStorage(dir string) (raft.LogStore, raft.StableStore, raft.SnapshotSto return raftStore, raftStore, fss, nil } +// setupFSMStore creates and returns the MVCCStore for the Raft FSM. +// When raftDataDir is non-empty the store is persisted under that directory; +// otherwise a temporary directory is used and registered for cleanup on exit. +func setupFSMStore(raftDataDir string, cleanup *internalutil.CleanupStack) (store.MVCCStore, error) { + if raftDataDir != "" { + if err := os.MkdirAll(raftDataDir, defaultFileMode); err != nil { + return nil, errors.WithStack(err) + } + st, err := store.NewPebbleStore(filepath.Join(raftDataDir, "fsm.db")) + if err != nil { + return nil, errors.WithStack(err) + } + return st, nil + } + fsmDir, err := os.MkdirTemp("", "elastickv-fsm-*") + if err != nil { + return nil, errors.WithStack(err) + } + cleanup.Add(func() { os.RemoveAll(fsmDir) }) + st, err := store.NewPebbleStore(fsmDir) + if err != nil { + return nil, errors.WithStack(err) + } + return st, nil +} + func setupGRPC(r *raft.Raft, st store.MVCCStore, tm *transport.Manager, coordinator *kv.Coordinate, distServer *adapter.DistributionServer, relay *adapter.RedisPubSubRelay) (*grpc.Server, *adapter.GRPCServer) { s := grpc.NewServer(internalutil.GRPCServerOptions()...) 
trx := kv.NewTransaction(r) From 24df609bf6fc41c1082a4de7be462e4fc6ac1b51 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 16:29:53 +0000 Subject: [PATCH 13/27] fix: reduce run() cyclomatic complexity to satisfy cyclop linter Co-authored-by: bootjp <1306365+bootjp@users.noreply.github.com> Agent-Logs-Url: https://github.com/bootjp/elastickv/sessions/726915eb-a523-450c-9c19-f76c2cde66b9 --- cmd/server/demo.go | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/cmd/server/demo.go b/cmd/server/demo.go index 30b3eb0f..07f4519e 100644 --- a/cmd/server/demo.go +++ b/cmd/server/demo.go @@ -358,6 +358,19 @@ func setupStorage(dir string) (raft.LogStore, raft.StableStore, raft.SnapshotSto return raftStore, raftStore, fss, nil } +// setupStores creates both the Raft log/stable/snapshot stores and the FSM MVCCStore. +func setupStores(raftDataDir string, cleanup *internalutil.CleanupStack) (raft.LogStore, raft.StableStore, raft.SnapshotStore, store.MVCCStore, error) { + ldb, sdb, fss, err := setupStorage(raftDataDir) + if err != nil { + return nil, nil, nil, nil, err + } + st, err := setupFSMStore(raftDataDir, cleanup) + if err != nil { + return nil, nil, nil, nil, err + } + return ldb, sdb, fss, st, nil +} + // setupFSMStore creates and returns the MVCCStore for the Raft FSM. // When raftDataDir is non-empty the store is persisted under that directory; // otherwise a temporary directory is used and registered for cleanup on exit. 
@@ -480,13 +493,7 @@ func run(ctx context.Context, eg *errgroup.Group, cfg config) error { cleanup := internalutil.CleanupStack{} defer cleanup.Run() - ldb, sdb, fss, err := setupStorage(cfg.raftDataDir) - if err != nil { - return err - } - - var st store.MVCCStore - st, err = setupFSMStore(cfg.raftDataDir, &cleanup) + ldb, sdb, fss, st, err := setupStores(cfg.raftDataDir, &cleanup) if err != nil { return err } From 52cf0ecbc8592068ac9a1229505892525f1454b2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 16:52:37 +0000 Subject: [PATCH 14/27] fix: reduce cleanupManifestBlobs cyclomatic complexity for cyclop linter Co-authored-by: bootjp <1306365+bootjp@users.noreply.github.com> Agent-Logs-Url: https://github.com/bootjp/elastickv/sessions/79412b4b-dfc4-4afb-aa1e-6cd71bd2037f --- adapter/s3.go | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/adapter/s3.go b/adapter/s3.go index 1998b61b..939db01d 100644 --- a/adapter/s3.go +++ b/adapter/s3.go @@ -963,23 +963,31 @@ func (s *S3Server) cleanupManifestBlobs(ctx context.Context, bucket string, gene pending = pending[:0] } for _, part := range manifest.Parts { - for chunkNo := range part.ChunkSizes { - chunkIndex, err := uint64FromInt(chunkNo) - if err != nil { - return - } - pending = append(pending, &kv.Elem[kv.OP]{ - Op: kv.Del, - Key: s3keys.BlobKey(bucket, generation, objectKey, manifest.UploadID, part.PartNo, chunkIndex), - }) - if len(pending) >= s3ChunkBatchOps { - flush() - } + var ok bool + if pending, ok = s.appendPartBlobKeys(pending, bucket, generation, objectKey, manifest.UploadID, part, flush); !ok { + return } } flush() } +func (s *S3Server) appendPartBlobKeys(pending []*kv.Elem[kv.OP], bucket string, generation uint64, objectKey string, uploadID string, part s3ObjectPart, flush func()) ([]*kv.Elem[kv.OP], bool) { + for chunkNo := range part.ChunkSizes { + chunkIndex, err := 
uint64FromInt(chunkNo) + if err != nil { + return pending, false + } + pending = append(pending, &kv.Elem[kv.OP]{ + Op: kv.Del, + Key: s3keys.BlobKey(bucket, generation, objectKey, uploadID, part.PartNo, chunkIndex), + }) + if len(pending) >= s3ChunkBatchOps { + flush() + } + } + return pending, true +} + //nolint:cyclop // Proxying depends on root, bucket, and object-level leadership decisions. func (s *S3Server) maybeProxyToLeader(w http.ResponseWriter, r *http.Request) bool { if s == nil || s.coordinator == nil { From 05a1fd6dc594e2db4ae231b1dfb7b711ef513bc7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 18:54:21 +0000 Subject: [PATCH 15/27] Initial plan From 0d17d92fef4b8ada17b17d51b20a01500d8edc8f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 19:00:57 +0000 Subject: [PATCH 16/27] Fix ListObjectsV2 pagination: parse token first, load meta at consistent readTS Co-authored-by: bootjp <1306365+bootjp@users.noreply.github.com> Agent-Logs-Url: https://github.com/bootjp/elastickv/sessions/328b2498-848d-4fc7-bfe6-b34c1f95b849 --- adapter/s3.go | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/adapter/s3.go b/adapter/s3.go index 939db01d..165aa51b 100644 --- a/adapter/s3.go +++ b/adapter/s3.go @@ -746,7 +746,24 @@ func (s *S3Server) deleteObject(w http.ResponseWriter, r *http.Request, bucket s //nolint:cyclop,gocognit,gocyclo,nestif // ListObjectsV2 combines token validation, shard-stable snapshotting, and delimiter pagination rules. func (s *S3Server) listObjectsV2(w http.ResponseWriter, r *http.Request, bucket string) { + query := r.URL.Query() + prefix := query.Get("prefix") + delimiter := query.Get("delimiter") + maxKeys := parseS3MaxKeys(query.Get("max-keys")) + + // Parse the continuation token first so we can select the final readTS before loading metadata. 
+ // This ensures meta is always loaded at the same snapshot used for the scan. + token, err := decodeS3ContinuationToken(query.Get("continuation-token")) + if err != nil { + writeS3Error(w, http.StatusBadRequest, "InvalidArgument", "invalid continuation token", bucket, "") + return + } + readTS := s.readTS() + if token != nil && token.ReadTS != 0 { + readTS = token.ReadTS + } + meta, exists, err := s.loadBucketMetaAt(r.Context(), bucket, readTS) if err != nil { writeS3InternalError(w, err) @@ -757,23 +774,11 @@ func (s *S3Server) listObjectsV2(w http.ResponseWriter, r *http.Request, bucket return } - query := r.URL.Query() - prefix := query.Get("prefix") - delimiter := query.Get("delimiter") - maxKeys := parseS3MaxKeys(query.Get("max-keys")) - token, err := decodeS3ContinuationToken(query.Get("continuation-token")) - if err != nil { - writeS3Error(w, http.StatusBadRequest, "InvalidArgument", "invalid continuation token", bucket, "") - return - } if token != nil { if token.Bucket != bucket || token.Generation != meta.Generation || token.Prefix != prefix || token.Delimiter != delimiter { writeS3Error(w, http.StatusBadRequest, "InvalidArgument", "continuation token does not match request", bucket, "") return } - if token.ReadTS != 0 { - readTS = token.ReadTS - } } readPin := s.pinReadTS(readTS) defer readPin.Release() From 128503f3530f0f056803744de322cf4a11944e38 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Mon, 23 Mar 2026 15:05:57 +0900 Subject: [PATCH 17/27] Update cmd/server/demo.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- cmd/server/demo.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cmd/server/demo.go b/cmd/server/demo.go index 07f4519e..0ad47916 100644 --- a/cmd/server/demo.go +++ b/cmd/server/demo.go @@ -456,10 +456,15 @@ func setupS3( leaderS3 := make(map[raft.ServerAddress]string) if raftS3MapStr != "" { parts := strings.SplitSeq(raftS3MapStr, ",") - for part := range parts { 
- kv := strings.SplitN(strings.TrimSpace(part), "=", kvParts) + for _, part := range parts { + part = strings.TrimSpace(part) + if part == "" { + continue + } + kv := strings.SplitN(part, "=", kvParts) if len(kv) != kvParts { - return nil, fmt.Errorf("invalid raft-s3 map entry %q: expected format addr=s3addr", part) + slog.Warn("ignoring invalid raft-s3 map entry; expected format addr=s3addr", "entry", part) + continue } leaderS3[raft.ServerAddress(strings.TrimSpace(kv[0]))] = strings.TrimSpace(kv[1]) } From 2d25ed46208fd3f2514f8922272f321892fb578b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 06:06:34 +0000 Subject: [PATCH 18/27] Initial plan From 65d9eafe3f7f6b4331074b868b6f7e2503b7d327 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 06:09:55 +0000 Subject: [PATCH 19/27] adapter/s3: use base64.RawURLEncoding for continuation tokens Co-authored-by: bootjp <1306365+bootjp@users.noreply.github.com> Agent-Logs-Url: https://github.com/bootjp/elastickv/sessions/5689b007-365d-4502-96e3-c005c23165c1 --- adapter/s3.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/adapter/s3.go b/adapter/s3.go index 165aa51b..079a5b2d 100644 --- a/adapter/s3.go +++ b/adapter/s3.go @@ -1244,7 +1244,7 @@ func decodeS3ContinuationToken(raw string) (*s3ContinuationToken, error) { if strings.TrimSpace(raw) == "" { return nil, nil } - data, err := base64.StdEncoding.DecodeString(raw) + data, err := base64.RawURLEncoding.DecodeString(raw) if err != nil { return nil, errors.WithStack(err) } @@ -1268,7 +1268,7 @@ func encodeS3ContinuationToken(bucket string, generation uint64, prefix string, if err != nil { return "" } - return base64.StdEncoding.EncodeToString(data) + return base64.RawURLEncoding.EncodeToString(data) } func parseS3MaxKeys(raw string) int { From b3c2459686cb17369474f0394ee6bbc718dbb500 Mon Sep 17 00:00:00 
2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 06:29:08 +0000 Subject: [PATCH 20/27] cmd/server: fix range over iter.Seq compile error in demo.go Co-authored-by: bootjp <1306365+bootjp@users.noreply.github.com> Agent-Logs-Url: https://github.com/bootjp/elastickv/sessions/31d2c8bd-a348-4c8e-bc22-d059597c9c68 --- cmd/server/demo.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/server/demo.go b/cmd/server/demo.go index 0ad47916..d212c307 100644 --- a/cmd/server/demo.go +++ b/cmd/server/demo.go @@ -456,7 +456,7 @@ func setupS3( leaderS3 := make(map[raft.ServerAddress]string) if raftS3MapStr != "" { parts := strings.SplitSeq(raftS3MapStr, ",") - for _, part := range parts { + for part := range parts { part = strings.TrimSpace(part) if part == "" { continue From 7aa00f708c8045d352dda0156e2b80b000c7edac Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 06:42:15 +0000 Subject: [PATCH 21/27] Initial plan From 61fa09e352ee7da3ab5486c0cfb155af3857284e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 06:50:31 +0000 Subject: [PATCH 22/27] kv: fix reverseScanRoutesAt to merge+sort descending when clampToRoutes=false Co-authored-by: bootjp <1306365+bootjp@users.noreply.github.com> Agent-Logs-Url: https://github.com/bootjp/elastickv/sessions/f96f5f26-a7f9-4563-b9a8-db083ce32609 --- kv/shard_store.go | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/kv/shard_store.go b/kv/shard_store.go index 1dbb25a4..a5b3179f 100644 --- a/kv/shard_store.go +++ b/kv/shard_store.go @@ -184,19 +184,32 @@ func (s *ShardStore) reverseScanRoutesAt( clampToRoutes bool, ) ([]*store.KVPair, error) { out := make([]*store.KVPair, 0) - for i := len(routes) - 1; i >= 0 && len(out) < limit; i-- { + for i := len(routes) - 1; i 
>= 0; i-- { route := routes[i] scanStart := start scanEnd := end if clampToRoutes { + if len(out) >= limit { + break + } scanStart = clampScanStart(start, route.Start) scanEnd = clampScanEnd(end, route.End) + kvs, err := s.scanRouteAtDirection(ctx, route, scanStart, scanEnd, limit-len(out), ts, true) + if err != nil { + return nil, err + } + out = append(out, kvs...) + } else { + // When clampToRoutes is false (e.g. S3 manifest scans spanning multiple + // shards), keys from different routes may interleave in descending order. + // Fetch up to limit from every route and merge+sort descending so the + // result honours the ReverseScanAt contract. + kvs, err := s.scanRouteAtDirection(ctx, route, scanStart, scanEnd, limit, ts, true) + if err != nil { + return nil, err + } + out = mergeAndTrimReverseScanResults(out, kvs, limit) } - kvs, err := s.scanRouteAtDirection(ctx, route, scanStart, scanEnd, limit-len(out), ts, true) - if err != nil { - return nil, err - } - out = append(out, kvs...) } return out, nil } @@ -343,6 +356,21 @@ func mergeAndTrimScanResults(out []*store.KVPair, kvs []*store.KVPair, limit int return out[:limit] } +func mergeAndTrimReverseScanResults(out []*store.KVPair, kvs []*store.KVPair, limit int) []*store.KVPair { + if len(kvs) == 0 { + return out + } + out = append(out, kvs...) 
+ if len(out) <= limit { + return out + } + sort.Slice(out, func(i, j int) bool { + return bytes.Compare(out[i].Key, out[j].Key) > 0 + }) + clear(out[limit:]) + return out[:limit] +} + func countNonInternalKVs(kvs []*store.KVPair) int { count := 0 for _, kvp := range kvs { From d6fd20f5525ffda3a13a8f69781c36a3940a2979 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 07:11:10 +0000 Subject: [PATCH 23/27] kv: add tests for ReverseScanAt cross-shard ordering and mergeAndTrimReverseScanResults Co-authored-by: bootjp <1306365+bootjp@users.noreply.github.com> Agent-Logs-Url: https://github.com/bootjp/elastickv/sessions/845127a7-c957-4c22-8699-b8584a84033d --- kv/shard_store_test.go | 191 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) diff --git a/kv/shard_store_test.go b/kv/shard_store_test.go index 87feaccb..d57095ea 100644 --- a/kv/shard_store_test.go +++ b/kv/shard_store_test.go @@ -128,6 +128,197 @@ func TestShardStoreScanAt_RoutesS3ManifestScansByLogicalObjectKey(t *testing.T) require.Equal(t, k1, kvs[1].Key) } +// TestShardStoreReverseScanAt_DescendingOrderAcrossShards verifies that +// ReverseScanAt with a nil start (clampToRoutes=false) merges results from all +// shards and returns them in descending key order. 
+func TestShardStoreReverseScanAt_DescendingOrderAcrossShards(t *testing.T) { + t.Parallel() + + ctx := context.Background() + + engine := distribution.NewEngine() + engine.UpdateRoute([]byte(""), []byte("m"), 1) + engine.UpdateRoute([]byte("m"), nil, 2) + + groups := map[uint64]*ShardGroup{ + 1: {Store: store.NewMVCCStore()}, + 2: {Store: store.NewMVCCStore()}, + } + st := NewShardStore(engine, groups) + + // Shard 1 (keys < "m") + require.NoError(t, st.PutAt(ctx, []byte("a"), []byte("va"), 1, 0)) + require.NoError(t, st.PutAt(ctx, []byte("c"), []byte("vc"), 2, 0)) + // Shard 2 (keys >= "m") + require.NoError(t, st.PutAt(ctx, []byte("x"), []byte("vx"), 3, 0)) + require.NoError(t, st.PutAt(ctx, []byte("z"), []byte("vz"), 4, 0)) + + // nil start → clampToRoutes=false; both shards must be merged in descending order. + kvs, err := st.ReverseScanAt(ctx, nil, nil, 4, ^uint64(0)) + require.NoError(t, err) + require.Len(t, kvs, 4) + require.Equal(t, []byte("z"), kvs[0].Key) + require.Equal(t, []byte("x"), kvs[1].Key) + require.Equal(t, []byte("c"), kvs[2].Key) + require.Equal(t, []byte("a"), kvs[3].Key) +} + +// TestShardStoreReverseScanAt_LimitAcrossShards verifies that the limit is +// correctly applied when results from multiple shards are merged in descending +// order. The top-N keys across all shards must be returned. 
+func TestShardStoreReverseScanAt_LimitAcrossShards(t *testing.T) { + t.Parallel() + + ctx := context.Background() + + engine := distribution.NewEngine() + engine.UpdateRoute([]byte(""), []byte("m"), 1) + engine.UpdateRoute([]byte("m"), nil, 2) + + groups := map[uint64]*ShardGroup{ + 1: {Store: store.NewMVCCStore()}, + 2: {Store: store.NewMVCCStore()}, + } + st := NewShardStore(engine, groups) + + // Shard 1 (keys < "m") + require.NoError(t, st.PutAt(ctx, []byte("a"), []byte("va"), 1, 0)) + require.NoError(t, st.PutAt(ctx, []byte("b"), []byte("vb"), 2, 0)) + require.NoError(t, st.PutAt(ctx, []byte("c"), []byte("vc"), 3, 0)) + // Shard 2 (keys >= "m") + require.NoError(t, st.PutAt(ctx, []byte("x"), []byte("vx"), 4, 0)) + require.NoError(t, st.PutAt(ctx, []byte("y"), []byte("vy"), 5, 0)) + require.NoError(t, st.PutAt(ctx, []byte("z"), []byte("vz"), 6, 0)) + + // limit=4: top-4 in descending order are z, y, x, c. + kvs, err := st.ReverseScanAt(ctx, nil, nil, 4, ^uint64(0)) + require.NoError(t, err) + require.Len(t, kvs, 4) + require.Equal(t, []byte("z"), kvs[0].Key) + require.Equal(t, []byte("y"), kvs[1].Key) + require.Equal(t, []byte("x"), kvs[2].Key) + require.Equal(t, []byte("c"), kvs[3].Key) +} + +// TestShardStoreReverseScanAt_SingleShard verifies that ReverseScanAt on a +// single shard returns results in descending key order. 
+func TestShardStoreReverseScanAt_SingleShard(t *testing.T) { + t.Parallel() + + ctx := context.Background() + + engine := distribution.NewEngine() + engine.UpdateRoute([]byte(""), nil, 1) + + groups := map[uint64]*ShardGroup{ + 1: {Store: store.NewMVCCStore()}, + } + st := NewShardStore(engine, groups) + + require.NoError(t, st.PutAt(ctx, []byte("a"), []byte("va"), 1, 0)) + require.NoError(t, st.PutAt(ctx, []byte("b"), []byte("vb"), 2, 0)) + require.NoError(t, st.PutAt(ctx, []byte("c"), []byte("vc"), 3, 0)) + + kvs, err := st.ReverseScanAt(ctx, nil, nil, 2, ^uint64(0)) + require.NoError(t, err) + require.Len(t, kvs, 2) + require.Equal(t, []byte("c"), kvs[0].Key) + require.Equal(t, []byte("b"), kvs[1].Key) +} + +// TestShardStoreReverseScanAt_IncludesS3ManifestKeysDescending mirrors +// TestShardStoreScanAt_IncludesS3ManifestKeysAcrossShards but for +// ReverseScanAt — results must be returned in descending key order. +func TestShardStoreReverseScanAt_IncludesS3ManifestKeysDescending(t *testing.T) { + t.Parallel() + + ctx := context.Background() + + engine := distribution.NewEngine() + engine.UpdateRoute([]byte(""), []byte("m"), 1) + engine.UpdateRoute([]byte("m"), nil, 2) + + groups := map[uint64]*ShardGroup{ + 1: {Store: store.NewMVCCStore()}, + 2: {Store: store.NewMVCCStore()}, + } + st := NewShardStore(engine, groups) + + k1 := s3keys.ObjectManifestKey("bucket-a", 1, "alpha") + k2 := s3keys.ObjectManifestKey("bucket-a", 1, "zeta") + require.NoError(t, st.PutAt(ctx, k1, []byte("m1"), 1, 0)) + require.NoError(t, st.PutAt(ctx, k2, []byte("m2"), 2, 0)) + + start := s3keys.ObjectManifestPrefixForBucket("bucket-a", 1) + kvs, err := st.ReverseScanAt(ctx, start, prefixScanEnd(start), 10, ^uint64(0)) + require.NoError(t, err) + require.Len(t, kvs, 2) + // "zeta" > "alpha" → descending order puts k2 first. 
+ require.Equal(t, k2, kvs[0].Key) + require.Equal(t, k1, kvs[1].Key) +} + +// TestMergeAndTrimReverseScanResults verifies that the helper merges two +// slices, sorts them in descending key order, and trims to the given limit. +func TestMergeAndTrimReverseScanResults(t *testing.T) { + t.Parallel() + + out := []*store.KVPair{ + {Key: []byte("z"), Value: []byte("vz")}, + {Key: []byte("m"), Value: []byte("vm")}, + } + kvs := []*store.KVPair{ + {Key: []byte("y"), Value: []byte("vy")}, + {Key: []byte("a"), Value: []byte("va")}, + } + + result := mergeAndTrimReverseScanResults(out, kvs, 3) + require.Len(t, result, 3) + require.Equal(t, []byte("z"), result[0].Key) + require.Equal(t, []byte("y"), result[1].Key) + require.Equal(t, []byte("m"), result[2].Key) +} + +func TestMergeAndTrimReverseScanResults_EmptyInput(t *testing.T) { + t.Parallel() + + out := []*store.KVPair{{Key: []byte("z"), Value: []byte("vz")}} + result := mergeAndTrimReverseScanResults(out, nil, 10) + require.Len(t, result, 1) + require.Equal(t, []byte("z"), result[0].Key) +} + +func TestMergeAndTrimReverseScanResults_WithinLimit(t *testing.T) { + t.Parallel() + + out := []*store.KVPair{{Key: []byte("z"), Value: []byte("vz")}} + kvs := []*store.KVPair{{Key: []byte("a"), Value: []byte("va")}} + + result := mergeAndTrimReverseScanResults(out, kvs, 10) + require.Len(t, result, 2) + require.Equal(t, []byte("z"), result[0].Key) + require.Equal(t, []byte("a"), result[1].Key) +} + +func TestMergeAndTrimReverseScanResults_ExactLimit(t *testing.T) { + t.Parallel() + + out := []*store.KVPair{ + {Key: []byte("z"), Value: []byte("vz")}, + {Key: []byte("c"), Value: []byte("vc")}, + } + kvs := []*store.KVPair{ + {Key: []byte("y"), Value: []byte("vy")}, + {Key: []byte("a"), Value: []byte("va")}, + } + + // limit=2: top-2 in descending order are "z", "y". 
+ result := mergeAndTrimReverseScanResults(out, kvs, 2) + require.Len(t, result, 2) + require.Equal(t, []byte("z"), result[0].Key) + require.Equal(t, []byte("y"), result[1].Key) +} + func TestScanLockBoundsForKVs_ReverseOrder(t *testing.T) { t.Parallel() From 90e6e7044b6e0bf04b47f54a7c8fa02df1f6361c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 08:48:20 +0000 Subject: [PATCH 24/27] Initial plan From 7682358e09cf901d1e80928e8ab778b3598bcaed Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 08:56:03 +0000 Subject: [PATCH 25/27] Fix s3/shard_store issues: always sort in mergeAndTrimReverseScanResults, dedup by GroupID, pinReadTS no-op token, guard leader S3 map entry Co-authored-by: bootjp <1306365+bootjp@users.noreply.github.com> Agent-Logs-Url: https://github.com/bootjp/elastickv/sessions/2f32e04f-12e2-4392-9ee8-2f1291e6b67f --- adapter/s3.go | 6 ++++-- kv/shard_store.go | 13 ++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/adapter/s3.go b/adapter/s3.go index 079a5b2d..1ba2d46a 100644 --- a/adapter/s3.go +++ b/adapter/s3.go @@ -190,7 +190,9 @@ func NewS3Server(listen net.Listener, s3Addr string, st store.MVCCStore, coordin s.leaderS3 = map[raft.ServerAddress]string{} } if leader := s.coordinatorLeaderAddress(); leader != "" { - s.leaderS3[leader] = s.s3Addr + if _, exists := s.leaderS3[leader]; !exists { + s.leaderS3[leader] = s.s3Addr + } } } mux := http.NewServeMux() @@ -1317,7 +1319,7 @@ func (s *S3Server) readTS() uint64 { func (s *S3Server) pinReadTS(ts uint64) *kv.ActiveTimestampToken { if s == nil || s.readTracker == nil { - return nil + return &kv.ActiveTimestampToken{} } return s.readTracker.Pin(ts) } diff --git a/kv/shard_store.go b/kv/shard_store.go index a5b3179f..7d6e8d35 100644 --- a/kv/shard_store.go +++ b/kv/shard_store.go @@ -184,6 +184,7 @@ func (s *ShardStore) 
reverseScanRoutesAt( clampToRoutes bool, ) ([]*store.KVPair, error) { out := make([]*store.KVPair, 0) + seenGroups := make(map[uint64]struct{}) for i := len(routes) - 1; i >= 0; i-- { route := routes[i] scanStart := start @@ -204,6 +205,12 @@ func (s *ShardStore) reverseScanRoutesAt( // shards), keys from different routes may interleave in descending order. // Fetch up to limit from every route and merge+sort descending so the // result honours the ReverseScanAt contract. + // De-duplicate by GroupID: after a range split both halves share the same + // GroupID (same backing shard store), so only scan each group once. + if _, seen := seenGroups[route.GroupID]; seen { + continue + } + seenGroups[route.GroupID] = struct{}{} kvs, err := s.scanRouteAtDirection(ctx, route, scanStart, scanEnd, limit, ts, true) if err != nil { return nil, err @@ -361,12 +368,12 @@ func mergeAndTrimReverseScanResults(out []*store.KVPair, kvs []*store.KVPair, li return out } out = append(out, kvs...) - if len(out) <= limit { - return out - } sort.Slice(out, func(i, j int) bool { return bytes.Compare(out[i].Key, out[j].Key) > 0 }) + if len(out) <= limit { + return out + } clear(out[limit:]) return out[:limit] } From 28d6633ed2368fb0fee43bd0ceab3975fbfe2579 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 10:35:09 +0000 Subject: [PATCH 26/27] Initial plan From c61c8d1663e1c88eaf7347449323259990f52958 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 10:42:39 +0000 Subject: [PATCH 27/27] kv: zero CommitTS when StartTS is auto-assigned in coordinators Co-authored-by: bootjp <1306365+bootjp@users.noreply.github.com> Agent-Logs-Url: https://github.com/bootjp/elastickv/sessions/4953ec80-5e46-4a9f-8aad-5babc83f3f41 --- kv/coordinator.go | 5 +++++ kv/sharded_coordinator.go | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/kv/coordinator.go 
b/kv/coordinator.go index 15db4b0c..157b8f5e 100644 --- a/kv/coordinator.go +++ b/kv/coordinator.go @@ -57,7 +57,12 @@ func (c *Coordinate) Dispatch(ctx context.Context, reqs *OperationGroup[OP]) (*C if reqs.IsTxn && reqs.StartTS == 0 { // Leader-only timestamp issuance to avoid divergence across shards. + // When the leader assigns StartTS, also clear any caller-provided + // CommitTS so dispatchTxn generates both timestamps consistently. + // A caller-supplied CommitTS without a matching StartTS could produce + // CommitTS <= StartTS (an invalid transaction). reqs.StartTS = c.nextStartTS() + reqs.CommitTS = 0 } if reqs.IsTxn { diff --git a/kv/sharded_coordinator.go b/kv/sharded_coordinator.go index 7ecc58ca..8f7a2ed9 100644 --- a/kv/sharded_coordinator.go +++ b/kv/sharded_coordinator.go @@ -70,6 +70,11 @@ func (c *ShardedCoordinator) Dispatch(ctx context.Context, reqs *OperationGroup[ return nil, err } reqs.StartTS = startTS + // When the coordinator assigns StartTS, also clear any caller-provided + // CommitTS so dispatchTxn generates both timestamps consistently. + // A caller-supplied CommitTS without a matching StartTS could produce + // CommitTS <= StartTS (an invalid transaction). + reqs.CommitTS = 0 } if reqs.IsTxn {